Ejemplo n.º 1
0
def cut(recording):
    """Fixture: a 1-second MonoCut over ``recording`` with one 0.5s supervision."""
    supervision = SupervisionSegment(
        id="sup", recording_id=recording.id, start=0, duration=0.5
    )
    return MonoCut(
        id="cut",
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
        supervisions=[supervision],
    )
Ejemplo n.º 2
0
def test_cut_move_to_memory_audio_serialization():
    """Round-trip a cut whose audio was moved to memory through to_dict/from_dict."""
    wav_path = "test/fixtures/mono_c0.wav"
    original = dummy_cut(0, duration=0.5).drop_recording()
    original.recording = Recording.from_file(wav_path)

    in_memory = original.move_to_memory()

    # Moving audio to memory must not mutate the source cut.
    assert original.custom is None

    restored = MonoCut.from_dict(in_memory.to_dict())

    np.testing.assert_equal(restored.load_audio(), in_memory.load_audio())
Ejemplo n.º 3
0
def test_cut_load_custom_recording_pad_both():
    """Attach a Recording as a dynamic custom field and verify that padding
    the cut on both sides pads the dynamically-loaded audio accordingly."""
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(1, compute_num_samples(
        duration, sampling_rate)).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        # os.fsync() is documented to take a file descriptor; pass
        # f.fileno() explicitly rather than relying on implicit conversion.
        os.fsync(f.fileno())
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        #       or a "load_my_favorite_song()" method.
        #       We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        # Pad with 1s of silence on the left, then up to 60s total on the right.
        cut_pad = cut.pad(duration=duration + 1,
                          direction="left").pad(duration=60.0,
                                                direction="right")

        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        # Left padding is silent...
        np.testing.assert_almost_equal(0, restored_audio[:, :sampling_rate])
        # ...the original audio sits in the middle...
        np.testing.assert_almost_equal(
            audio,
            restored_audio[:, sampling_rate:audio.shape[1] + sampling_rate])
        # ...and the right padding is silent too.
        np.testing.assert_almost_equal(
            0, restored_audio[:, sampling_rate + audio.shape[1]:])
Ejemplo n.º 4
0
def test_cut_load_temporal_array_pad(pad_value):
    """Check the array loaded via TemporalArray is padded along with the cut."""
    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(
            f.name) as writer:
        cut = MonoCut(
            id="x",
            start=0,
            # 131 frames x 0.4s frame shift == 52.4s
            duration=52.4,
            channel=0,
            recording=dummy_recording(1),
        )

        ali = np.random.randint(500, size=131)
        cut.alignment = writer.store_array(
            key="utt1", value=ali, frame_shift=0.4, temporal_dim=0
        )
        padded = cut.pad(duration=60.0, pad_value_dict={"alignment": pad_value})

        loaded = padded.load_alignment()
        # 60.0s / 0.4s frame shift == 150 frames.
        assert loaded.shape == (150, )
        np.testing.assert_equal(loaded[:131], ali)
        np.testing.assert_equal(loaded[131:], pad_value)
Ejemplo n.º 5
0
def random_cut_set(n_cuts=100) -> CutSet:
    """Return a CutSet of ``n_cuts`` cuts with random start/duration over
    dummy 100s recordings.

    :param n_cuts: number of cuts to generate (default 100).
    """
    return CutSet.from_cuts(
        MonoCut(
            # Manifest ids are strings elsewhere in this codebase
            # (e.g. ``str(uuid4())``); stringify to avoid passing
            # a raw UUID object that may not serialize cleanly.
            id=str(uuid4()),
            start=round(random.uniform(0, 5), ndigits=8),
            duration=round(random.uniform(3, 10), ndigits=8),
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        ) for _ in range(n_cuts))
Ejemplo n.º 6
0
def test_augmentation_chain_randomized(
    recording: Recording,
    rir: Recording,
    target_sampling_rate: int,
    sp_factor: float,
    vp_factor: float,
    reverb: bool,
    resample_first: bool,
    cut_duration: Seconds,
):
    """Chain resample/speed/volume perturbation (order parametrized) plus
    optional reverb, and verify audio lengths match the manifests."""
    aug = recording
    if resample_first:
        aug = aug.resample(target_sampling_rate)
        aug = aug.perturb_speed(sp_factor)
    else:
        aug = aug.perturb_speed(sp_factor)
        aug = aug.resample(target_sampling_rate)
    aug = aug.perturb_volume(vp_factor)
    if reverb:
        aug = aug.reverb_rir(rir)

    samples = aug.load_audio()
    assert samples.shape[1] == aug.num_samples

    cut = MonoCut(
        id="dummy",
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=aug,
    )
    assert cut.load_audio().shape[1] == cut.num_samples
Ejemplo n.º 7
0
def libri_cut_with_supervision(libri_recording_orig):
    """Fixture: a MonoCut covering the whole recording, with one full-length supervision."""
    total = libri_recording_orig.duration
    supervision = SupervisionSegment(
        id="sup",
        recording_id="rec",
        start=0,
        duration=total,
    )
    return MonoCut(
        id="libri_cut_1",
        start=0,
        duration=total,
        channel=0,
        supervisions=[supervision],
        recording=libri_recording_orig,
    )
Ejemplo n.º 8
0
def random_cut_set(n_cuts=100) -> CutSet:
    """Return a CutSet of ``n_cuts`` cuts whose start/duration are random
    but sample-aligned (multiples of 1/sr).

    :param n_cuts: number of cuts to generate (default 100).
    """
    sr = 16000
    return CutSet.from_cuts(
        MonoCut(
            # Manifest ids are strings elsewhere in this codebase
            # (e.g. ``str(uuid4())``); stringify to avoid passing
            # a raw UUID object that may not serialize cleanly.
            id=str(uuid4()),
            start=random.randint(0, 5 * sr) / sr,
            duration=random.randint(3 * sr, 10 * sr) / sr,
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )
        for _ in range(n_cuts)
    )
Ejemplo n.º 9
0
 def with_cut(
     self,
     sampling_rate: int,
     num_samples: int,
     features: bool = True,
     supervision: bool = False,
     alignment: bool = False,
     custom_field: bool = False,
     frame_shift: Seconds = 0.01,
     use_zeroes: bool = False,
 ) -> MonoCut:
     """Build a MonoCut over a synthetic recording, optionally attaching
     features, a supervision (with or without alignment), and a custom
     temporal array field."""
     total_seconds = num_samples / sampling_rate
     cut = MonoCut(
         id=str(uuid4()),
         start=0,
         duration=total_seconds,
         channel=0,
         recording=self.with_recording(
             sampling_rate=sampling_rate,
             num_samples=num_samples,
             use_zeros=use_zeroes,
         ),
     )
     if features:
         cut = self._with_features(
             cut, frame_shift=frame_shift, sampling_rate=sampling_rate
         )
     if supervision:
         sup_alignment = (
             self._with_alignment(cut, "irrelevant") if alignment else None
         )
         cut.supervisions.append(
             SupervisionSegment(
                 id=f"sup-{cut.id}",
                 recording_id=cut.recording_id,
                 start=0,
                 duration=cut.duration,
                 text="irrelevant",
                 alignment=sup_alignment,
             )
         )
     if custom_field:
         self._with_custom_temporal_array(cut=cut, frame_shift=frame_shift)
     return cut
Ejemplo n.º 10
0
    def write(self, manifest: MonoCut) -> bool:
        """
        Converts a Cut to a dict, pickles it, and then stores into a tarfile.

        :param manifest: the manifest to be written.
        :return: bool indicating whether the writing was successful.
        """
        # suppress_and_warn exits the ``with`` body early when an exception
        # is raised and fault tolerance is enabled, so reaching the trailing
        # ``return False`` means the write failed.
        with suppress_and_warn(Exception, enabled=self.fault_tolerant):
            in_memory = manifest.move_to_memory(
                audio_format=self.audio_format,
                load_audio=self.load_audio,
                load_features=self.load_features,
                load_custom=self.load_custom,
            )
            payload = pickle.dumps(in_memory.to_dict())
            self.writer.write({"__key__": in_memory.id, "data": payload})
            return True
        return False
Ejemplo n.º 11
0
def cut_set():
    """Fixture: a CutSet exercising cuts with/without supervisions, recording,
    and features, plus padded and mixed variants of the base cut."""
    feats = Features(
        type="fbank",
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type="lilcom",
        storage_path="irrelevant",
        storage_key="irrelevant",
    )
    rec = Recording(
        id="rec-1",
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
    )
    sups = [
        SupervisionSegment(
            id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0
        ),
        SupervisionSegment(
            id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0
        ),
    ]
    base = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=feats,
        recording=rec,
        supervisions=sups,
    )
    variants = [
        base,
        fastcopy(base, id="cut-nosup", supervisions=[]),
        fastcopy(base, id="cut-norec", recording=None),
        fastcopy(base, id="cut-nofeat", features=None),
        base.pad(duration=30.0, direction="left"),
        base.pad(duration=30.0, direction="right"),
        base.pad(duration=30.0, direction="both"),
        base.mix(base, offset_other_by=5.0, snr=8),
    ]
    return CutSet.from_cuts(variants)
def mono_cut():
    """
    Scenario::

        |-----------------Recording-----------------|
           "Hey, Matt!"  "Yes?"
        |--------------| |-----|  "Oh, nothing"
                             |------------------|
        |-------------------Cut1--------------------|
    """
    recording = Recording(
        id="rec1",
        duration=10.0,
        sampling_rate=8000,
        num_samples=80000,
        sources=[...],
    )
    supervisions = [
        SupervisionSegment(
            id="sup1",
            recording_id="rec1",
            start=0.0,
            duration=3.37,
            text="Hey, Matt!",
        ),
        SupervisionSegment(
            id="sup2",
            recording_id="rec1",
            start=4.5,
            duration=0.9,
            text="Yes?",
        ),
        SupervisionSegment(
            id="sup3",
            recording_id="rec1",
            start=4.9,
            duration=4.3,
            text="Oh, nothing",
        ),
    ]
    return MonoCut(
        id="rec1-cut1",
        start=0.0,
        duration=10.0,
        channel=0,
        recording=recording,
        supervisions=supervisions,
    )
Ejemplo n.º 13
0
def cut_set():
    """Fixture: a CutSet with the base cut, stripped-down copies, padded
    variants, and a mixed variant."""
    feats = Features(
        type='fbank',
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type='lilcom',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    rec = Recording(
        id='rec-1',
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[
            AudioSource(type='file', channels=[0], source='irrelevant')
        ],
    )
    sups = [
        SupervisionSegment(
            id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0
        ),
        SupervisionSegment(
            id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0
        ),
    ]
    base = MonoCut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        channel=0,
        features=feats,
        recording=rec,
        supervisions=sups,
    )
    variants = [
        base,
        fastcopy(base, id='cut-nosup', supervisions=[]),
        fastcopy(base, id='cut-norec', recording=None),
        fastcopy(base, id='cut-nofeat', features=None),
        base.pad(duration=30.0, direction='left'),
        base.pad(duration=30.0, direction='right'),
        base.pad(duration=30.0, direction='both'),
        base.mix(base, offset_other_by=5.0, snr=8),
    ]
    return CutSet.from_cuts(variants)
Ejemplo n.º 14
0
def test_padding_issue_478():
    """
    https://github.com/lhotse-speech/lhotse/issues/478
    """
    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(
            f.name) as writer:
        # Two cuts with slightly different durations but identical alignment
        # array lengths (121 frames @ 0.04s frame shift).
        cuts = []
        alignments = []
        for cut_id, dur, rec_id in (("c1", 4.9, 1), ("c2", 4.895, 2)):
            cut = MonoCut(cut_id,
                          start=0,
                          duration=dur,
                          channel=0,
                          recording=dummy_recording(rec_id))
            ali = np.random.randint(500, size=(121, ))
            cut.label_alignment = writer.store_array(cut_id,
                                                     ali,
                                                     frame_shift=0.04,
                                                     temporal_dim=0)
            cuts.append(cut)
            alignments.append(ali)

        # Test collation behavior on this cutset.
        label_alignments, label_alignment_lens = collate_custom_field(
            CutSet.from_cuts(cuts), "label_alignment")

        np.testing.assert_equal(label_alignments[0].numpy(), alignments[0])
        np.testing.assert_equal(label_alignments[1].numpy(), alignments[1])
Ejemplo n.º 15
0
 def _with_features(self, cut: MonoCut, frame_shift: Seconds) -> MonoCut:
     """Compute fbank features for ``cut`` and attach them, storing them in a
     temp directory that is kept alive on ``self.dirs``."""
     tmpdir = TemporaryDirectory()
     # Keep a reference so the temp directory outlives this call.
     self.dirs.append(tmpdir)
     fbank = Fbank(config=FbankConfig(frame_shift=frame_shift))
     with LilcomHdf5Writer(tmpdir.name) as storage:
         return cut.compute_and_store_features(fbank, storage=storage)
Ejemplo n.º 16
0
def cut(recording):
    """Fixture: a plain 1-second MonoCut over ``recording`` (no supervisions)."""
    return MonoCut(
        id="cut", start=0, duration=1.0, channel=0, recording=recording
    )
Ejemplo n.º 17
0
def _prepare_voxceleb_trials(
    manifests: Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]
) -> Dict[str, Tuple[CutSet, CutSet]]:
    """
    Prepare the trials file for the VoxCeleb1 corpus.

    Downloads the official trial list, builds one full-recording cut per
    trial side, and returns them grouped as "pos_trials" / "neg_trials",
    each a tuple of two parallel CutSets (utterance 1 vs utterance 2).

    :param manifests: dict with "recordings" and "supervisions" manifests.
    :return: dict mapping trial polarity to a (CutSet, CutSet) pair.
    """
    recordings = manifests["recordings"]
    supervisions = manifests["supervisions"]

    def _trial_cut(idx: int, utt: str) -> MonoCut:
        # One cut spanning the entire recording for a single trial side.
        # (Was duplicated four times inline before this refactor.)
        return MonoCut(
            id=f"trial-{idx}",
            recording=recordings[utt],
            start=0,
            duration=recordings[utt].duration,
            supervisions=supervisions[utt],
            channel=0,
        )

    cuts_utt1_pos, cuts_utt2_pos, cuts_utt1_neg, cuts_utt2_neg = [], [], [], []
    urlretrieve_progress(VOXCELEB1_TRIALS_URL, filename="voxceleb_trials.txt")
    with open("voxceleb_trials.txt", "r") as f:
        for idx, line in enumerate(f):
            target, utt1, utt2 = line.strip().split(" ")
            # id10270/x6uYqmx31kE/00001.wav -> id10270-x6uYqmx31kE-00001
            utt1 = "-".join(utt1.split(".")[0].split("/"))
            utt2 = "-".join(utt2.split(".")[0].split("/"))
            if utt1 not in recordings or utt2 not in recordings:
                logging.warning(
                    f"Trial {idx} contains unknown recording: {utt1} or {utt2}"
                )
                continue
            if target == "1":
                cuts_utt1_pos.append(_trial_cut(idx, utt1))
                cuts_utt2_pos.append(_trial_cut(idx, utt2))
            else:
                cuts_utt1_neg.append(_trial_cut(idx, utt1))
                cuts_utt2_neg.append(_trial_cut(idx, utt2))
    return {
        "pos_trials": (
            CutSet.from_cuts(cuts_utt1_pos),
            CutSet.from_cuts(cuts_utt2_pos),
        ),
        "neg_trials": (
            CutSet.from_cuts(cuts_utt1_neg),
            CutSet.from_cuts(cuts_utt2_neg),
        ),
    }
Ejemplo n.º 18
0
 def test_cut_audio_mask(self):
     """A cut without supervisions produces an all-zero audio mask."""
     fake_recording = Mock(sampling_rate=16000)
     cut = MonoCut(
         "cut", start=0, duration=2, channel=0, recording=fake_recording
     )
     mask = cut.supervisions_audio_mask()
     assert mask.sum() == 0