def _info(self):
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "file": datasets.Value("string"),
            "audio": datasets.Audio(sampling_rate=16_000),
            "text": datasets.Value("string"),
            "speaker_id": datasets.Value("int64"),
            "chapter_id": datasets.Value("int64"),
            "id": datasets.Value("string"),
        }),
        supervised_keys=("file", "text"),
        homepage=_URL,
        citation=_CITATION,
        task_templates=[
            AutomaticSpeechRecognition(audio_column="audio",
                                       transcription_column="text")
        ],
    )
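For orientation, a minimal sketch of how a DatasetInfo like the one above looks from the user side; the dataset id, config, and split below are illustrative assumptions (a LibriSpeech-style ASR corpus), not taken from this snippet.

from datasets import load_dataset, Audio

# Illustrative dataset id/config/split; any dataset whose _info() declares an Audio feature behaves the same way.
ds = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
print(ds.features["audio"])               # Audio(sampling_rate=16000, ...), as declared in _info()
sample = next(iter(ds))
print(sample["text"])                     # transcription string
print(sample["audio"]["sampling_rate"])   # 16000
# Re-casting the Audio column resamples lazily at access time.
ds = ds.cast_column("audio", Audio(sampling_rate=8_000))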
Example no. 2
    def _info(self):
        features = datasets.Features({
            "speaker_id":
            datasets.Value("string"),
            "age":
            datasets.Value("string"),
            "gender":
            datasets.ClassLabel(names=_SEX),
            "region_of_birth":
            datasets.ClassLabel(names=_REGIONS),
            "region_of_youth":
            datasets.ClassLabel(names=_REGIONS),
            "text":
            datasets.Value("string"),
            "path":
            datasets.Value("string"),
            "audio":
            datasets.Audio(sampling_rate=16_000)
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_URL,
            task_templates=[
                AutomaticSpeechRecognition(audio_file_path_column="path",
                                           transcription_column="text")
            ],
        )
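The ClassLabel columns above (gender and the two region fields) store integer class ids, not strings. A small self-contained sketch of the mapping helpers; the label names are stand-ins for the real _SEX and _REGIONS constants.

from datasets import ClassLabel

gender = ClassLabel(names=["female", "male"])  # stand-in for ClassLabel(names=_SEX)
print(gender.str2int("male"))   # 1
print(gender.int2str(0))        # "female"
print(gender.num_classes)       # 2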
Example no. 3
    def _info(self):
        features = datasets.Features({
            "utterance_id":
            datasets.Value("string"),
            "session":
            datasets.Value("string"),
            "test":
            datasets.Value("string"),
            "prompt":
            datasets.Value("string"),
            "transcript":
            datasets.Value("string"),
            "phonemes":
            datasets.Sequence(datasets.Value("string")),
            "correctness":
            datasets.Value("bool"),
            "aq_index":
            datasets.Value("float32"),
            "duration_frames":
            datasets.Value("uint64"),
            "audio":
            datasets.Audio(sampling_rate=16_000)
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage="https://psst.study/",
            task_templates=[
                AutomaticSpeechRecognition(audio_file_path_column="filename",
                                           transcription_column="transcript")
            ],
        )
Example no. 4
def _info(self):
    return ds.DatasetInfo(
        description="",
        citation="",
        homepage="",
        license="",
        features=ds.Features(
            {
                "client_id": ds.Value("string"),
                "path": ds.Value("string"),
                "audio": ds.Audio(sampling_rate=48_000),
                "sentence": ds.Value("string"),
                "up_votes": ds.Value("int64"),
                "down_votes": ds.Value("int64"),
                "age": ds.Value("string"),
                "gender": ds.Value("string"),
                "accent": ds.Value("string"),
                "locale": ds.Value("string"),
                "segment": ds.Value("string"),
            }
        ),
        task_templates=[
            AutomaticSpeechRecognition(
                audio_file_path_column="path", transcription_column="sentence"
            )
        ],
    )
Example no. 5
def _info(self):
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "file": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=16_000),
                "text": datasets.Value("string"),
                "phonetic_detail": datasets.Sequence(
                    {
                        "start": datasets.Value("int64"),
                        "stop": datasets.Value("int64"),
                        "utterance": datasets.Value("string"),
                    }
                ),
                "word_detail": datasets.Sequence(
                    {
                        "start": datasets.Value("int64"),
                        "stop": datasets.Value("int64"),
                        "utterance": datasets.Value("string"),
                    }
                ),
                "dialect_region": datasets.Value("string"),
                "sentence_type": datasets.Value("string"),
                "speaker_id": datasets.Value("string"),
                "id": datasets.Value("string"),
            }
        ),
        supervised_keys=("file", "text"),
        homepage=_HOMEPAGE,
        citation=_CITATION,
        task_templates=[AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")],
    )
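Worth noting about the TIMIT-style features above: a datasets.Sequence over a dict of sub-features is stored column-wise, so each example exposes a dict of parallel lists rather than a list of dicts. A minimal self-contained sketch with toy data:

from datasets import Dataset, Features, Sequence, Value

features = Features({
    "word_detail": Sequence({
        "start": Value("int64"),
        "stop": Value("int64"),
        "utterance": Value("string"),
    })
})
d = Dataset.from_dict(
    {"word_detail": [{"start": [0, 5], "stop": [4, 9], "utterance": ["she", "had"]}]},
    features=features,
)
print(d[0]["word_detail"])  # {'start': [0, 5], 'stop': [4, 9], 'utterance': ['she', 'had']}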
Example no. 6
def _info(self):
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            client_id=datasets.Value("string"),
            file=datasets.Value("string"),
            audio=datasets.Audio(sampling_rate=16_000),
            sentence=datasets.Value("string"),
            translation=datasets.Value("string"),
            id=datasets.Value("string"),
        ),
        supervised_keys=("file", "translation"),
        homepage=_HOMEPAGE,
        citation=_CITATION,
    )
Example no. 7
def _info(self):
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "id": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=22050),
                "file": datasets.Value("string"),
                "text": datasets.Value("string"),
                "normalized_text": datasets.Value("string"),
            }
        ),
        supervised_keys=("file", "text"),
        homepage=_URL,
        citation=_CITATION,
        task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
    )
Example no. 8
def _info(self):
    return datasets.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        features=datasets.Features({
            "speaker_id": datasets.Value("string"),
            "path": datasets.Value("string"),
            "audio": datasets.Audio(sampling_rate=16_000),
            "sentence": datasets.Value("string"),
        }),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
Example no. 9
    def _info(self):
        features = datasets.Features(
            {
                "client_id": datasets.Value("string"),
                "path": datasets.Value("string"),
                "audio": datasets.Audio(sampling_rate=48_000),
                "sentence": datasets.Value("string"),
                "age": datasets.Value("string"),
                "gender": datasets.Value("string"),
                "language": datasets.ClassLabel(names=_LANGUAGES),
            }
        )

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
Example no. 10
    def _info(self):
        features = datasets.Features({
            "path":
            datasets.Value("string"),
            "audio":
            datasets.Audio(sampling_rate=48_000),
            "sentence":
            datasets.Value("string"),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
            task_templates=[
                AutomaticSpeechRecognition(audio_column="audio",
                                           transcription_column="sentence")
            ],
        )
Example no. 11
    def _info(self):
        features = datasets.Features({
            "client_id":
            datasets.Value("string"),
            "path":
            datasets.Value("string"),
            "audio":
            datasets.Audio(sampling_rate=48_000),
            "sentence":
            datasets.Value("string"),
            "up_votes":
            datasets.Value("int64"),
            "down_votes":
            datasets.Value("int64"),
            "age":
            datasets.Value("string"),
            "gender":
            datasets.Value("string"),
            "accent":
            datasets.Value("string"),
            "locale":
            datasets.Value("string"),
            "segment":
            datasets.Value("string"),
        })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
            task_templates=[
                AutomaticSpeechRecognition(audio_file_path_column="path",
                                           transcription_column="sentence")
            ],
        )
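The Common Voice-style configs above declare audio at 48 kHz. A model expecting 16 kHz input can resample lazily by re-casting the column, as the integration test in Example no. 14 below also does; the dataset id, config, and split here are illustrative.

from datasets import load_dataset, Audio

cv = load_dataset("common_voice", "en", split="test", streaming=True)  # illustrative id/config/split
cv = cv.cast_column("audio", Audio(sampling_rate=16_000))
sample = next(iter(cv))
print(sample["audio"]["sampling_rate"])  # 16000, resampled on the fly from the original 48 kHz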
Example no. 12
class Superb(datasets.GeneratorBasedBuilder):
    """Superb dataset."""

    BUILDER_CONFIGS = [
        SuperbConfig(
            name="asr",
            description=textwrap.dedent(
                """\
            ASR transcribes utterances into words. While PR analyzes the
            improvement in modeling phonetics, ASR reflects the significance of
            the improvement in a real-world scenario. LibriSpeech
            train-clean-100/dev-clean/test-clean subsets are used for
            training/validation/testing. The evaluation metric is word error
            rate (WER)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            url="http://www.openslr.org/12",
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
        ),
        SuperbConfig(
            name="ks",
            description=textwrap.dedent(
                """\
            Keyword Spotting (KS) detects preregistered keywords by classifying utterances into a predefined set of
            words. The task is usually performed on-device for the fast response time. Thus, accuracy, model size, and
            inference time are all crucial. SUPERB uses the widely used Speech Commands dataset v1.0 for the task.
            The dataset consists of ten classes of keywords, a class for silence, and an unknown class to include the
            false positive. The evaluation metric is accuracy (ACC)"""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "label": datasets.ClassLabel(
                        names=[
                            "yes",
                            "no",
                            "up",
                            "down",
                            "left",
                            "right",
                            "on",
                            "off",
                            "stop",
                            "go",
                            "_silence_",
                            "_unknown_",
                        ]
                    ),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.tensorflow.org/datasets/catalog/speech_commands",
            data_url="http://download.tensorflow.org/data/{filename}",
        ),
        SuperbConfig(
            name="ic",
            description=textwrap.dedent(
                """\
            Intent Classification (IC) classifies utterances into predefined classes to determine the intent of
            speakers. SUPERB uses the Fluent Speech Commands dataset, where each utterance is tagged with three intent
            labels: action, object, and location. The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "speaker_id": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "action": datasets.ClassLabel(
                        names=["activate", "bring", "change language", "deactivate", "decrease", "increase"]
                    ),
                    "object": datasets.ClassLabel(
                        names=[
                            "Chinese",
                            "English",
                            "German",
                            "Korean",
                            "heat",
                            "juice",
                            "lamp",
                            "lights",
                            "music",
                            "newspaper",
                            "none",
                            "shoes",
                            "socks",
                            "volume",
                        ]
                    ),
                    "location": datasets.ClassLabel(names=["bedroom", "kitchen", "none", "washroom"]),
                }
            ),
            supervised_keys=None,
            url="https://fluent.ai/fluent-speech-commands-a-dataset-for-spoken-language-understanding-research/",
            data_url="http://fluent.ai:2052/jf8398hf30f0381738rucj3828chfdnchs.tar.gz",
        ),
        SuperbConfig(
            name="si",
            description=textwrap.dedent(
                """\
            Speaker Identification (SI) classifies each utterance for its speaker identity as a multi-class
            classification, where speakers are in the same predefined set for both training and testing. The widely
            used VoxCeleb1 dataset is adopted, and the evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    # VoxCeleb1 contains 1251 speaker IDs in range ["id10001",..."id11251"]
                    "label": datasets.ClassLabel(names=[f"id{i + 10001}" for i in range(1251)]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html",
        ),
        SuperbConfig(
            name="sd",
            description=textwrap.dedent(
                """\
            Speaker Diarization (SD) predicts `who is speaking when` for each timestamp, and multiple speakers can
            speak simultaneously. The model has to encode rich speaker characteristics for each frame and should be
            able to represent mixtures of signals. [LibriMix] is adopted where LibriSpeech
            train-clean-100/dev-clean/test-clean are used to generate mixtures for training/validation/testing.
            We focus on the two-speaker scenario as the first step. The time-coded speaker labels were generated using
            alignments from Kaldi LibriSpeech ASR model. The evaluation metric is diarization error rate (DER)."""
            ),
            features=datasets.Features(
                {
                    "record_id": datasets.Value("string"),
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "start": datasets.Value("int64"),
                    "end": datasets.Value("int64"),
                    "speakers": [
                        {
                            "speaker_id": datasets.Value("string"),
                            "start": datasets.Value("int64"),
                            "end": datasets.Value("int64"),
                        }
                    ],
                }
            ),  # TODO
            supervised_keys=None,  # TODO
            url="https://github.com/ftshijt/LibriMix",
            data_url="https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/{split}/{filename}",
        ),
        SuperbConfig(
            name="er",
            description=textwrap.dedent(
                """\
            Emotion Recognition (ER) predicts an emotion class for each utterance. The most widely used ER dataset
            IEMOCAP is adopted, and we follow the conventional evaluation protocol: we drop the unbalanced emotion
            classes to leave the final four classes with a similar amount of data points and cross-validate on five
            folds of the standard splits. The evaluation metric is accuracy (ACC)."""
            ),
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "label": datasets.ClassLabel(names=["neu", "hap", "ang", "sad"]),
                }
            ),
            supervised_keys=("file", "label"),
            url="https://sail.usc.edu/iemocap/",
        ),
    ]
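Each SuperbConfig above becomes a selectable configuration name at load time. A brief usage sketch, assuming the builder is published under the dataset id "superb":

from datasets import load_dataset

ks = load_dataset("superb", "ks", split="test")
print(ks.features["label"].names)      # ['yes', 'no', 'up', ..., '_silence_', '_unknown_']
print(ks[0]["file"], ks[0]["label"])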
Example no. 13
def prepare_datasets(which, args, preload_fn, dataset_transform, dataset_transform_train, small_hooks_mf0=None, small_hooks=None, valid_hooks=None, test_hooks=None):
    timer = time.time()

    if small_hooks_mf0 is None:
        small_hooks_mf0 = [MetricsHook_mf0(), VisualOutputHook_mf0(True, True, True)]
    if small_hooks is None:
        small_hooks = [MetricsHook(), VisualOutputHook(True, True, False, False)]
    if valid_hooks is None:
        valid_hooks = [MetricsHook(write_estimations=True), VisualOutputHook(False, False, True, True), SaveBestModelHook(args.logdir)]
    if test_hooks is None:
        test_hooks = [MetricsHook(write_summaries=False, print_detailed=True, write_estimations=True)]

    validation_datasets = []
    test_datasets = []
    train_data = []
    if datasets.medleydb.prefix in which:
        medleydb_train, medleydb_test, medleydb_validation, medleydb_small_validation = datasets.medleydb.prepare(preload_fn, threads=args.threads)
        medleydb_test_dataset = datasets.AADataset(medleydb_test, args, dataset_transform)
        medleydb_validation_dataset = datasets.AADataset(medleydb_validation, args, dataset_transform)
        medleydb_small_validation_dataset = datasets.AADataset(medleydb_small_validation, args, dataset_transform)
        validation_datasets += [
            VD("small_"+datasets.medleydb.prefix, medleydb_small_validation_dataset, args.evaluate_small_every, small_hooks),
            VD(datasets.medleydb.prefix, medleydb_validation_dataset, args.evaluate_every, valid_hooks),
        ]
        test_datasets += [
            VD(datasets.medleydb.prefix, medleydb_test_dataset, 0, test_hooks),
            VD(datasets.medleydb.prefix, medleydb_validation_dataset, 0, test_hooks),
        ]
        train_data += medleydb_train

    if datasets.mdb_melody_synth.prefix in which:
        mdb_melody_synth_train, mdb_melody_synth_test, mdb_melody_synth_validation, _ = datasets.mdb_melody_synth.prepare(preload_fn)
        mdb_melody_synth_test_dataset = datasets.AADataset(mdb_melody_synth_test, args, dataset_transform)
        mdb_melody_synth_validation_dataset = datasets.AADataset(mdb_melody_synth_validation, args, dataset_transform)
        validation_datasets += [
            VD(datasets.mdb_melody_synth.prefix, mdb_melody_synth_validation_dataset, args.evaluate_every, valid_hooks),
        ]
        test_datasets += [
            VD(datasets.mdb_melody_synth.prefix, mdb_melody_synth_test_dataset, 0, test_hooks),
            VD(datasets.mdb_melody_synth.prefix, mdb_melody_synth_validation_dataset, 0, test_hooks),
        ]
        train_data += mdb_melody_synth_train

    if datasets.mdb_stem_synth.prefix in which:
        mdb_stem_synth_train, mdb_stem_synth_test, mdb_stem_synth_validation, mdb_stem_synth_small_validation = datasets.mdb_stem_synth.prepare(preload_fn)
        mdb_stem_synth_small_validation_dataset = datasets.AADataset(mdb_stem_synth_small_validation, args, dataset_transform)
        mdb_stem_synth_test_dataset = datasets.AADataset(mdb_stem_synth_test, args, dataset_transform)
        mdb_stem_synth_validation_dataset = datasets.AADataset(mdb_stem_synth_validation, args, dataset_transform)
        validation_datasets += [
            VD("small_"+datasets.mdb_stem_synth.prefix, mdb_stem_synth_small_validation_dataset, args.evaluate_small_every, small_hooks),
            VD(datasets.mdb_stem_synth.prefix, mdb_stem_synth_validation_dataset, args.evaluate_every, valid_hooks),
        ]
        test_datasets += [
            VD(datasets.mdb_stem_synth.prefix, mdb_stem_synth_test_dataset, 0, test_hooks),
            VD(datasets.mdb_stem_synth.prefix, mdb_stem_synth_validation_dataset, 0, test_hooks),
        ]
        train_data += mdb_stem_synth_train

    if datasets.mdb_mf0_synth.prefix in which:
        _, _, mdb_mf0_synth_small_validation = datasets.mdb_mf0_synth.prepare(preload_fn)
        mdb_mf0_synth_small_validation_dataset = datasets.AADataset(mdb_mf0_synth_small_validation, args, dataset_transform)
        validation_datasets += [
            VD("small_"+datasets.mdb_mf0_synth.prefix, mdb_mf0_synth_small_validation_dataset, args.evaluate_small_every, small_hooks_mf0),
        ]
    
    if datasets.wjazzd.prefix in which:
        wjazzd_train, wjazzd_test, wjazzd_validation, wjazzd_small_validation = datasets.wjazzd.prepare(preload_fn)
        wjazzd_test_dataset = datasets.AADataset(wjazzd_test, args, dataset_transform)
        wjazzd_validation_dataset = datasets.AADataset(wjazzd_validation, args, dataset_transform)
        wjazzd_small_validation_dataset = datasets.AADataset(wjazzd_small_validation, args, dataset_transform)
        validation_datasets += [
            VD("small_"+datasets.wjazzd.prefix, wjazzd_small_validation_dataset, args.evaluate_small_every, small_hooks),
            VD(datasets.wjazzd.prefix, wjazzd_validation_dataset, args.evaluate_small_every, valid_hooks),
        ]
        test_datasets += [
            VD(datasets.wjazzd.prefix, wjazzd_test_dataset, 0, test_hooks),
            VD(datasets.wjazzd.prefix, wjazzd_validation_dataset, 0, test_hooks),
        ]
        train_data += wjazzd_train

    if datasets.orchset.prefix in which:
        orchset_test, orchset_small_validation = datasets.orchset.prepare(preload_fn)
        orchset_test_dataset = datasets.AADataset(orchset_test, args, dataset_transform)
        orchset_small_validation_dataset = datasets.AADataset(orchset_small_validation, args, dataset_transform)
        validation_datasets.append(VD("small_"+datasets.orchset.prefix, orchset_small_validation_dataset, args.evaluate_small_every, small_hooks))
        test_datasets.append(VD(datasets.orchset.prefix, orchset_test_dataset, 0, test_hooks))

    if datasets.adc2004.prefix in which:
        adc2004_test = datasets.adc2004.prepare(preload_fn)
        adc2004_test_dataset = datasets.AADataset(adc2004_test, args, dataset_transform)
        test_datasets.append(VD(datasets.adc2004.prefix, adc2004_test_dataset, 0, test_hooks))

    if datasets.mirex05.prefix in which:
        mirex05_test = datasets.mirex05.prepare(preload_fn)
        mirex05_test_dataset = datasets.AADataset(mirex05_test, args, dataset_transform)
        test_datasets.append(VD(datasets.mirex05.prefix, mirex05_test_dataset, 0, test_hooks))

    if "fairerhopes" in which:
        harfa_audio = datasets.Audio("/mnt/tera/jirka/V1/MatthewEntwistle_FairerHopes/MatthewEntwistle_FairerHopes_STEMS/MatthewEntwistle_FairerHopes_STEM_10.wav",
                    "augment_low")
        harfa_annot = datasets.Annotation.from_time_series("data/MatthewEntwistle_FairerHopes_STEM_10_clean.csv", "fairerhopes")
        harfa = datasets.AnnotatedAudio(harfa_annot, harfa_audio)
        preload_fn(harfa)
        # harfa.audio.samples *= 5
        mirex05_test_dataset = datasets.AADataset([harfa], args, dataset_transform)
        test_datasets.append(VD("fairerhopes", mirex05_test_dataset, 0, test_hooks))

    if train_data:
        hop_size = args.hop_size if args.hop_size is not None else None
        train_dataset = datasets.AADataset(train_data, args, dataset_transform_train, shuffle=True, hop_size=hop_size)
    else:
        # Return at least one dataset as training, since its parameters are used in network initialization
        print("Warning: using automatically selected train_dataset")
        train_dataset = (test_datasets+validation_datasets)[0].dataset

    print("datasets ready in {:.2f}s".format(time.time() - timer))

    return train_dataset, test_datasets, validation_datasets
Example no. 14
    def test_word_time_stamp_integration(self):
        import torch

        ds = load_dataset("common_voice", "en", split="train", streaming=True)
        ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
        ds_iter = iter(ds)
        sample = next(ds_iter)

        processor = AutoProcessor.from_pretrained(
            "patrickvonplaten/wav2vec2-base-100h-with-lm")
        model = Wav2Vec2ForCTC.from_pretrained(
            "patrickvonplaten/wav2vec2-base-100h-with-lm")

        # compare to filename `common_voice_en_100038.mp3` of dataset viewer on https://huggingface.co/datasets/common_voice/viewer/en/train
        input_values = processor(sample["audio"]["array"],
                                 return_tensors="pt").input_values

        with torch.no_grad():
            logits = model(input_values).logits.cpu().numpy()

        output = processor.decode(logits[0], output_word_offsets=True)

        time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate
        word_time_stamps = [{
            "start_time": d["start_offset"] * time_offset,
            "end_time": d["end_offset"] * time_offset,
            "word": d["word"],
        } for d in output["word_offsets"]]

        EXPECTED_TEXT = "WHY DOES A MILE SANDRA LOOK LIKE SHE WANTS TO CONSUME JOHN SNOW ON THE RIVER AT THE WALL"

        # output words
        self.assertEqual(
            " ".join(self.get_from_offsets(word_time_stamps, "word")),
            EXPECTED_TEXT)
        self.assertEqual(
            " ".join(self.get_from_offsets(word_time_stamps, "word")),
            output.text)

        # output times
        start_times = [
            round(x, 2)
            for x in self.get_from_offsets(word_time_stamps, "start_time")
        ]
        end_times = [
            round(x, 2)
            for x in self.get_from_offsets(word_time_stamps, "end_time")
        ]

        # fmt: off
        self.assertListEqual(
            start_times,
            [
                1.42,
                1.64,
                2.12,
                2.26,
                2.54,
                3.0,
                3.24,
                3.6,
                3.8,
                4.1,
                4.26,
                4.94,
                5.28,
                5.66,
                5.78,
                5.94,
                6.32,
                6.54,
                6.66,
            ],
        )

        self.assertListEqual(
            end_times,
            [
                1.54,
                1.88,
                2.14,
                2.46,
                2.9,
                3.18,
                3.54,
                3.72,
                4.02,
                4.18,
                4.76,
                5.16,
                5.56,
                5.7,
                5.86,
                6.2,
                6.38,
                6.62,
                6.94,
            ],
        )
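The start/end times asserted above are a plain rescaling of CTC frame offsets into seconds; a standalone sketch of that arithmetic (320 is the wav2vec2-base stride implied by model.config.inputs_to_logits_ratio):

# One logit frame per 320 input samples at 16 kHz -> 0.02 s per frame.
inputs_to_logits_ratio = 320
sampling_rate = 16_000
time_offset = inputs_to_logits_ratio / sampling_rate

word_offsets = [{"word": "WHY", "start_offset": 71, "end_offset": 77}]  # illustrative offsets
for w in word_offsets:
    print(w["word"],
          round(w["start_offset"] * time_offset, 2),
          round(w["end_offset"] * time_offset, 2))
# WHY 1.42 1.54 -- matching the first expected start/end times above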
Example no. 15
class Superb(ds.GeneratorBasedBuilder):
    BUILDER_CONFIGS = [
        SuperbConfig(
            name="asr",
            features=ds.Features({
                "file": ds.Value("string"),
                "audio": ds.Audio(sampling_rate=16_000),
                "text": ds.Value("string"),
                "speaker_id": ds.Value("int64"),
                "chapter_id": ds.Value("int64"),
                "id": ds.Value("string"),
            }),
            supervised_keys=("file", "text"),
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[
                AutomaticSpeechRecognition(audio_file_path_column="file",
                                           transcription_column="text")
            ],
        ),
        SuperbConfig(
            name="ks",
            features=ds.Features({
                "file":
                ds.Value("string"),
                "audio":
                ds.Audio(sampling_rate=16_000),
                "label":
                ds.ClassLabel(names=[
                    "yes",
                    "no",
                    "up",
                    "down",
                    "left",
                    "right",
                    "on",
                    "off",
                    "stop",
                    "go",
                    "_silence_",
                    "_unknown_",
                ]),
            }),
            supervised_keys=("file", "label"),
            data_url="http://download.tensorflow.org/data/{filename}",
        ),
        SuperbConfig(
            name="ic",
            features=ds.Features({
                "file":
                ds.Value("string"),
                "audio":
                ds.Audio(sampling_rate=16_000),
                "speaker_id":
                ds.Value("string"),
                "text":
                ds.Value("string"),
                "action":
                ds.ClassLabel(names=[
                    "activate",
                    "bring",
                    "change language",
                    "deactivate",
                    "decrease",
                    "increase",
                ]),
                "object":
                ds.ClassLabel(names=[
                    "Chinese",
                    "English",
                    "German",
                    "Korean",
                    "heat",
                    "juice",
                    "lamp",
                    "lights",
                    "music",
                    "newspaper",
                    "none",
                    "shoes",
                    "socks",
                    "volume",
                ]),
                "location":
                ds.ClassLabel(
                    names=["bedroom", "kitchen", "none", "washroom"]),
            }),
            supervised_keys=None,
            data_url=
            "http://fluent.ai:2052/jf8398hf30f0381738rucj3828chfdnchs.tar.gz",
        ),
        SuperbConfig(
            name="si",
            features=ds.Features({
                "file":
                ds.Value("string"),
                "audio":
                ds.Audio(sampling_rate=16_000),
                "label":
                ds.ClassLabel(names=[f"id{i + 10001}" for i in range(1251)]),
            }),
            supervised_keys=("file", "label"),
        ),
        SuperbConfig(
            name="sd",
            features=ds.Features({
                "record_id":
                ds.Value("string"),
                "file":
                ds.Value("string"),
                "audio":
                ds.Audio(sampling_rate=16_000),
                "start":
                ds.Value("int64"),
                "end":
                ds.Value("int64"),
                "speakers": [{
                    "speaker_id": ds.Value("string"),
                    "start": ds.Value("int64"),
                    "end": ds.Value("int64"),
                }],
            }),
            supervised_keys=None,
            data_url=
            "https://huggingface.co/datasets/superb/superb-data/resolve/main/sd/{split}/{filename}",
        ),
        SuperbConfig(
            name="er",
            features=ds.Features({
                "file":
                ds.Value("string"),
                "audio":
                ds.Audio(sampling_rate=16_000),
                "label":
                ds.ClassLabel(names=["neu", "hap", "ang", "sad"]),
            }),
            supervised_keys=("file", "label"),
        ),
    ]
Example no. 16
def prepare_datasets(which,
                     args,
                     preload_fn,
                     dataset_transform,
                     dataset_transform_train,
                     small_hooks_mf0=None,
                     small_hooks=None,
                     valid_hooks=None,
                     test_hooks=None):
    timer = time.time()

    if small_hooks_mf0 is None:
        small_hooks_mf0 = [
            MetricsHook_mf0(),
            VisualOutputHook_mf0(True, True, False)
        ]
    if small_hooks is None:
        small_hooks = [
            MetricsHook(),
            VisualOutputHook(True, True, False, False)
        ]
    if valid_hooks is None:
        valid_hooks = [
            MetricsHook(),
            VisualOutputHook(False, False, True, True),
            SaveBestModelHook(args.logdir),
            CSVOutputWriterHook()
        ]
    if test_hooks is None:
        test_hooks = [CSVOutputWriterHook()]
        if args.save_salience:
            test_hooks.append(SaveSaliencesHook())

    validation_datasets = []
    test_datasets = []
    train_data = []

    if args.predict:
        output_path = os.path.splitext(os.path.basename(args.predict))[0]
        uid = os.path.splitext(os.path.basename(args.predict))[0]
        # prepare audio
        audio = datasets.Audio(args.predict, uid)
        aa = datasets.AnnotatedAudio((None, uid), audio)
        preload_fn(aa)
        predict_dataset = datasets.AADataset([aa], args, dataset_transform)

        output_file = None
        if args.output_file:
            output_file = args.output_file
        test_datasets += [
            VD("predict", predict_dataset, 0, [
                CSVOutputWriterHook(output_path="./predict_outputs",
                                    output_file=output_file,
                                    output_format=args.output_format)
            ]),
        ]
        return predict_dataset, test_datasets, []

    if datasets.musicnet_mir.prefix in which:
        musicnet_train, musicnet_test, musicnet_validation, musicnet_small_validation = datasets.musicnet_mir.prepare(
            preload_fn, threads=args.threads)
        musicnet_test_dataset = datasets.AADataset(musicnet_test, args,
                                                   dataset_transform)
        musicnet_validation_dataset = datasets.AADataset(
            musicnet_validation, args, dataset_transform)
        musicnet_small_validation_dataset = datasets.AADataset(
            musicnet_small_validation, args, dataset_transform)

        validation_datasets += [
            VD(datasets.musicnet_mir.prefix, musicnet_validation_dataset,
               args.evaluate_every, valid_hooks),
            VD("small_" + datasets.musicnet_mir.prefix,
               musicnet_small_validation_dataset, args.evaluate_small_every,
               small_hooks_mf0),
        ]

        test_datasets += [
            # VD(datasets.musicnet_mir.prefix, musicnet_validation_dataset, 0, valid_hooks),
            VD(datasets.musicnet_mir.prefix, musicnet_test_dataset, 0,
               test_hooks),
        ]

        train_data += musicnet_train

    if datasets.maps.prefix in which:
        maps_train, maps_test, maps_validation, maps_small_validation = datasets.maps.prepare(
            preload_fn, threads=args.threads)
        maps_test_dataset = datasets.AADataset(maps_test, args,
                                               dataset_transform)
        maps_validation_dataset = datasets.AADataset(maps_validation, args,
                                                     dataset_transform)
        maps_small_validation_dataset = datasets.AADataset(
            maps_small_validation, args, dataset_transform)

        validation_datasets += [
            VD(datasets.maps.prefix, maps_validation_dataset,
               args.evaluate_every, valid_hooks),
            VD("small_" + datasets.maps.prefix, maps_small_validation_dataset,
               args.evaluate_small_every, small_hooks_mf0),
        ]

        test_datasets += [
            VD(datasets.maps.prefix, maps_test_dataset, 0, test_hooks),
            # VD(datasets.maps.prefix, maps_validation_dataset, 0, test_hooks),
        ]

        train_data += maps_train

    if datasets.medleydb.prefix in which or datasets.medleydb.prefix + "_mel4" in which:
        if datasets.medleydb.prefix + "_mel4" in which:
            annotation_type = "MELODY4"
        else:
            annotation_type = "MELODY2"
        medleydb_train, medleydb_test, medleydb_validation, medleydb_small_validation = datasets.medleydb.prepare(
            preload_fn, threads=args.threads, annotation_type=annotation_type)
        medleydb_test_dataset = datasets.AADataset(medleydb_test, args,
                                                   dataset_transform)
        medleydb_validation_dataset = datasets.AADataset(
            medleydb_validation, args, dataset_transform)
        medleydb_small_validation_dataset = datasets.AADataset(
            medleydb_small_validation, args, dataset_transform)

        validation_datasets += [
            VD(datasets.medleydb.prefix, medleydb_validation_dataset,
               args.evaluate_every, valid_hooks),
        ]
        if datasets.medleydb.prefix + "_mel4" in which:
            validation_datasets += [
                VD("small_" + datasets.medleydb.prefix,
                   medleydb_small_validation_dataset,
                   args.evaluate_small_every, small_hooks_mf0),
            ]
        else:
            validation_datasets += [
                VD("small_" + datasets.medleydb.prefix,
                   medleydb_small_validation_dataset,
                   args.evaluate_small_every, small_hooks),
            ]

        test_datasets += [
            VD(datasets.medleydb.prefix, medleydb_test_dataset, 0, test_hooks),
            VD(datasets.medleydb.prefix, medleydb_validation_dataset, 0,
               test_hooks),
        ]

        train_data += medleydb_train

    if datasets.mdb_melody_synth.prefix in which:
        mdb_melody_synth_train, mdb_melody_synth_test, mdb_melody_synth_validation, _ = datasets.mdb_melody_synth.prepare(
            preload_fn, subsets=("test", "validation"))
        mdb_melody_synth_test_dataset = datasets.AADataset(
            mdb_melody_synth_test, args, dataset_transform)
        mdb_melody_synth_validation_dataset = datasets.AADataset(
            mdb_melody_synth_validation, args, dataset_transform)
        validation_datasets += [
            VD(datasets.mdb_melody_synth.prefix,
               mdb_melody_synth_validation_dataset, args.evaluate_every,
               valid_hooks),
        ]
        test_datasets += [
            VD(datasets.mdb_melody_synth.prefix, mdb_melody_synth_test_dataset,
               0, test_hooks),
            VD(datasets.mdb_melody_synth.prefix,
               mdb_melody_synth_validation_dataset, 0, test_hooks),
        ]
        train_data += mdb_melody_synth_train

    if datasets.mdb_stem_synth.prefix in which:
        mdb_stem_synth_train, mdb_stem_synth_test, mdb_stem_synth_validation, mdb_stem_synth_small_validation = datasets.mdb_stem_synth.prepare(
            preload_fn)
        mdb_stem_synth_small_validation_dataset = datasets.AADataset(
            mdb_stem_synth_small_validation, args, dataset_transform)
        mdb_stem_synth_test_dataset = datasets.AADataset(
            mdb_stem_synth_test, args, dataset_transform)
        mdb_stem_synth_validation_dataset = datasets.AADataset(
            mdb_stem_synth_validation, args, dataset_transform)
        validation_datasets += [
            VD("small_" + datasets.mdb_stem_synth.prefix,
               mdb_stem_synth_small_validation_dataset,
               args.evaluate_small_every, small_hooks),
            VD(datasets.mdb_stem_synth.prefix,
               mdb_stem_synth_validation_dataset, args.evaluate_every,
               valid_hooks),
        ]
        test_datasets += [
            VD(datasets.mdb_stem_synth.prefix, mdb_stem_synth_test_dataset, 0,
               test_hooks),
            VD(datasets.mdb_stem_synth.prefix,
               mdb_stem_synth_validation_dataset, 0, test_hooks),
        ]
        train_data += mdb_stem_synth_train

    if datasets.mdb_mf0_synth.prefix in which:
        _, _, mdb_mf0_synth_small_validation = datasets.mdb_mf0_synth.prepare(
            preload_fn)
        mdb_mf0_synth_small_validation_dataset = datasets.AADataset(
            mdb_mf0_synth_small_validation, args, dataset_transform)
        validation_datasets += [
            VD("small_" + datasets.mdb_mf0_synth.prefix,
               mdb_mf0_synth_small_validation_dataset,
               args.evaluate_small_every, small_hooks_mf0),
        ]

    if datasets.wjazzd.prefix in which:
        wjazzd_train, wjazzd_test, wjazzd_validation, wjazzd_small_validation = datasets.wjazzd.prepare(
            preload_fn, subsets=("test", "validation"))
        wjazzd_test_dataset = datasets.AADataset(wjazzd_test, args,
                                                 dataset_transform)
        wjazzd_validation_dataset = datasets.AADataset(wjazzd_validation, args,
                                                       dataset_transform)
        wjazzd_small_validation_dataset = datasets.AADataset(
            wjazzd_small_validation, args, dataset_transform)
        validation_datasets += [
            VD("small_" + datasets.wjazzd.prefix,
               wjazzd_small_validation_dataset, args.evaluate_small_every,
               small_hooks),
            VD(datasets.wjazzd.prefix, wjazzd_validation_dataset,
               args.evaluate_small_every, valid_hooks),
        ]
        test_datasets += [
            VD(datasets.wjazzd.prefix, wjazzd_test_dataset, 0, test_hooks),
            VD(datasets.wjazzd.prefix, wjazzd_validation_dataset, 0,
               test_hooks),
        ]
        train_data += wjazzd_train

    if datasets.orchset.prefix in which:
        orchset_test, orchset_small_validation = datasets.orchset.prepare(
            preload_fn)
        orchset_test_dataset = datasets.AADataset(orchset_test, args,
                                                  dataset_transform)
        orchset_small_validation_dataset = datasets.AADataset(
            orchset_small_validation, args, dataset_transform)
        validation_datasets.append(
            VD("small_" + datasets.orchset.prefix,
               orchset_small_validation_dataset, args.evaluate_small_every,
               small_hooks))
        test_datasets.append(
            VD(datasets.orchset.prefix, orchset_test_dataset, 0, test_hooks))

    if datasets.adc2004.prefix in which:
        adc2004_test = datasets.adc2004.prepare(preload_fn)
        adc2004_test_dataset = datasets.AADataset(adc2004_test, args,
                                                  dataset_transform)
        test_datasets.append(
            VD(datasets.adc2004.prefix, adc2004_test_dataset, 0, test_hooks))

    if datasets.mirex05.prefix in which:
        mirex05_test = datasets.mirex05.prepare(preload_fn)
        mirex05_test_dataset = datasets.AADataset(mirex05_test, args,
                                                  dataset_transform)
        test_datasets.append(
            VD(datasets.mirex05.prefix, mirex05_test_dataset, 0, test_hooks))

    if "fairerhopes" in which:
        harfa_audio = datasets.Audio(
            "/mnt/tera/jirka/V1/MatthewEntwistle_FairerHopes/MatthewEntwistle_FairerHopes_STEMS/MatthewEntwistle_FairerHopes_STEM_10.wav",
            "augment_low")
        harfa_annot = datasets.Annotation.from_time_series(
            "data/MatthewEntwistle_FairerHopes_STEM_10_clean.csv",
            "fairerhopes")
        harfa = datasets.AnnotatedAudio(harfa_annot, harfa_audio)
        preload_fn(harfa)
        # harfa.audio.samples *= 5
        mirex05_test_dataset = datasets.AADataset([harfa], args,
                                                  dataset_transform)
        test_datasets.append(
            VD("fairerhopes", mirex05_test_dataset, 0, test_hooks))

    if train_data:
        hop_size = args.hop_size if args.hop_size is not None else None
        train_dataset = datasets.AADataset(train_data,
                                           args,
                                           dataset_transform_train,
                                           shuffle=True,
                                           hop_size=hop_size)
    else:
        # Return at least one dataset as training, since its parameters are used in network initialization
        print("Warning: using automatically selected train_dataset")
        train_dataset = (test_datasets + validation_datasets)[0].dataset

    print("datasets ready in {:.2f}s".format(time.time() - timer))

    return train_dataset, test_datasets, validation_datasets