def load_datasets(do_remove_special_characters=True,
                  do_replace_hatted_characters=True,
                  resample=True):
    common_voice_train = load_dataset("common_voice",
                                      "tr",
                                      split="train+validation",
                                      cache_dir=cache_dir)
    common_voice_test = load_dataset("common_voice",
                                     "tr",
                                     split="test",
                                     cache_dir=cache_dir)
    c_names = ['accent', 'age', 'client_id', 'down_votes', 'gender',
               'locale', 'segment', 'up_votes']
    common_voice_train = common_voice_train.remove_columns(c_names)
    common_voice_test = common_voice_test.remove_columns(c_names)
    if do_remove_special_characters:
        common_voice_train = common_voice_train.map(remove_special_characters)
        common_voice_test = common_voice_test.map(remove_special_characters)
    if do_replace_hatted_characters:
        common_voice_train = common_voice_train.map(replace_hatted_characters)
        common_voice_test = common_voice_test.map(replace_hatted_characters)
    if resample:
        common_voice_train = common_voice_train.cast_column(
            'audio', Audio(sampling_rate=16_000))
        common_voice_test = common_voice_test.cast_column(
            'audio', Audio(sampling_rate=16_000))
    return common_voice_train, common_voice_test
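The two map callbacks (and cache_dir) used by load_datasets are defined elsewhere in the original script. A minimal sketch of what they typically look like for Turkish Common Voice text cleanup follows; the regex and the circumflex character map are assumptions, not the original values.

import re

chars_to_remove_regex = r'[,?.!;:"“%‘”�-]'

def remove_special_characters(batch):
    # drop punctuation and lowercase the transcript
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

def replace_hatted_characters(batch):
    # map circumflexed vowels to their plain Turkish counterparts
    for hatted, plain in (("â", "a"), ("î", "i"), ("ô", "o"), ("û", "u")):
        batch["sentence"] = batch["sentence"].replace(hatted, plain)
    return batch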
Example #2
    def test_push_dataset_to_hub_custom_features_audio(self):
        audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav")
        data = {"x": [audio_path, None], "y": [0, -1]}
        features = Features({"x": Audio(), "y": Value("int32")})
        ds = Dataset.from_dict(data, features=features)

        for embed_external_files in [True, False]:
            ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
            try:
                ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
                hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

                self.assertListEqual(ds.column_names, hub_ds.column_names)
                self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
                self.assertDictEqual(ds.features, hub_ds.features)
                np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"])
                self.assertEqual(
                    ds[1], hub_ds[1]
                )  # don't test hub_ds[0] since audio decoding might be slightly different
                hub_ds = hub_ds.cast_column("x", Audio(decode=False))
                elem = hub_ds[0]["x"]
                path, bytes_ = elem["path"], elem["bytes"]
                self.assertTrue(bool(path) == (not embed_external_files))
                self.assertTrue(bool(bytes_) == embed_external_files)
            finally:
                self.cleanup_repo(ds_name)
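Outside the test above, the same Audio(decode=False) cast is a convenient way to check whether a dataset stores its audio as embedded bytes or as file paths. A small standalone sketch (the file path is a placeholder):

from datasets import Audio, Dataset, Features, Value

features = Features({"x": Audio(), "y": Value("int32")})
ds = Dataset.from_dict({"x": ["some_clip.wav"], "y": [0]}, features=features)
ds = ds.cast_column("x", Audio(decode=False))
elem = ds[0]["x"]
# with decode=False each value is a dict with "path" and "bytes"; for a local,
# non-embedded file the path is set and the bytes slot stays empty
print(elem["path"], elem["bytes"] is None)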
def process_dataset(dataset: IterableDataset, cfg: HFDatasetConvertionConfig):
    """
    Top level method that processes a given IterableDataset to Nemo compatible dataset.
    It also writes out a nemo compatible manifest file.

    Args:
        dataset: HF Dataset.
        cfg: HFDatasetConvertionConfig
    """
    dataset = dataset.cast_column("audio", Audio(cfg.sampling_rate, mono=True))

    if cfg.split_output_dir is None:
        basedir = cfg.resolved_output_dir
        manifest_filename = f"{cfg.path.replace('/', '_')}_manifest.json"
    else:
        basedir = cfg.split_output_dir
        split = os.path.split(cfg.split_output_dir)[-1]
        manifest_filename = f"{split}_{cfg.path.replace('/', '_')}_manifest.json"

        if not os.path.exists(cfg.split_output_dir):
            os.makedirs(cfg.split_output_dir, exist_ok=True)

        cfg.split = split

    manifest_filepath = os.path.abspath(
        os.path.join(basedir, manifest_filename))

    if cfg.streaming:
        convert_streaming_dataset_to_nemo(dataset,
                                          cfg,
                                          basedir=basedir,
                                          manifest_filepath=manifest_filepath)
    else:
        convert_offline_dataset_to_nemo(dataset,
                                        cfg,
                                        basedir=basedir,
                                        manifest_filepath=manifest_filepath)

    print()
    print("Dataset conversion finished !")
Example #4
def main(args):
    # load dataset
    dataset = load_dataset(args.dataset,
                           args.config,
                           split=args.split,
                           use_auth_token=True)

    # for testing: uncomment to only process the first ten examples
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    if args.device is None:
        args.device = 0 if torch.cuda.is_available() else -1
    asr = pipeline("automatic-speech-recognition",
                   model=args.model_id,
                   device=args.device)

    # map function to decode audio
    def map_to_pred(batch):
        prediction = asr(batch["audio"]["array"],
                         chunk_length_s=args.chunk_length_s,
                         stride_length_s=args.stride_length_s)

        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on all examples
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log_results
    # do not change function below
    log_results(result, args)
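main() only consumes an args namespace, so the script needs an argument parser somewhere below it. A sketch of what that parser plausibly looks like, with flag names mirroring the attributes accessed above (defaults and help texts are assumptions):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, required=True, help="model identifier on the Hub")
    parser.add_argument("--dataset", type=str, required=True, help="dataset name")
    parser.add_argument("--config", type=str, required=True, help="dataset config, e.g. a language code")
    parser.add_argument("--split", type=str, default="test", help="dataset split to evaluate")
    parser.add_argument("--chunk_length_s", type=float, default=None, help="chunk length in seconds for long audio")
    parser.add_argument("--stride_length_s", type=float, default=None, help="stride between chunks in seconds")
    parser.add_argument("--device", type=int, default=None, help="CUDA device id, -1 for CPU")
    args = parser.parse_args()
    main(args)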
Example #5
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)


# In[20]:


from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)


# In[23]:


common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))


# In[26]:


rand_int = random.randint(0, len(common_voice_train)-1)

print("Target text:", common_voice_train[rand_int]["sentence"])
print("Input array shape:", common_voice_train[rand_int]["audio"]["array"].shape)
print("Sampling rate:", common_voice_train[rand_int]["audio"]["sampling_rate"])


# In[27]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

processor.save_pretrained("../model")
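Once the processor is saved, the usual next step is to map it over the resampled dataset to build model inputs and labels. A minimal sketch of that preprocessing function, assuming the Common Voice column names used above:

def prepare_dataset(batch):
    audio = batch["audio"]
    # turn the raw waveform into input values at the 16 kHz sampling rate
    batch["input_values"] = processor(audio["array"],
                                      sampling_rate=audio["sampling_rate"]).input_values[0]
    # encode the transcript into label ids with the tokenizer half of the processor
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

# common_voice_train = common_voice_train.map(prepare_dataset,
#                                             remove_columns=common_voice_train.column_names)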

# Load a custom audio file (trial).
#sa, sr = torchaudio.load('/home/scutum/covost_tr/tr/raw/clips/common_voice_tr_20210689.mp3')
#sa.shape, sr
#sa
#type(sa)
#np.asarray(sa.reshape(-1)).shape
#sa = librosa.resample(np.asarray(sa.reshape(-1)), 44_100, 16_000)
#sa.shape

common_voice_train = common_voice_train.cast_column(
    "audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio",
                                                  Audio(sampling_rate=16_000))

import IPython.display as ipd
rand_int = random.randint(0, len(common_voice_train) - 1)

print(common_voice_train[rand_int]["sentence"])
ipd.Audio(data=common_voice_train[rand_int]["audio"]["array"],
          autoplay=True,
          rate=16000)

rand_int = random.randint(0, len(common_voice_train) - 1)

print("Target text:", common_voice_train[rand_int]["sentence"])
print("Input array shape:",