def load_datasets(do_remove_special_characters=True, do_replace_hatted_characters=True, resample=True):
    common_voice_train = load_dataset("common_voice", "tr", split="train+validation", cache_dir=cache_dir)
    common_voice_test = load_dataset("common_voice", "tr", split="test", cache_dir=cache_dir)

    # Drop metadata columns that are not needed for training.
    c_names = ['accent', 'age', 'client_id', 'down_votes', 'gender',
               'locale', 'segment', 'up_votes']
    common_voice_train = common_voice_train.remove_columns(c_names)
    common_voice_test = common_voice_test.remove_columns(c_names)

    if do_remove_special_characters:
        common_voice_train = common_voice_train.map(remove_special_characters)
        common_voice_test = common_voice_test.map(remove_special_characters)

    if do_replace_hatted_characters:
        common_voice_train = common_voice_train.map(replace_hatted_characters)
        common_voice_test = common_voice_test.map(replace_hatted_characters)

    if resample:
        # Cast the audio column so clips are decoded at 16 kHz on access.
        common_voice_train = common_voice_train.cast_column('audio', Audio(sampling_rate=16_000))
        common_voice_test = common_voice_test.cast_column('audio', Audio(sampling_rate=16_000))

    return common_voice_train, common_voice_test
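# load_datasets assumes two text-cleaning helpers that are defined elsewhere.
# A minimal sketch of what they could look like for Turkish Common Voice follows;
# the exact regex character sets are assumptions, not the original definitions.
import re

chars_to_remove_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'  # illustrative punctuation set

def remove_special_characters(batch):
    # Strip punctuation and lowercase the transcript.
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

def replace_hatted_characters(batch):
    # Map circumflexed vowels to their plain Turkish counterparts.
    for hatted, plain in [('â', 'a'), ('î', 'i'), ('ô', 'o'), ('û', 'u')]:
        batch["sentence"] = batch["sentence"].replace(hatted, plain)
    return batch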
def test_push_dataset_to_hub_custom_features_audio(self):
    audio_path = os.path.join(os.path.dirname(__file__), "features", "data", "test_audio_44100.wav")
    data = {"x": [audio_path, None], "y": [0, -1]}
    features = Features({"x": Audio(), "y": Value("int32")})
    ds = Dataset.from_dict(data, features=features)

    for embed_external_files in [True, False]:
        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
            hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

            self.assertListEqual(ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(ds.features.keys()), list(hub_ds.features.keys()))
            self.assertDictEqual(ds.features, hub_ds.features)
            np.testing.assert_equal(ds[0]["x"]["array"], hub_ds[0]["x"]["array"])
            self.assertEqual(ds[1], hub_ds[1])  # don't test hub_ds[0] since audio decoding might be slightly different

            hub_ds = hub_ds.cast_column("x", Audio(decode=False))
            elem = hub_ds[0]["x"]
            path, bytes_ = elem["path"], elem["bytes"]
            self.assertTrue(bool(path) == (not embed_external_files))
            self.assertTrue(bool(bytes_) == embed_external_files)
        finally:
            self.cleanup_repo(ds_name)
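# Outside the test harness, the behavior verified above can be reproduced in a few
# lines. This is a sketch: "my-user/my-audio-ds" and "some_audio.wav" are placeholders.
from datasets import Audio, Dataset, Features, Value, load_dataset

features = Features({"x": Audio(), "y": Value("int32")})
ds = Dataset.from_dict({"x": ["some_audio.wav"], "y": [0]}, features=features)

# embed_external_files=True uploads the audio bytes themselves, so consumers
# don't need access to the original local paths.
ds.push_to_hub("my-user/my-audio-ds", embed_external_files=True)

hub_ds = load_dataset("my-user/my-audio-ds", split="train")
raw = hub_ds.cast_column("x", Audio(decode=False))[0]["x"]
print(raw["path"], len(raw["bytes"] or b""))  # path is empty when the bytes are embedded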
def process_dataset(dataset: IterableDataset, cfg: HFDatasetConvertionConfig):
    """
    Top-level method that processes a given IterableDataset into a NeMo-compatible
    dataset and writes out a NeMo-compatible manifest file.

    Args:
        dataset: HF Dataset.
        cfg: HFDatasetConvertionConfig
    """
    # Cast the audio column so clips are decoded mono at the configured sampling rate.
    dataset = dataset.cast_column("audio", Audio(cfg.sampling_rate, mono=True))

    if cfg.split_output_dir is None:
        basedir = cfg.resolved_output_dir
        manifest_filename = f"{cfg.path.replace('/', '_')}_manifest.json"
    else:
        basedir = cfg.split_output_dir
        split = os.path.split(cfg.split_output_dir)[-1]
        manifest_filename = f"{split}_{cfg.path.replace('/', '_')}_manifest.json"

        os.makedirs(cfg.split_output_dir, exist_ok=True)
        cfg.split = split

    manifest_filepath = os.path.abspath(os.path.join(basedir, manifest_filename))

    if cfg.streaming:
        convert_streaming_dataset_to_nemo(dataset, cfg, basedir=basedir, manifest_filepath=manifest_filepath)
    else:
        convert_offline_dataset_to_nemo(dataset, cfg, basedir=basedir, manifest_filepath=manifest_filepath)

    print()
    print("Dataset conversion finished!")
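# process_dataset only touches a handful of fields on its config. A minimal
# stand-in dataclass is sketched below; the field names come from the code above,
# but the defaults are assumptions.
from dataclasses import dataclass
from typing import Optional

@dataclass
class HFDatasetConvertionConfig:  # spelling follows the original identifier
    path: str                         # HF dataset path, e.g. "common_voice"
    sampling_rate: int = 16000        # target rate for the Audio cast
    streaming: bool = False           # streaming vs. offline conversion
    resolved_output_dir: str = "out"  # used when no per-split directory is given
    split_output_dir: Optional[str] = None
    split: Optional[str] = None       # filled in by process_dataset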
def main(args):
    # load dataset
    dataset = load_dataset(args.dataset, args.config, split=args.split, use_auth_token=True)

    # for testing: only process the first few examples
    # dataset = dataset.select(range(10))

    # load processor
    feature_extractor = AutoFeatureExtractor.from_pretrained(args.model_id)
    sampling_rate = feature_extractor.sampling_rate

    # resample audio to the rate the model's feature extractor expects
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # load eval pipeline
    if args.device is None:
        args.device = 0 if torch.cuda.is_available() else -1
    asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)

    # map function to decode audio and run the model
    def map_to_pred(batch):
        prediction = asr(batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s)
        batch["prediction"] = prediction["text"]
        batch["target"] = normalize_text(batch["sentence"])
        return batch

    # run inference on all examples
    result = dataset.map(map_to_pred, remove_columns=dataset.column_names)

    # compute and log results
    # do not change function below
    log_results(result, args)
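# map_to_pred relies on a normalize_text helper defined elsewhere. A typical
# implementation for this kind of evaluation is sketched below; the punctuation
# list is an assumption, not the original definition.
import re

CHARS_TO_IGNORE_REGEX = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'  # illustrative set

def normalize_text(text: str) -> str:
    # Lowercase and strip punctuation so predictions and targets compare fairly.
    return re.sub(CHARS_TO_IGNORE_REGEX, "", text.lower()).strip()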
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# In[20]:

from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# In[23]:

common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))

# In[26]:

rand_int = random.randint(0, len(common_voice_train) - 1)

print("Target text:", common_voice_train[rand_int]["sentence"])
print("Input array shape:", common_voice_train[rand_int]["audio"]["array"].shape)
print("Sampling rate:", common_voice_train[rand_int]["audio"]["sampling_rate"])

# In[27]:
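# The usual next step in this recipe maps the processor over both splits to build
# model inputs and labels. This cell is a sketch following the standard wav2vec2
# fine-tuning recipe; it is not part of the original notebook.
def prepare_dataset(batch):
    audio = batch["audio"]
    # Extract input values from the raw waveform at the already-cast 16 kHz rate.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    # Encode the transcript into label ids with the tokenizer side of the processor.
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

common_voice_train = common_voice_train.map(prepare_dataset, remove_columns=common_voice_train.column_names)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)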
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained("../model")

# Load a custom audio file (manual resampling trial, kept for reference):
# sa, sr = torchaudio.load('/home/scutum/covost_tr/tr/raw/clips/common_voice_tr_20210689.mp3')
# sa.shape, sr
# sa
# type(sa)
# np.asarray(sa.reshape(-1)).shape
# sa = librosa.resample(np.asarray(sa.reshape(-1)), 44_100, 16_000)
# sa.shape

common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16_000))
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16_000))

import IPython.display as ipd

# Listen to a random training example.
rand_int = random.randint(0, len(common_voice_train) - 1)
print(common_voice_train[rand_int]["sentence"])
ipd.Audio(data=common_voice_train[rand_int]["audio"]["array"], autoplay=True, rate=16000)

# Inspect another random example.
rand_int = random.randint(0, len(common_voice_train) - 1)
print("Target text:", common_voice_train[rand_int]["sentence"])
print("Input array shape:", common_voice_train[rand_int]["audio"]["array"].shape)
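# If the commented-out resampling trial above is ever revived, note that recent
# librosa versions (>= 0.10) require keyword arguments for the sample rates.
# Sketch only; the clip path is a placeholder.
import numpy as np
import librosa
import torchaudio

sa, sr = torchaudio.load("clip.mp3")
mono = np.asarray(sa.reshape(-1))
resampled = librosa.resample(mono, orig_sr=sr, target_sr=16_000)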