def main(argv):
    """Print STFT statistics from both featurizers and plot the TF features.

    Args:
        argv: Command-line style sequence; ``argv[1]`` is the audio file path
            and ``argv[2]`` is the feature type passed to the featurizers.
    """
    audio_path, feature_type = argv[1], argv[2]

    featurizer_conf = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "feature_type": feature_type,
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
        "num_feature_bins": 80,
    }

    waveform = read_raw_audio(audio_path, featurizer_conf["sample_rate"])
    numpy_featurizer = NumpySpeechFeaturizer(featurizer_conf)
    tf_featurizer = TFSpeechFeaturizer(featurizer_conf)

    # Shape and mean of the STFT from each implementation, for eyeballing
    # whether the two agree.
    numpy_stft = numpy_featurizer.stft(waveform)
    print(numpy_stft.shape, np.mean(numpy_stft))
    tf_stft = tf_featurizer.stft(waveform).numpy()
    print(tf_stft.shape, np.mean(tf_stft))

    features = tf_featurizer.extract(waveform)

    plt.figure(figsize=(16, 2.5))
    axes = plt.gca()
    axes.set_title(f"{feature_type}", fontweight="bold")
    librosa.display.specshow(features.T, cmap="magma")
    # Eight evenly spaced colour-bar ticks spanning the feature value range.
    tick_values = np.linspace(features.min(), features.max(), 8, endpoint=True)
    plt.colorbar(pad=0.01, fraction=0.02, ax=axes, format="%.2f",
                 ticks=tick_values)
    plt.tight_layout()
def __init__(self, path='ConformerS.h5'):
    """Build a Conformer model and load its weights from ``path``.

    When nothing exists at ``path`` the weights file is first downloaded
    from Google Drive using the ``file_id`` stored in the config.

    Args:
        path: Location of the weights file on disk.
    """
    # the YAML config supplies featurizer settings, model settings and file_id
    cfg = Config('tamil_tech/configs/conformer_new_config.yml', learning=True)

    speech_feat = TFSpeechFeaturizer(cfg.speech_config)
    text_feat = CharFeaturizer(cfg.decoder_config)

    # only download when the weights are not already on disk
    if not os.path.exists(path):
        print("Downloading Model...")
        drive_id = cfg.file_id
        download_file_from_google_drive(drive_id, path)
        print("Downloaded Model Successfully...")

    network = Conformer(**cfg.model_config,
                        vocabulary_size=text_feat.num_classes)
    self.model = network

    # build with the featurizer's shape before loading weights
    network._build(speech_feat.shape)
    network.load_weights(path, by_name=True)
    network.summary(line_length=120)

    # attach the featurizers to the model
    network.add_featurizers(speech_feat, text_feat)

    print("Loaded Model...!")
def test_ds2():
    """Convert DeepSpeech2 to TFLite with ``greedy=False`` and ``greedy=True``."""
    config = Config(DEFAULT_YAML)
    text_featurizer = CharFeaturizer(config.decoder_config)
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = DeepSpeech2(vocabulary_size=text_featurizer.num_classes,
                        **config.model_config)
    model._build(speech_featurizer.shape)
    model.summary(line_length=150)
    model.add_featurizers(speech_featurizer=speech_featurizer,
                          text_featurizer=text_featurizer)

    # Same conversion recipe for both decoding modes; only the flag and the
    # success message differ.
    conversions = (
        (False, "Converted successfully with beam search"),
        (True, "Converted successfully with greedy"),
    )
    for use_greedy, success_message in conversions:
        tflite_fn = model.make_tflite_function(
            greedy=use_greedy).get_concrete_function()
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [tflite_fn])
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.experimental_new_converter = True
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        converter.convert()
        print(success_message)
def __init__(self, args):
    """Load the four component configs and build the corresponding models.

    Sets ``speech_config``, ``speech_featurizer``, ``am``, ``iam``, ``pqmf``,
    ``sv`` and ``vc`` on the instance.

    Args:
        args: Object exposing ``am_config``/``am_model``,
            ``iam_config``/``iam_model``, ``sv_config``/``sv_model`` and
            ``vc_config``/``vc_model`` attributes (config paths and the
            values handed to the ``build_*`` methods).
    """
    super().__init__()

    def read_yaml(path):
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only point this at trusted config files.
        with open(path) as handle:
            return yaml.load(handle, Loader=yaml.Loader)

    # The AM config names the speech config file shared by every component.
    am_config = read_yaml(args.am_config)
    self.speech_config = read_yaml(am_config['speech_config'])
    self.speech_featurizer = TFSpeechFeaturizer(self.speech_config)
    # build_am receives the config *path*, unlike the other builders below.
    self.am = self.build_am(args.am_config, args.am_model)

    iam_config = read_yaml(args.iam_config)
    iam_config.update(self.speech_config)
    iam_config['n_mels'] = iam_config['asr_features']
    # Hop in samples: one stride (ms -> samples) scaled by asr_downsample.
    iam_config['hop_size'] = (iam_config['asr_downsample'] *
                              iam_config['sample_rate'] *
                              iam_config['stride_ms'] // 1000)
    self.iam, self.pqmf = self.build_iam(iam_config, args.iam_model)

    sv_config = read_yaml(args.sv_config)
    sv_config.update(self.speech_config)
    self.sv = self.build_sv(sv_config, args.sv_model)

    vc_config = read_yaml(args.vc_config)
    vc_config.update(self.speech_config)
    # Hop in samples for a single stride.
    vc_config['hop_size'] = (vc_config['sample_rate'] *
                             vc_config['stride_ms'] // 1000)
    vc_config['sampling_rate'] = vc_config['sample_rate']
    self.vc = self.build_vc(vc_config, args.vc_model)
def test_featurizer():
    """Check that SentencePiece and subword featurizers yield matching batches.

    Builds one batch of the same transcript file with each text featurizer
    and asserts the batches have the same length and that their first
    elements agree in shape and dtype.
    """
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        # One path per element: without the separating commas Python joins
        # adjacent string literals into a single, non-existent path.
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ],
    }

    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        'feature_type': "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
    }

    here = os.path.abspath(os.path.dirname(__file__))

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(
        config, None)
    subwords_path = os.path.join(here, os.pardir, os.pardir, "vocabularies",
                                 "librispeech_train_4_1030.subwords")
    text_featurizer_subwords = SubwordFeaturizer.load_from_file(
        config, subwords_path)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(here,
                             "transcripts_librispeech_train_clean_100.tsv")

    def get_data(featurizer: TextFeaturizer):
        """Return the first batch (batch size 1) built with ``featurizer``."""
        train_dataset = ASRSliceDataset(data_paths=[data_path],
                                        speech_featurizer=speech_featurizer,
                                        text_featurizer=featurizer,
                                        stage="train",
                                        shuffle=False)
        train_data = train_dataset.create(1)
        return next(iter(train_data))

    data_sentencepiece = get_data(text_featurizer_sentencepiece)
    data_subwords = get_data(text_featurizer_subwords)

    assert len(data_sentencepiece) == len(data_subwords)
    assert data_sentencepiece[0].shape == data_subwords[0].shape
    assert data_sentencepiece[0].dtype == data_subwords[0].dtype
def test_iextract():
    """Check that ``iextract`` recovers the transcript of the first utterance.

    Takes the first batch of the dataset, decodes its labels back to text
    with the SentencePiece featurizer, and compares the result with the
    first line of the matching ``.trans.txt`` file.
    """
    config = {
        "output_path_prefix": "/data/models/asr/conformer_sentencepiece_subword",
        "model_type": "unigram",
        "target_vocab_size": 8000,
        "blank_at_zero": True,
        "beam_width": 5,
        "norm_score": True,
        # One path per element: without the separating commas Python joins
        # adjacent string literals into a single, non-existent path.
        "corpus_files": [
            "/data/datasets/LibriSpeech/train-clean-100/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-clean-360/transcripts.tsv",
            "/data/datasets/LibriSpeech/train-other-500/transcripts.tsv",
        ],
    }

    config_speech = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "num_feature_bins": 80,
        'feature_type': "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_frame": False,
    }

    text_featurizer_sentencepiece = SentencePieceFeaturizer.load_from_file(
        config, None)
    speech_featurizer = TFSpeechFeaturizer(config_speech)
    data_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             "transcripts_librispeech_train_clean_100.tsv")

    train_dataset = ASRSliceTestDataset(
        data_paths=[data_path],
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer_sentencepiece,
        stage="train",
        shuffle=False,
    )
    train_data = train_dataset.create(1)
    batch = next(iter(train_data))
    file_paths, _, _, labels = batch

    labels = text_featurizer_sentencepiece.iextract(labels)
    labels = labels.numpy()[0].decode("utf-8")

    # Open transcript: swap the "-NNNN.flac" utterance suffix for the shared
    # ".trans.txt" file name (the dot is escaped so it only matches a dot).
    file_path = file_paths[0].numpy().decode("utf-8")
    file_path = re.sub(r"(?<!\s)-[0-9]{4}\.flac", ".trans.txt", file_path)
    print(file_path)

    with open(file_path, "r") as f:
        lines = f.read().splitlines()

    # First line is "<id>-<id>-<id> <TEXT>"; capture the text part.
    m = re.search(r"[0-9]+-[0-9]+-[0-9]+\s+([\w\s]+)", lines[0])
    assert m is not None, f"unexpected transcript line: {lines[0]!r}"
    transcript = m.group(1).lower()

    assert labels == transcript
def main():
    """Run ``process_file`` sequentially over every matching dataset file.

    Reads the featurizer config named by ``args.config``, builds a
    ``TFSpeechFeaturizer`` from it, and processes each file returned by
    ``find_files(args.dataset, args.suffix)`` with a progress bar.
    Returns early, after reporting a count of zero, when no file matches.
    """
    args = parse_args()

    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # point --config at trusted files.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    model = TFSpeechFeaturizer(config)

    all_filenames = find_files(args.dataset, args.suffix)
    num_files = len(all_filenames)
    if num_files == 0:
        # Nothing to do; indexing all_filenames[0] below would raise IndexError.
        print('num files total: %d' % num_files)
        return
    print('num files total: %d' % num_files, all_filenames[0])

    # The glob wildcard is stripped so only the literal suffix is passed on.
    suffix = args.suffix.replace('*', '')

    # Files are processed one at a time. The process pool that used to be
    # created here was never given any work and never shut down, so it has
    # been removed.
    for file in tqdm(all_filenames):
        process_file(file, model, suffix)
# Transcript TSV that the dataset below is built from.
data = "/mnt/Data/ML/ASR/Raw/LibriSpeech/train-clean-100/transcripts.tsv"

# Character-level text featurizer; "vocabulary" is explicitly None here.
text_featurizer = CharFeaturizer({
    "vocabulary": None,
    "blank_at_zero": True,
    "beam_width": 5,
    "norm_score": True
})

# Speech featurizer configured for 80-bin log-mel features at 16 kHz with
# 25 ms frames and a 10 ms stride.
speech_featurizer = TFSpeechFeaturizer({
    "sample_rate": 16000,
    "frame_ms": 25,
    "stride_ms": 10,
    "num_feature_bins": 80,
    "feature_type": "log_mel_spectrogram",
    "preemphasis": 0.97,
    "normalize_signal": True,
    "normalize_feature": True,
    "normalize_per_feature": False
})

# `augments` is defined outside this fragment. The dataset is created with
# .create(4) and limited with .take(100).
# NOTE(review): presumably 4 is the batch size and 100 the number of batches
# kept - confirm against ASRSliceDataset.create.
dataset = ASRSliceDataset(stage="train", speech_featurizer=speech_featurizer,
                          text_featurizer=text_featurizer, data_paths=[data],
                          augmentations=augments, shuffle=True).create(4).take(100)

# NOTE(review): as visible here the loop never terminates and only prints a
# separator; the rest of its body appears to be cut off in this view.
while True:
    print("--------------------------------------------")
# args.saved must be truthy to continue.
# NOTE(review): presumably the path of saved model weights - confirm.
assert args.saved

# Set the "auto_mixed_precision" grappler option from args.mxp.
tf.config.optimizer.set_experimental_options(
    {"auto_mixed_precision": args.mxp})

env_util.setup_devices([args.device], cpu=args.cpu)

# The project imports are placed after the TF option and device setup above.
# NOTE(review): the ordering looks deliberate (devices configured before the
# package is imported) - confirm before moving these to the top of the file.
from tensorflow_asr.configs.config import Config
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer, SentencePieceFeaturizer, CharFeaturizer
from tensorflow_asr.models.ctc.jasper import Jasper
from tensorflow_asr.utils import app_util

config = Config(args.config)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)

# Choose the text featurizer: SentencePiece takes priority over subwords,
# and characters are the fallback.
if args.sentence_piece:
    print("Use SentencePiece ...")
    text_featurizer = SentencePieceFeaturizer(config.decoder_config)
elif args.subwords:
    print("Use subwords ...")
    text_featurizer = SubwordFeaturizer(config.decoder_config)
else:
    print("Use characters ...")
    text_featurizer = CharFeaturizer(config.decoder_config)

# Fixed seed for TensorFlow's random ops.
tf.random.set_seed(0)

# NOTE(review): the call below is cut off in this view; its arguments and
# the other branch of the condition are not visible here.
if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
# Set the "auto_mixed_precision" grappler option from args.mxp.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": args.mxp})

setup_devices([args.device])

# The project imports are placed after the TF option and device setup above.
# NOTE(review): the ordering looks deliberate - confirm before moving these
# to the top of the file.
from tensorflow_asr.configs.user_config import UserConfig
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
from tensorflow_asr.runners.base_runners import BaseTester
from model import DeepSpeech2

# Fixed seed for TensorFlow's random ops.
tf.random.set_seed(0)

# args.export must be truthy to continue.
assert args.export

# UserConfig is built from DEFAULT_YAML and args.config.
# NOTE(review): presumably args.config overrides the defaults - confirm
# against UserConfig.
config = UserConfig(DEFAULT_YAML, args.config, learning=True)
speech_featurizer = TFSpeechFeaturizer(config["speech_config"])
text_featurizer = CharFeaturizer(config["decoder_config"])

# Build DS2 model
ds2_model = DeepSpeech2(input_shape=speech_featurizer.shape,
                        arch_config=config["model_config"],
                        num_classes=text_featurizer.num_classes,
                        name="deepspeech2")
ds2_model._build(speech_featurizer.shape)
# Weights are loaded by layer name from args.saved after the model is built.
ds2_model.load_weights(args.saved, by_name=True)
ds2_model.summary(line_length=150)
ds2_model.add_featurizers(speech_featurizer, text_featurizer)

# NOTE(review): the call below is cut off in this view; the remaining
# arguments and the other branch of the condition are not visible here.
if args.tfrecords:
    test_dataset = ASRTFRecordDataset(
        data_paths=config["learning_config"]["dataset_config"]["test_paths"],
        tfrecords_dir=config["learning_config"]["dataset_config"]["tfrecords_dir"],
def test_contextnet():
    """Convert ContextNet to TFLite with and without timestamps, then run it.

    The model converted with ``timestamp=False`` is loaded into a TFLite
    interpreter and invoked once on a random 4000-sample signal; the first
    output tensor is logged.
    """
    config = Config(DEFAULT_YAML)
    text_featurizer = CharFeaturizer(config.decoder_config)
    speech_featurizer = TFSpeechFeaturizer(config.speech_config)

    model = ContextNet(vocabulary_size=text_featurizer.num_classes,
                       **config.model_config)
    model.make(speech_featurizer.shape)
    model.summary(line_length=150)
    model.add_featurizers(speech_featurizer=speech_featurizer,
                          text_featurizer=text_featurizer)

    def convert(with_timestamp):
        # Shared TFLite conversion recipe; returns the serialized model.
        tflite_fn = model.make_tflite_function(
            timestamp=with_timestamp).get_concrete_function()
        converter = tf.lite.TFLiteConverter.from_concrete_functions(
            [tflite_fn])
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.experimental_new_converter = True
        converter.target_spec.supported_ops = [
            tf.lite.OpsSet.TFLITE_BUILTINS,
            tf.lite.OpsSet.SELECT_TF_OPS,
        ]
        return converter.convert()

    tflite = convert(False)
    logger.info("Converted successfully with no timestamp")

    # The timestamp variant is only checked for convertibility; its output
    # is not used below.
    convert(True)
    logger.info("Converted successfully with timestamp")

    interpreter = tf.lite.Interpreter(model_content=tflite)
    signal = tf.random.normal([4000])

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    signal_index = input_details[0]["index"]

    # The signal input must be resized before tensors are allocated.
    interpreter.resize_tensor_input(signal_index, [4000])
    interpreter.allocate_tensors()

    interpreter.set_tensor(signal_index, signal)
    interpreter.set_tensor(
        input_details[1]["index"],
        tf.constant(text_featurizer.blank, dtype=tf.int32))
    zero_states = tf.zeros([
        config.model_config["prediction_num_rnns"], 2, 1,
        config.model_config["prediction_rnn_units"]
    ], dtype=tf.float32)
    interpreter.set_tensor(input_details[2]["index"], zero_states)

    interpreter.invoke()
    hyp = interpreter.get_tensor(output_details[0]["index"])
    logger.info(hyp)
def main():
    """Train a Conformer transducer on a main and a regularising dataset.

    Parses the command line, builds the speech and subword featurizers,
    creates train/eval datasets for both data sources, builds the model and
    optimizer under the trainer's strategy scope, then compiles and fits a
    ``MultiReaderTransducerTrainer``.
    """
    cli = argparse.ArgumentParser(prog="Conformer Training")
    cli.add_argument("--config", type=str, default=DEFAULT_YAML,
                     help="The file path of model configuration file")
    cli.add_argument("--max_ckpts", type=int, default=10,
                     help="Max number of checkpoints to keep")
    cli.add_argument("--tbs", type=int, default=None,
                     help="Train batch size per replica")
    cli.add_argument("--ebs", type=int, default=None,
                     help="Evaluation batch size per replica")
    cli.add_argument("--acs", type=int, default=None,
                     help="Train accumulation steps")
    cli.add_argument("--devices", type=int, nargs="*", default=[0],
                     help="Devices' ids to apply distributed training")
    cli.add_argument("--mxp", default=False, action="store_true",
                     help="Enable mixed precision")
    cli.add_argument("--subwords", type=str, default=None,
                     help="Path to file that stores generated subwords")
    cli.add_argument("--subwords_corpus", nargs="*", type=str, default=[],
                     help="Transcript files for generating subwords")
    cli.add_argument("--train-dir", '-td', nargs='*',
                     default=["en_ng_male_train.tsv",
                              "en_ng_female_train.tsv"])
    cli.add_argument("--train-reg-dir", '-trd', nargs='*',
                     default=["libritts_train-clean-100.tsv",
                              "libritts_train-clean-360.tsv",
                              "libritts_train-other-500.tsv"])
    cli.add_argument("--dev-dir", '-dd', nargs='*',
                     default=["en_ng_male_eval.tsv",
                              "en_ng_female_eval.tsv"])
    cli.add_argument("--dev-reg-dir", '-drd', nargs='*',
                     default=["libritts_test-other.tsv"])
    args = cli.parse_args()

    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": args.mxp})
    strategy = setup_strategy(args.devices)

    # Command-line dataset lists are stored on the config object.
    config = Config(args.config, learning=True)
    config.train_dir = args.train_dir
    config.dev_dir = args.dev_dir
    config.train_reg_dir = args.train_reg_dir
    config.dev_reg_dir = args.dev_reg_dir

    # Here config.speech_config is a path to a YAML file, not a dict.
    with open(config.speech_config) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    speech_featurizer = TFSpeechFeaturizer(speech_config)

    if args.subwords and os.path.exists(args.subwords):
        print("Loading subwords ...")
        text_featurizer = SubwordFeaturizer.load_from_file(
            config.decoder_config, args.subwords)
    else:
        print("Generating subwords ...")
        text_featurizer = SubwordFeaturizer.build_from_corpus(
            config.decoder_config, corpus_files=args.subwords_corpus)
        # NOTE(review): args.subwords is None when --subwords is omitted;
        # confirm save_to_file accepts that.
        text_featurizer.save_to_file(args.subwords)

    # Keyword arguments common to all four datasets.
    shared_kwargs = dict(speech_featurizer=speech_featurizer,
                         text_featurizer=text_featurizer,
                         cache=False,
                         shuffle=False)
    learning = config.learning_config

    train_dataset = Dataset(data_paths=config.train_dir,
                            augmentations=learning.augmentations,
                            stage="train",
                            **shared_kwargs)
    train_reg_dataset = DatasetInf(data_paths=config.train_reg_dir,
                                   augmentations=learning.augmentations,
                                   stage="train",
                                   **shared_kwargs)
    eval_dataset = Dataset(data_paths=config.dev_dir,
                           stage="eval",
                           **shared_kwargs)
    eval_reg_dataset = DatasetInf(data_paths=config.dev_reg_dir,
                                  augmentations=learning.augmentations,
                                  stage="eval",
                                  **shared_kwargs)

    conformer_trainer = MultiReaderTransducerTrainer(
        config=learning.running_config,
        text_featurizer=text_featurizer,
        strategy=strategy)

    with conformer_trainer.strategy.scope():
        # build model
        conformer = Conformer(**config.model_config,
                              vocabulary_size=text_featurizer.num_classes)
        conformer._build(speech_featurizer.shape)
        conformer.summary(line_length=120)

        opt_cfg = learning.optimizer_config
        schedule = TransformerSchedule(
            d_model=conformer.dmodel,
            warmup_steps=opt_cfg["warmup_steps"],
            max_lr=(0.05 / math.sqrt(conformer.dmodel)))
        optimizer = tf.keras.optimizers.Adam(schedule,
                                             beta_1=opt_cfg["beta1"],
                                             beta_2=opt_cfg["beta2"],
                                             epsilon=opt_cfg["epsilon"])

    conformer_trainer.compile(model=conformer,
                              optimizer=optimizer,
                              max_to_keep=args.max_ckpts)

    conformer_trainer.fit(
        train_dataset,
        train_reg_dataset,
        # alpha for regularising dataset; alpha = 1 for training dataset
        1.,
        eval_dataset,
        eval_reg_dataset,
        train_bs=args.tbs,
        eval_bs=args.ebs,
        train_acs=args.acs)
def main():
    """Run voice-conversion inference with a MultiBand MelGAN VQ generator.

    Loads the config and generator weights, obtains mel features either
    from a ``*_mel.npy`` file or by featurizing an audio file, synthesizes a
    waveform conditioned on the speaker vector in ``--speaker``, and writes
    two WAV files to the current directory: ``<stem>_gen_vc.wav`` and a
    de-pre-emphasized ``<stem>_gen_vc_depre.wav``.
    """
    parser = argparse.ArgumentParser(
        description=
        "Train MultiBand MelGAN (See detail in examples/multiband_melgan/train_multiband_melgan.py)"
    )
    parser.add_argument("--feature", '-f', required=True)
    parser.add_argument("--speaker", '-s', required=True)
    parser.add_argument("--config", '-c', required=True)
    parser.add_argument("--resume", '-r', required=True)
    args = parser.parse_args()

    # return strategy
    STRATEGY = return_strategy()

    # load config
    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # point --config at trusted files.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    with open(config['speech_config']) as f:
        speech_config = yaml.load(f, Loader=yaml.Loader)
    config.update(speech_config)
    # Hop in samples for one stride (ms -> samples).
    config['hop_size'] = config['sample_rate'] * config['stride_ms'] // 1000
    config['sampling_rate'] = config['sample_rate']
    # Command-line values are merged last, so they override same-named keys.
    config.update(vars(args))
    config["version"] = tensorflow_tts.__version__
    for key, value in config.items():
        logging.info(f"{key} = {value}")

    with STRATEGY.scope():
        encoder = Encoder(**config['encoder'])

        generator = MelGANGeneratorVQ(
            encoder=encoder,
            config=MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            name="multi_band_melgan_generator",
        )
        generator.set_shape(config['n_mels'], config['gc_channels'])

        pqmf = TFPQMF(
            MultiBandMelGANGeneratorConfig(
                **config["multiband_melgan_generator_params"]),
            dtype=tf.float32,
            name="pqmf",
        )

        # dummy input to build model, so weights exist before load_weights.
        fake_mels = tf.random.uniform(shape=[1, 100, config['n_mels']],
                                      dtype=tf.float32)
        fake_gc = tf.random.uniform(shape=[1, config['gc_channels']],
                                    dtype=tf.float32)
        y_mb_hat = generator(mels=fake_mels, gc=fake_gc,
                             training=False)['y_mb_hat']
        pqmf.synthesis(y_mb_hat)

        generator.load_weights(args.resume)
        generator.summary()

    speech_featurizer = TFSpeechFeaturizer(speech_config)
    from_mel_file = args.feature.endswith('_mel.npy')
    if from_mel_file:
        mels = tf.constant(np.load(args.feature), tf.float32)
    else:
        signal, _ = librosa.load(args.feature, sr=config['sample_rate'])
        mels = speech_featurizer.tf_extract(signal)
    mels = tf.reshape(mels, [1, -1, config['n_mels']])

    gc = tf.constant(
        np.load(args.speaker).reshape([1, config['gc_channels']]), tf.float32)
    output = generator(mels=mels, gc=gc, training=False)['y_mb_hat']
    y_hat = pqmf.synthesis(output).numpy().reshape([-1])
    print('output:', y_hat.shape)

    # Derive the output name from the input's base name. Previously the
    # second replace() restarted from args.feature and discarded the first,
    # so a ".wav" input kept its own name and could be overwritten when it
    # lived in the current directory.
    base_name = args.feature.split('/')[-1]
    if from_mel_file:
        stem = base_name[:-len('_mel.npy')]
    else:
        # Drop the extension, whatever it is (".wav", ".flac", ...).
        stem = base_name.rsplit('.', 1)[0]
    save_name = stem + '_gen_vc.wav'
    wavfile.write(save_name, config['sample_rate'], y_hat)

    def depreemphasis(signal: np.ndarray, coeff=0.97):
        """Apply x[n] = coeff * x[n-1] + signal[n], with x[0] = signal[0].

        This recursion inverts a first-order pre-emphasis
        y[n] = x[n] - coeff * x[n-1]. A falsy or non-positive ``coeff``
        returns ``signal`` unchanged.
        """
        if not coeff or coeff <= 0.0:
            return signal
        x = np.zeros(signal.shape[0], dtype=np.float32)
        x[0] = signal[0]
        for n in range(1, signal.shape[0], 1):
            x[n] = coeff * x[n - 1] + signal[n]
        return x

    y_hat = depreemphasis(y_hat)
    wavfile.write(stem + '_gen_vc_depre.wav', config['sample_rate'], y_hat)