def main(argv):
    data = np.loadtxt(FLAGS.input, dtype=str, delimiter=",")

    # calculate the number of loops to run the test
    num = len(data[0])
    batch_size = FLAGS.batch_size
    num_loops = num // batch_size  # integer division; the assert below keeps it exact
    assert num % batch_size == 0

    with tf.device("/gpu:0"):
        tf.set_random_seed(1234)
        tfconf = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=tfconf) as sess:
            params = model_registry.GetParams(
                'asr.librispeech.Librispeech960Wpm', 'Test')
            params.cluster.worker.gpus_per_replica = 1
            cluster = cluster_factory.Cluster(params.cluster)
            with cluster, tf.device(cluster.GetPlacer()):
                params.vn.global_vn = False
                params.random_seed = 1234
                params.is_eval = True
                model = params.cls(params)
                task = model.GetTask()
                saver = tf.train.Saver()
                saver.restore(sess, FLAGS.checkpoint)

                # define the placeholders
                input_tf = tf.placeholder(tf.float32, shape=[batch_size, None])
                tgt_tf = tf.placeholder(tf.string)
                sample_rate_tf = tf.placeholder(tf.int32)
                mask_tf = tf.placeholder(tf.float32, shape=[batch_size, None, 80])

                # generate the features and inputs
                features = create_features(input_tf, sample_rate_tf, mask_tf)
                shape = tf.shape(features)
                inputs = create_inputs(model, features, tgt_tf, batch_size, mask_tf)

                # loss
                metrics = task.FPropDefaultTheta(inputs)
                loss = tf.get_collection("per_loss")[0]

                # prediction
                decoded_outputs = task.Decode(inputs)
                dec_metrics_dict = task.CreateDecoderMetrics()

                correct = 0
                for l in range(num_loops):
                    data_sub = data[:, l * batch_size:(l + 1) * batch_size]
                    audios_np, sample_rate, tgt_np, mask_freq = Read_input(
                        data_sub, batch_size)
                    feed_dict = {
                        input_tf: audios_np,
                        sample_rate_tf: sample_rate,
                        tgt_tf: tgt_np,
                        mask_tf: mask_freq,
                    }
                    losses = sess.run(loss, feed_dict)
                    predictions = sess.run(decoded_outputs, feed_dict)

                    task.PostProcessDecodeOut(predictions, dec_metrics_dict)
                    wer_value = dec_metrics_dict['wer'].value * 100.

                    for i in range(batch_size):
                        print("pred: {}".format(predictions['topk_decoded'][i, 0]))
                        print("targ: {}".format(tgt_np[i].lower()))
                        print("true: {}".format(data_sub[1, i].lower()))
                        if predictions['topk_decoded'][i, 0] == tgt_np[i].lower():
                            correct += 1
                            print("------------------------------")
                            print("example {} succeeds".format(i))
                    print("Now, the WER is: {0:.2f}%".format(wer_value))

                print("num of examples succeed: {}".format(correct))
                print("success rate: {}%".format(correct / float(num) * 100))
def __init__(
    self,
    sess,
    batch_size=1,
    lr_stage1=100,
    lr_stage2=0.1,
    num_iter_stage1=1000,
    num_iter_stage2=4000,
    th=None,
    psd_max_ori=None,
):
    self.sess = sess
    self.num_iter_stage1 = num_iter_stage1
    self.num_iter_stage2 = num_iter_stage2
    self.batch_size = batch_size
    self.lr_stage1 = lr_stage1
    self.lr_stage2 = lr_stage2

    tf.set_random_seed(1234)
    params = model_registry.GetParams("asr.librispeech.Librispeech960Wpm", "Test")
    params.random_seed = 1234
    params.is_eval = True
    params.cluster.worker.gpus_per_replica = 1
    cluster = cluster_factory.Cluster(params.cluster)
    with cluster, tf.device(cluster.GetPlacer()):
        model = params.cls(params)

        # adversarial perturbation, padded to the longest utterance in the dataset
        self.delta_large = tf.Variable(
            np.zeros((batch_size, FLAGS.max_length_dataset), dtype=np.float32),
            name="qq_delta",
        )

        # placeholders
        self.input_tf = tf.placeholder(
            tf.float32, shape=[batch_size, None], name="qq_input")
        self.tgt_tf = tf.placeholder(tf.string)
        self.rir = tf.placeholder(tf.float32)
        self.sample_rate_tf = tf.placeholder(tf.int32, name="qq_sample_rate")
        self.mask = tf.placeholder(
            dtype=np.float32, shape=[batch_size, None], name="qq_mask")
        self.mask_freq = tf.placeholder(dtype=np.float32, shape=[batch_size, None, 80])
        self.noise = tf.placeholder(np.float32, shape=[batch_size, None], name="qq_noise")
        self.maxlen = tf.placeholder(np.int32)
        self.lr = tf.placeholder(np.float32)
        self.lengths = tf.placeholder(np.int32, shape=[batch_size])

        # variable
        self.rescale = tf.Variable(
            np.ones((batch_size, 1), dtype=np.float32) * FLAGS.initial_bound,
            name="qq_rescale",
        )

        # extract the delta for the current batch and clip it to the rescaled bound
        self.delta = tf.slice(tf.identity(self.delta_large), [0, 0],
                              [batch_size, self.maxlen])
        self.apply_delta = tf.clip_by_value(self.delta, -self.rescale, self.rescale)
        self.before_rir = tf.clip_by_value(
            self.apply_delta * self.mask + self.input_tf, -(2**15), 2**15 - 1)
        # simulate over-the-air playback by convolving with a room impulse response
        self.new_input = create_speech_rir(
            self.before_rir, self.rir, self.lengths, self.maxlen,
            self.batch_size) * self.mask
        self.pass_in = tf.clip_by_value(self.new_input + self.noise,
                                        -(2**15), 2**15 - 1)

        # generate the inputs that are needed for the lingvo model
        self.features = create_features(self.pass_in, self.sample_rate_tf,
                                        self.mask_freq)
        self.inputs = create_inputs(model, self.features, self.tgt_tf,
                                    self.batch_size, self.mask_freq)

        task = model.GetTask()
        metrics = task.FPropDefaultTheta(self.inputs)
        # self.celoss with the shape (batch_size)
        self.celoss = tf.get_collection("per_loss")[0]
        self.decoded = task.Decode(self.inputs)

        # stage-1 update: Adam applied to the sign of the gradient of the CE loss
        self.optimizer1 = tf.train.AdamOptimizer(self.lr)
        grad1, var1 = self.optimizer1.compute_gradients(self.celoss,
                                                        [self.delta_large])[0]
        self.train1 = self.optimizer1.apply_gradients([(tf.sign(grad1), var1)])
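# A minimal usage sketch, not part of the original code. Assumptions: the
# __init__ above belongs to an attack class named `Attack` in this module, and
# FLAGS.batch_size / FLAGS.checkpoint are defined as in the test scripts. It
# only builds the graph and restores the ASR weights; the optimization loop
# itself is omitted.
def _build_attack_sketch():
    with tf.device("/gpu:0"):
        tfconf = tf.ConfigProto(allow_soft_placement=True)
        sess = tf.Session(config=tfconf)
        attack = Attack(sess, batch_size=FLAGS.batch_size)

        # initialize everything, then restore only the variables that actually
        # exist in the Lingvo checkpoint (the attack's own "qq_*" variables and
        # the optimizer slots are not stored there)
        sess.run(tf.global_variables_initializer())
        ckpt_vars = set(
            tf.train.load_checkpoint(FLAGS.checkpoint)
            .get_variable_to_shape_map().keys())
        restore_vars = [v for v in tf.global_variables() if v.op.name in ckpt_vars]
        tf.train.Saver(restore_vars).restore(sess, FLAGS.checkpoint)
        return sess, attack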
def main(argv):
    data = np.loadtxt(FLAGS.input, dtype=str, delimiter=",")

    # calculate the number of loops to run the test
    num = len(data[0])
    batch_size = FLAGS.batch_size
    num_loops = num // batch_size  # integer division; the assert below keeps it exact
    assert num % batch_size == 0

    with tf.device("/gpu:0"):
        tf.set_random_seed(1234)
        tfconf = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=tfconf) as sess:
            params = model_registry.GetParams(
                "asr.librispeech.Librispeech960Wpm", "Test")
            params.cluster.worker.gpus_per_replica = 1
            cluster = cluster_factory.Cluster(params.cluster)
            with cluster, tf.device(cluster.GetPlacer()):
                params.vn.global_vn = False
                params.random_seed = 1234
                params.is_eval = True
                model = params.cls(params)
                task = model.GetTask()
                saver = tf.train.Saver()
                saver.restore(sess, FLAGS.checkpoint)

                # define the placeholders
                input_tf = tf.placeholder(tf.float32, shape=[batch_size, None])
                tgt_tf = tf.placeholder(tf.string)
                sample_rate_tf = tf.placeholder(tf.int32)
                mask_tf = tf.placeholder(tf.float32, shape=[batch_size, None, 80])
                rir_tf = tf.placeholder(tf.float32)
                lengths = tf.placeholder(np.int32, shape=[batch_size])
                maxlen = tf.placeholder(np.int32)
                mask = tf.placeholder(dtype=np.float32, shape=[batch_size, None])

                # generate the features and inputs, simulating playback in a room
                new_input = create_speech_rir(input_tf, rir_tf, lengths, maxlen,
                                              batch_size) * mask
                features = create_features(new_input, sample_rate_tf, mask_tf)
                shape = tf.shape(features)
                inputs = create_inputs(model, features, tgt_tf, batch_size, mask_tf)

                # loss
                metrics = task.FPropDefaultTheta(inputs)
                loss = tf.get_collection("per_loss")[0]

                # prediction
                decoded_outputs = task.Decode(inputs)
                dec_metrics_dict = task.CreateDecoderMetrics()

                success_rates = []
                for num_room in range(FLAGS.num_test_rooms):
                    correct = 0
                    rir = Readrir(num_room)

                    for l in range(num_loops):
                        data_sub = data[:, l * batch_size:(l + 1) * batch_size]
                        (audios_np, sample_rate, tgt_np, mask_freq, lengths_np,
                         max_len, masks) = Read_input(data_sub, batch_size)
                        feed_dict = {
                            input_tf: audios_np,
                            sample_rate_tf: sample_rate,
                            tgt_tf: tgt_np,
                            mask_tf: mask_freq,
                            rir_tf: rir,
                            lengths: lengths_np,
                            maxlen: max_len,
                            mask: masks,
                        }
                        losses = sess.run(loss, feed_dict)
                        predictions = sess.run(decoded_outputs, feed_dict)

                        task.PostProcessDecodeOut(predictions, dec_metrics_dict)
                        wer_value = dec_metrics_dict["wer"].value * 100.0

                        for i in range(batch_size):
                            print("example: {}, loss_ce: {}".format(
                                l * batch_size + i, losses[i]))
                            print("pred: {}".format(predictions["topk_decoded"][i, 0]))
                            print("targ: {}".format(tgt_np[i].lower()))
                            print("true: {}".format(data_sub[1, i].lower()))
                            if predictions["topk_decoded"][i, 0] == tgt_np[i].lower():
                                correct += 1
                            print("--------------------------------")
                        print("Now, the WER is: {0:.2f}%".format(wer_value))

                    print("num of examples succeed for room {}: {}".format(
                        num_room, correct))
                    success_rate = correct / float(num) * 100
                    print("success rate for room {}: {}%".format(num_room, success_rate))
                    success_rates.append(success_rate)

                success_ave = float(sum(success_rates)) / len(success_rates)
                print("success rate overall: {}%".format(success_ave))
def __init__(self,
             sess,
             batch_size=1,
             lr_stage1=100,
             lr_stage2=0.1,
             num_iter_stage1=1000,
             num_iter_stage2=4000,
             th=None,
             psd_max_ori=None):
    self.sess = sess
    self.num_iter_stage1 = num_iter_stage1
    self.num_iter_stage2 = num_iter_stage2
    self.batch_size = batch_size
    self.lr_stage1 = lr_stage1

    tf.set_random_seed(1234)
    params = model_registry.GetParams('asr.librispeech.Librispeech960Wpm', 'Test')
    params.random_seed = 1234
    params.is_eval = True
    params.cluster.worker.gpus_per_replica = 1
    cluster = cluster_factory.Cluster(params.cluster)
    with cluster, tf.device(cluster.GetPlacer()):
        model = params.cls(params)

        # adversarial perturbation, padded to the longest utterance in the dataset
        self.delta_large = tf.Variable(
            np.zeros((batch_size, FLAGS.max_length_dataset), dtype=np.float32),
            name='qq_delta')

        # placeholders
        self.input_tf = tf.placeholder(
            tf.float32, shape=[batch_size, None], name='qq_input')
        self.tgt_tf = tf.placeholder(tf.string)
        self.sample_rate_tf = tf.placeholder(tf.int32, name='qq_sample_rate')
        self.th = tf.placeholder(
            tf.float32, shape=[batch_size, None, None], name='qq_th')
        self.psd_max_ori = tf.placeholder(tf.float32, shape=[batch_size], name='qq_psd')
        self.mask = tf.placeholder(
            dtype=np.float32, shape=[batch_size, None], name='qq_mask')
        self.mask_freq = tf.placeholder(dtype=np.float32, shape=[batch_size, None, 80])
        self.noise = tf.placeholder(np.float32, shape=[batch_size, None], name="qq_noise")
        self.maxlen = tf.placeholder(np.int32)
        self.lr_stage2 = tf.placeholder(np.float32)

        # variables: per-example rescaling of the perturbation bound, and the
        # adaptive weight alpha on the masking-threshold loss
        self.rescale = tf.Variable(np.ones((batch_size, 1), dtype=np.float32),
                                   name='qq_rescale')
        self.alpha = tf.Variable(np.ones((batch_size,), dtype=np.float32) * 0.05,
                                 name='qq_alpha')

        # extract the delta for the current batch and clip it to the bound
        self.delta = tf.slice(tf.identity(self.delta_large), [0, 0],
                              [batch_size, self.maxlen])
        self.apply_delta = tf.clip_by_value(self.delta, -FLAGS.initial_bound,
                                            FLAGS.initial_bound) * self.rescale
        self.new_input = self.apply_delta * self.mask + self.input_tf
        self.pass_in = tf.clip_by_value(self.new_input + self.noise,
                                        -2**15, 2**15 - 1)

        # generate the inputs that are needed for the lingvo model
        self.features = create_features(self.pass_in, self.sample_rate_tf,
                                        self.mask_freq)
        self.inputs = create_inputs(model, self.features, self.tgt_tf,
                                    self.batch_size, self.mask_freq)

        task = model.GetTask()
        metrics = task.FPropDefaultTheta(self.inputs)
        # self.celoss with the shape (batch_size)
        self.celoss = tf.get_collection("per_loss")[0]
        self.decoded = task.Decode(self.inputs)

        # compute the loss for the masking threshold (imperceptibility term)
        self.loss_th_list = []
        self.transform = Transform(FLAGS.window_size)
        for i in range(self.batch_size):
            logits_delta = self.transform(self.apply_delta[i, :], self.psd_max_ori[i])
            loss_th = tf.reduce_mean(tf.nn.relu(logits_delta - self.th[i]))
            loss_th = tf.expand_dims(loss_th, axis=0)
            self.loss_th_list.append(loss_th)
        self.loss_th = tf.concat(self.loss_th_list, axis=0)

        # stage 1 uses signed gradients on the CE loss only; stage 2 jointly
        # minimizes the CE loss and the alpha-weighted masking-threshold loss
        self.optimizer1 = tf.train.AdamOptimizer(self.lr_stage1)
        self.optimizer2 = tf.train.AdamOptimizer(self.lr_stage2)

        grad1, var1 = self.optimizer1.compute_gradients(self.celoss,
                                                        [self.delta_large])[0]
        grad21, var21 = self.optimizer2.compute_gradients(self.celoss,
                                                          [self.delta_large])[0]
        grad22, var22 = self.optimizer2.compute_gradients(self.alpha * self.loss_th,
                                                          [self.delta_large])[0]

        self.train1 = self.optimizer1.apply_gradients([(tf.sign(grad1), var1)])
        self.train21 = self.optimizer2.apply_gradients([(grad21, var21)])
        self.train22 = self.optimizer2.apply_gradients([(grad22, var22)])
        self.train2 = tf.group(self.train21, self.train22)
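# Illustrative sketch only (not the repo's Transform class): the stage-2
# imperceptibility term built above is, per example, a hinge loss that
# penalizes the perturbation's power spectral density only where it rises
# above the psychoacoustic masking threshold of the original audio. A NumPy
# equivalent of tf.reduce_mean(tf.nn.relu(logits_delta - self.th[i])) is:
def masking_threshold_hinge_loss_np(psd_delta, th):
    """psd_delta: (frames, bins) PSD of the perturbation, standing in for the
    output of Transform(apply_delta, psd_max_ori); th: (frames, bins) masking
    threshold. Both shapes are assumptions for illustration."""
    return np.mean(np.maximum(psd_delta - th, 0.0))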
def main():
    checkpoint = "./model/ckpt-00908156"
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('--dirs', type=str, nargs='+', required=True,
                        help='Filepath of original input audio')
    args = parser.parse_args()
    # clear argv so the Lingvo/absl flag parser does not see the argparse arguments
    while len(sys.argv) > 1:
        sys.argv.pop()

    with tf.device("/gpu:0"):
        tf.set_random_seed(1234)
        tfconf = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=tfconf) as sess:
            params = model_registry.GetParams(
                'asr.librispeech.Librispeech960Wpm', 'Test')
            params.cluster.worker.gpus_per_replica = 1
            cluster = cluster_factory.Cluster(params.cluster)
            with cluster, tf.device(cluster.GetPlacer()):
                params.vn.global_vn = False
                params.random_seed = 1234
                params.is_eval = True
                model = params.cls(params)
                task = model.GetTask()
                saver = tf.train.Saver()
                saver.restore(sess, checkpoint)

                # define the placeholders (batch size 1)
                input_tf = tf.placeholder(tf.float32, shape=[1, None])
                tgt_tf = tf.placeholder(tf.string)
                sample_rate_tf = tf.placeholder(tf.int32)
                mask_tf = tf.placeholder(tf.float32, shape=[1, None, 80])

                # generate the features and inputs
                features = create_features(input_tf, sample_rate_tf, mask_tf)
                shape = tf.shape(features)
                inputs = create_inputs(model, features, tgt_tf, 1, mask_tf)

                # loss
                metrics = task.FPropDefaultTheta(inputs)
                loss = tf.get_collection("per_loss")[0]

                # prediction
                decoded_outputs = task.Decode(inputs)
                dec_metrics_dict = task.CreateDecoderMetrics()

                for audio_dir in args.dirs:
                    file_names = _get_file_names(audio_dir)
                    transcriptions = {}
                    for fidx, file_name in enumerate(file_names):
                        audios_np, sample_rate, tgt_np, mask_freq = _decode_audio(
                            audio_dir, file_name)
                        feed_dict = {
                            input_tf: audios_np,
                            sample_rate_tf: sample_rate,
                            tgt_tf: tgt_np,
                            mask_tf: mask_freq,
                        }
                        try:
                            losses = sess.run(loss, feed_dict)
                            predictions = sess.run(decoded_outputs, feed_dict)
                        except Exception:
                            print("Error in transcribing: ", file_name)
                            continue

                        task.PostProcessDecodeOut(predictions, dec_metrics_dict)
                        wer_value = dec_metrics_dict['wer'].value * 100.
                        transcriptions[file_name] = predictions['topk_decoded'][0, 0].lower()
                        print(fidx, "pred-{},{} : {}".format(
                            audio_dir, file_name, predictions['topk_decoded'][0, 0]))

                    with open(join(audio_dir, "transcriptions.json"), 'w') as f:
                        f.write(json.dumps(transcriptions))