Example #1
def main(argv):
    data = np.loadtxt(FLAGS.input, dtype=str, delimiter=",")
    # calculate the number of loops to run the test
    num = len(data[0])
    batch_size = FLAGS.batch_size
    num_loops = num // batch_size  # integer division so range() below gets an int
    assert num % batch_size == 0

    with tf.device("/gpu:0"):
        tf.set_random_seed(1234)
        tfconf = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=tfconf) as sess:
            params = model_registry.GetParams(
                'asr.librispeech.Librispeech960Wpm', 'Test')
            params.cluster.worker.gpus_per_replica = 1
            cluster = cluster_factory.Cluster(params.cluster)
            with cluster, tf.device(cluster.GetPlacer()):
                params.vn.global_vn = False
                params.random_seed = 1234
                params.is_eval = True
                model = params.cls(params)
                task = model.GetTask()
                saver = tf.train.Saver()
                saver.restore(sess, FLAGS.checkpoint)

                # define the placeholders
                input_tf = tf.placeholder(tf.float32, shape=[batch_size, None])
                tgt_tf = tf.placeholder(tf.string)
                sample_rate_tf = tf.placeholder(tf.int32)
                mask_tf = tf.placeholder(tf.float32,
                                         shape=[batch_size, None, 80])

                # generate the features and inputs
                features = create_features(input_tf, sample_rate_tf, mask_tf)
                shape = tf.shape(features)
                inputs = create_inputs(model, features, tgt_tf, batch_size,
                                       mask_tf)

                # loss
                metrics = task.FPropDefaultTheta(inputs)
                loss = tf.get_collection("per_loss")[0]

                # prediction
                decoded_outputs = task.Decode(inputs)
                dec_metrics_dict = task.CreateDecoderMetrics()

                correct = 0
                for l in range(num_loops):
                    data_sub = data[:, l * batch_size:(l + 1) * batch_size]
                    audios_np, sample_rate, tgt_np, mask_freq = Read_input(
                        data_sub, batch_size)
                    feed_dict = {
                        input_tf: audios_np,
                        sample_rate_tf: sample_rate,
                        tgt_tf: tgt_np,
                        mask_tf: mask_freq
                    }

                    losses = sess.run(loss, feed_dict)
                    predictions = sess.run(decoded_outputs, feed_dict)

                    task.PostProcessDecodeOut(predictions, dec_metrics_dict)
                    wer_value = dec_metrics_dict['wer'].value * 100.

                    for i in range(batch_size):
                        print("pred:{}".format(predictions['topk_decoded'][i,
                                                                           0]))
                        print("targ:{}".format(tgt_np[i].lower()))
                        print("true: {}".format(data_sub[1, i].lower()))

                        if predictions['topk_decoded'][i,
                                                       0] == tgt_np[i].lower():
                            correct += 1
                            print("------------------------------")
                            print("example {} succeeds".format(i))

                    print("Now, the WER is: {0:.2f}%".format(wer_value))
                print("num of examples succeed: {}".format(correct))
                print("success rate: {}%".format(correct / float(num) * 100))
Example #2
    def __init__(
        self,
        sess,
        batch_size=1,
        lr_stage1=100,
        lr_stage2=0.1,
        num_iter_stage1=1000,
        num_iter_stage2=4000,
        th=None,
        psd_max_ori=None,
    ):

        self.sess = sess
        self.num_iter_stage1 = num_iter_stage1
        self.num_iter_stage2 = num_iter_stage2
        self.batch_size = batch_size
        self.lr_stage1 = lr_stage1
        self.lr_stage2 = lr_stage2

        tf.set_random_seed(1234)
        params = model_registry.GetParams("asr.librispeech.Librispeech960Wpm",
                                          "Test")
        params.random_seed = 1234
        params.is_eval = True
        params.cluster.worker.gpus_per_replica = 1
        cluster = cluster_factory.Cluster(params.cluster)
        with cluster, tf.device(cluster.GetPlacer()):
            model = params.cls(params)
            self.delta_large = tf.Variable(
                np.zeros((batch_size, FLAGS.max_length_dataset),
                         dtype=np.float32),
                name="qq_delta",
            )

            # placeholders
            self.input_tf = tf.placeholder(tf.float32,
                                           shape=[batch_size, None],
                                           name="qq_input")
            self.tgt_tf = tf.placeholder(tf.string)
            self.rir = tf.placeholder(tf.float32)

            self.sample_rate_tf = tf.placeholder(tf.int32,
                                                 name="qq_sample_rate")
            self.mask = tf.placeholder(dtype=np.float32,
                                       shape=[batch_size, None],
                                       name="qq_mask")
            self.mask_freq = tf.placeholder(dtype=np.float32,
                                            shape=[batch_size, None, 80])
            self.noise = tf.placeholder(np.float32,
                                        shape=[batch_size, None],
                                        name="qq_noise")
            self.maxlen = tf.placeholder(np.int32)
            self.lr = tf.placeholder(np.float32)
            self.lengths = tf.placeholder(
                np.int32,
                shape=[
                    batch_size,
                ],
            )

            # variable
            self.rescale = tf.Variable(
                np.ones(
                    (batch_size, 1), dtype=np.float32) * FLAGS.initial_bound,
                name="qq_rescale",
            )

            # extract the delta
            self.delta = tf.slice(tf.identity(self.delta_large), [0, 0],
                                  [batch_size, self.maxlen])
            self.apply_delta = tf.clip_by_value(self.delta, -self.rescale,
                                                self.rescale)
            self.before_rir = tf.clip_by_value(
                self.apply_delta * self.mask + self.input_tf, -(2**15),
                2**15 - 1)
            self.new_input = (create_speech_rir(
                self.before_rir,
                self.rir,
                self.lengths,
                self.maxlen,
                self.batch_size,
            ) * self.mask)
            self.pass_in = tf.clip_by_value(self.new_input + self.noise,
                                            -(2**15), 2**15 - 1)

            # generate the inputs that are needed for the lingvo model
            self.features = create_features(self.pass_in, self.sample_rate_tf,
                                            self.mask_freq)
            self.inputs = create_inputs(model, self.features, self.tgt_tf,
                                        self.batch_size, self.mask_freq)

            task = model.GetTask()
            metrics = task.FPropDefaultTheta(self.inputs)

            # self.celoss with the shape (batch_size)
            self.celoss = tf.get_collection("per_loss")[0]
            self.decoded = task.Decode(self.inputs)

        self.optimizer1 = tf.train.AdamOptimizer(self.lr)
        grad1, var1 = self.optimizer1.compute_gradients(
            self.celoss, [self.delta_large])[0]
        self.train1 = self.optimizer1.apply_gradients([(tf.sign(grad1), var1)])
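
The constructor above is shown without its enclosing class statement. A minimal usage sketch, assuming the class is named Attack (a placeholder name) and that FLAGS and the GPU session are set up as in the other examples:

with tf.device("/gpu:0"):
    tfconf = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=tfconf) as sess:
        # "Attack" is a hypothetical class name; the keyword values mirror the defaults above
        attack = Attack(sess,
                        batch_size=FLAGS.batch_size,
                        lr_stage1=100,
                        lr_stage2=0.1,
                        num_iter_stage1=1000,
                        num_iter_stage2=4000)
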
Example #3
def main(argv):
    data = np.loadtxt(FLAGS.input, dtype=str, delimiter=",")
    # calculate the number of loops to run the test
    num = len(data[0])
    batch_size = FLAGS.batch_size
    num_loops = num // batch_size  # integer division so range() below gets an int
    assert num % batch_size == 0

    with tf.device("/gpu:0"):
        tf.set_random_seed(1234)
        tfconf = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=tfconf) as sess:
            params = model_registry.GetParams(
                "asr.librispeech.Librispeech960Wpm", "Test")
            params.cluster.worker.gpus_per_replica = 1
            cluster = cluster_factory.Cluster(params.cluster)
            with cluster, tf.device(cluster.GetPlacer()):
                params.vn.global_vn = False
                params.random_seed = 1234
                params.is_eval = True
                model = params.cls(params)
                task = model.GetTask()
                saver = tf.train.Saver()
                saver.restore(sess, FLAGS.checkpoint)

                # define the placeholders
                input_tf = tf.placeholder(tf.float32, shape=[batch_size, None])
                tgt_tf = tf.placeholder(tf.string)
                sample_rate_tf = tf.placeholder(tf.int32)
                mask_tf = tf.placeholder(tf.float32,
                                         shape=[batch_size, None, 80])
                rir_tf = tf.placeholder(tf.float32)
                lengths = tf.placeholder(
                    np.int32,
                    shape=[
                        batch_size,
                    ],
                )
                maxlen = tf.placeholder(np.int32)
                mask = tf.placeholder(dtype=np.float32,
                                      shape=[batch_size, None])

                # generate the features and inputs
                new_input = (create_speech_rir(input_tf, rir_tf, lengths,
                                               maxlen, batch_size) * mask)
                features = create_features(new_input, sample_rate_tf, mask_tf)
                shape = tf.shape(features)
                inputs = create_inputs(model, features, tgt_tf, batch_size,
                                       mask_tf)

                # loss
                metrics = task.FPropDefaultTheta(inputs)
                loss = tf.get_collection("per_loss")[0]

                # prediction
                decoded_outputs = task.Decode(inputs)
                dec_metrics_dict = task.CreateDecoderMetrics()

                success_rates = []
                for num_room in range(FLAGS.num_test_rooms):
                    correct = 0
                    rir = Readrir(num_room)

                    for l in range(num_loops):
                        data_sub = data[:, l * batch_size:(l + 1) * batch_size]
                        (
                            audios_np,
                            sample_rate,
                            tgt_np,
                            mask_freq,
                            lengths_np,
                            max_len,
                            masks,
                        ) = Read_input(data_sub, batch_size)

                        feed_dict = {
                            input_tf: audios_np,
                            sample_rate_tf: sample_rate,
                            tgt_tf: tgt_np,
                            mask_tf: mask_freq,
                            rir_tf: rir,
                            lengths: lengths_np,
                            maxlen: max_len,
                            mask: masks,
                        }

                        losses = sess.run(loss, feed_dict)
                        predictions = sess.run(decoded_outputs, feed_dict)

                        task.PostProcessDecodeOut(predictions,
                                                  dec_metrics_dict)
                        wer_value = dec_metrics_dict["wer"].value * 100.0

                        for i in range(batch_size):
                            print("example: {}, loss_ce: {}".format(
                                l * batch_size + i, losses[i]))
                            print("pred:{}".format(
                                predictions["topk_decoded"][i, 0]))
                            print("targ:{}".format(tgt_np[i].lower()))
                            print("true: {}".format(data_sub[1, i].lower()))

                            if predictions["topk_decoded"][
                                    i, 0] == tgt_np[i].lower():
                                correct += 1

                        print("--------------------------------")
                        print("Now, the WER is: {0:.2f}%".format(wer_value))

                    print("num of examples succeed for room {}: {}".format(
                        num_room, correct))
                    success_rate = correct / float(num) * 100
                    print("success rate for room {}: {}%".format(
                        num_room, success_rate))

                    success_rates.append(success_rate)
                success_ave = float(sum(success_rates)) / len(success_rates)
                print("success rate overall: {}%".format(success_ave))
Example #4
    def __init__(self, sess, batch_size=1,
                 lr_stage1=100, lr_stage2=0.1,
                 num_iter_stage1=1000, num_iter_stage2=4000,
                 th=None, psd_max_ori=None):

        self.sess = sess
        self.num_iter_stage1 = num_iter_stage1
        self.num_iter_stage2 = num_iter_stage2
        self.batch_size = batch_size
        self.lr_stage1 = lr_stage1

        tf.set_random_seed(1234)
        params = model_registry.GetParams('asr.librispeech.Librispeech960Wpm',
                                          'Test')
        params.random_seed = 1234
        params.is_eval = True
        params.cluster.worker.gpus_per_replica = 1
        cluster = cluster_factory.Cluster(params.cluster)
        with cluster, tf.device(cluster.GetPlacer()):
            model = params.cls(params)
            self.delta_large = tf.Variable(
                np.zeros((batch_size, FLAGS.max_length_dataset),
                         dtype=np.float32),
                name='qq_delta')

            # placeholders
            self.input_tf = tf.placeholder(tf.float32,
                                           shape=[batch_size, None],
                                           name='qq_input')
            self.tgt_tf = tf.placeholder(tf.string)
            self.sample_rate_tf = tf.placeholder(tf.int32,
                                                 name='qq_sample_rate')
            self.th = tf.placeholder(tf.float32,
                                     shape=[batch_size, None, None],
                                     name='qq_th')
            self.psd_max_ori = tf.placeholder(tf.float32,
                                              shape=[batch_size],
                                              name='qq_psd')
            self.mask = tf.placeholder(dtype=np.float32,
                                       shape=[batch_size, None],
                                       name='qq_mask')
            self.mask_freq = tf.placeholder(dtype=np.float32,
                                            shape=[batch_size, None, 80])
            self.noise = tf.placeholder(np.float32,
                                        shape=[batch_size, None],
                                        name="qq_noise")
            self.maxlen = tf.placeholder(np.int32)
            self.lr_stage2 = tf.placeholder(np.float32)

            # variables
            self.rescale = tf.Variable(np.ones((batch_size, 1),
                                               dtype=np.float32),
                                       name='qq_rescale')
            self.alpha = tf.Variable(np.ones((batch_size),
                                             dtype=np.float32) * 0.05,
                                     name='qq_alpha')

            # extract the delta
            self.delta = tf.slice(tf.identity(self.delta_large), [0, 0],
                                  [batch_size, self.maxlen])
            self.apply_delta = tf.clip_by_value(
                self.delta, -FLAGS.initial_bound,
                FLAGS.initial_bound) * self.rescale
            self.new_input = self.apply_delta * self.mask + self.input_tf
            self.pass_in = tf.clip_by_value(self.new_input + self.noise,
                                            -2**15, 2**15 - 1)

            # generate the inputs that are needed for the lingvo model
            self.features = create_features(self.pass_in, self.sample_rate_tf,
                                            self.mask_freq)
            self.inputs = create_inputs(model, self.features, self.tgt_tf,
                                        self.batch_size, self.mask_freq)

            task = model.GetTask()
            metrics = task.FPropDefaultTheta(self.inputs)
            # self.celoss with the shape (batch_size)
            self.celoss = tf.get_collection("per_loss")[0]
            self.decoded = task.Decode(self.inputs)

        # compute the loss for the masking threshold
        self.loss_th_list = []
        self.transform = Transform(FLAGS.window_size)
        for i in range(self.batch_size):
            logits_delta = self.transform(self.apply_delta[i, :],
                                          self.psd_max_ori[i])
            loss_th = tf.reduce_mean(tf.nn.relu(logits_delta - self.th[i]))
            loss_th = tf.expand_dims(loss_th, axis=0)
            self.loss_th_list.append(loss_th)
        self.loss_th = tf.concat(self.loss_th_list, axis=0)

        self.optimizer1 = tf.train.AdamOptimizer(self.lr_stage1)
        self.optimizer2 = tf.train.AdamOptimizer(self.lr_stage2)

        grad1, var1 = self.optimizer1.compute_gradients(
            self.celoss, [self.delta_large])[0]
        grad21, var21 = self.optimizer2.compute_gradients(
            self.celoss, [self.delta_large])[0]
        grad22, var22 = self.optimizer2.compute_gradients(
            self.alpha * self.loss_th, [self.delta_large])[0]

        # stage 1 updates delta with the sign of the cross-entropy gradient;
        # stage 2 jointly minimizes the cross-entropy and masking-threshold losses
        self.train1 = self.optimizer1.apply_gradients([(tf.sign(grad1), var1)])
        self.train21 = self.optimizer2.apply_gradients([(grad21, var21)])
        self.train22 = self.optimizer2.apply_gradients([(grad22, var22)])
        self.train2 = tf.group(self.train21, self.train22)

def main():

    checkpoint = "./model/ckpt-00908156"

    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('--dirs',
                        type=str,
                        nargs='+',
                        required=True,
                        help='Filepath of original input audio')

    args = parser.parse_args()
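    # clear our parsed arguments from sys.argv, presumably so they do not
    # interfere with the TF/lingvo flag parsing that happens below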
    while len(sys.argv) > 1:
        sys.argv.pop()

    with tf.device("/gpu:0"):
        tf.set_random_seed(1234)
        tfconf = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=tfconf) as sess:
            params = model_registry.GetParams(
                'asr.librispeech.Librispeech960Wpm', 'Test')
            params.cluster.worker.gpus_per_replica = 1
            cluster = cluster_factory.Cluster(params.cluster)
            with cluster, tf.device(cluster.GetPlacer()):
                params.vn.global_vn = False
                params.random_seed = 1234
                params.is_eval = True
                model = params.cls(params)
                task = model.GetTask()
                saver = tf.train.Saver()
                saver.restore(sess, checkpoint)

                input_tf = tf.placeholder(tf.float32, shape=[1, None])
                tgt_tf = tf.placeholder(tf.string)
                sample_rate_tf = tf.placeholder(tf.int32)
                mask_tf = tf.placeholder(tf.float32, shape=[1, None, 80])

                features = create_features(input_tf, sample_rate_tf, mask_tf)
                shape = tf.shape(features)
                inputs = create_inputs(model, features, tgt_tf, 1, mask_tf)

                metrics = task.FPropDefaultTheta(inputs)
                loss = tf.get_collection("per_loss")[0]

                # prediction
                decoded_outputs = task.Decode(inputs)
                dec_metrics_dict = task.CreateDecoderMetrics()

                for audio_dir in args.dirs:
                    file_names = _get_file_names(audio_dir)
                    transcriptions = {}
                    for fidx, file_name in enumerate(file_names):
                        audios_np, sample_rate, tgt_np, mask_freq = _decode_audio(
                            audio_dir, file_name)

                        feed_dict = {
                            input_tf: audios_np,
                            sample_rate_tf: sample_rate,
                            tgt_tf: tgt_np,
                            mask_tf: mask_freq
                        }

                        try:
                            losses = sess.run(loss, feed_dict)
                            predictions = sess.run(decoded_outputs, feed_dict)
                        except Exception:
                            print("Error in transcribing: ", file_name)
                            continue

                        task.PostProcessDecodeOut(predictions,
                                                  dec_metrics_dict)
                        wer_value = dec_metrics_dict['wer'].value * 100.
                        transcriptions[file_name] = predictions[
                            'topk_decoded'][0, 0].lower()

                        print(
                            fidx, "pred-{},{} : {}".format(
                                audio_dir, file_name,
                                predictions['topk_decoded'][0, 0]))

                    with open(join(audio_dir, "transcriptions.json"),
                              'w') as f:
                        f.write(json.dumps(transcriptions))
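
This main() transcribes every audio file found in each directory passed via --dirs and writes a transcriptions.json into that directory. A hypothetical invocation (script and directory names are placeholders):

python transcribe_dirs.py --dirs ./audio_set_a ./audio_set_b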