def test_sfb(self):
    '''Round-trip a waveform through the analysis and synthesis filter banks.

    The waveform is decomposed with ``analyfiltbank``, rebuilt with
    ``synthfiltbank``, and a 50-sample window of the rebuilt signal is
    compared with the same window of the original signal.
    '''
    with self.cached_session(use_gpu=False, force_gpu=False):
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)

        power, phase = py_x_ops.analyfiltbank(waveform, rate)
        logging.info('Shape of power_spc: {}'.format(power.eval().shape))
        logging.info('Shape of phase_spc: {}'.format(phase.eval().shape))

        recovered = py_x_ops.synthfiltbank(power.eval(), phase.eval(), rate)
        self.assertEqual(tf.rank(recovered).eval(), 1)
        logging.info('Shape of recovered signal: {}'.format(
            recovered.eval().shape))

        # The beginning 400 samples differ because of overlap-and-add, so the
        # comparison uses a window past that region.
        window = slice(500, 550)
        self.assertAllClose(
            recovered.eval().flatten()[window],
            waveform[window],
            rtol=1e-4,
            atol=1e-4)
def test_afb(self):
    '''Check the first 50 flattened analysis-filter-bank outputs.

    Both outputs of ``analyfiltbank`` (power and phase spectrum) must be
    rank 1 and their first 50 flattened values must match the references.
    '''
    # NOTE(review): this file contains more than one ``test_afb``; if they
    # share a class, only the last definition is collected -- confirm.
    expected_power = np.array([
        0.000421823002, 0.000014681223, 0.000088715387, 0.000011405386,
        0.000029108920, 0.000016433882, 0.000009128947, 0.000016150383,
        0.000068095047, 0.000016092306, 0.000088840192, 0.000021255839,
        0.000033152886, 0.000005644561, 0.000012678992, 0.000009685464,
        0.000022561202, 0.000004176219, 0.000032476772, 0.000063007421,
        0.000001721088, 0.000003773108, 0.000012991571, 0.000006143227,
        0.000005361593, 0.000019796202, 0.000012828057, 0.000040009807,
        0.000009260243, 0.000060815764, 0.000036184814, 0.000018079394,
        0.000004533325, 0.000008295409, 0.000033129665, 0.000022150667,
        0.000020058087, 0.000000962711, 0.000017114238, 0.000007549510,
        0.000023227087, 0.000037615722, 0.000007189777, 0.000006701076,
        0.000016871410, 0.000018671506, 0.000006927207, 0.000004177695,
        0.000005777914, 0.000002745287
    ])
    expected_phase = np.array([
        3.141592741013, 0.017522372305, 2.614648103714, 1.024240016937,
        -0.082203239202, 0.177630946040, -0.947744905949, 1.557014584541,
        -2.254315614700, -0.327101945877, -2.747241020203, -1.865882754326,
        -2.847117424011, -0.581349492073, -3.014511823654, 2.957268953323,
        1.846585988998, -1.926323652267, -2.718185901642, -2.704042911530,
        -0.473446547985, -2.938575029373, 2.915200233459, -1.540565252304,
        -3.052149772644, 2.665060997009, -2.724275827408, -2.989539623260,
        -2.875509977341, -2.549245357513, 2.585565090179, 1.503721714020,
        1.570051312447, 1.980712175369, 2.068141937256, -1.657162785530,
        2.774835824966, -1.669888973236, -2.816159725189, 3.112393617630,
        -0.539753019810, 2.466773271561, 2.961024999619, -1.002810001373,
        2.275165081024, -2.257984638214, -2.611628055573, -2.753412723541,
        -2.071642875671, -2.972373962402
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        power, phase = py_x_ops.analyfiltbank(waveform, rate)

        self.assertEqual(tf.rank(power).eval(), 1)
        self.assertEqual(tf.rank(phase).eval(), 1)

        power_head = power.eval().flatten()[:50]
        self.assertAllClose(power_head, expected_power)
        phase_head = phase.eval().flatten()[:50]
        self.assertAllClose(phase_head, expected_phase)
def test_plp(self):
    '''Compare a 5x5 patch of the PLP op output with reference values.

    The output must be rank 2; rows 50-54 and columns 5-9 are checked.
    '''
    # NOTE(review): this file contains more than one ``test_plp``; if they
    # share a class, only the last definition is collected -- confirm.
    expected_patch = np.array([
        [-0.209490, -0.326126, 0.010536, -0.027167, -0.117118],
        [-0.020293, -0.454695, -0.104243, 0.001560, -0.234854],
        [-0.015118, -0.444044, -0.156695, -0.086221, -0.319310],
        [-0.031856, -0.130708, 0.047435, -0.089916, -0.160247],
        [0.052763, -0.271487, 0.011329, 0.025320, 0.012851],
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        plp = py_x_ops.plp(waveform, rate)

        self.assertEqual(tf.rank(plp).eval(), 2)
        logging.info('Shape of PLP: {}'.format(plp.shape))

        patch = plp.eval()[50:55, 5:10]
        self.assertAllClose(patch, expected_patch)
def test_cepstrum(self):
    '''Compare a 5x5 patch of the cepstrum op output with reference values.

    The output must be rank 2; rows 15-19 and columns 7-11 are checked.
    '''
    expected_patch = np.array([
        [0.525808, 0.579537, 0.159656, 0.014726, -0.1866810],
        [0.225988, 1.557304, 3.381828, 0.132935, 0.7128600],
        [-1.832759, -1.045178, 0.753158, 0.116107, -0.9307780],
        [-0.696277, 1.333355, 1.590942, 2.041829, -0.0805630],
        [-0.377375, 2.984320, 0.036302, 3.676640, 1.1709290],
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        cepstrum = py_x_ops.cepstrum(waveform, rate)

        self.assertEqual(tf.rank(cepstrum).eval(), 2)
        logging.info('Shape of cepstrum: {}'.format(cepstrum.shape))

        patch = cepstrum.eval()[15:20, 7:12]
        self.assertAllClose(patch, expected_patch)
def test_spectrum(self):
    '''Compare a 5x5 patch of the spectrum op output with reference values.

    The output must be rank 2; rows 4-8 and columns 4-8 are checked.
    '''
    # NOTE(review): this file contains more than one ``test_spectrum``; if
    # they share a class, only the last definition is collected -- confirm.
    expected_patch = np.array([
        [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
        [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
        [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
        [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
        [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        spectrum = py_x_ops.spectrum(waveform, rate)

        self.assertEqual(tf.rank(spectrum).eval(), 2)
        logging.info('Shape of spectrum: {}'.format(spectrum.shape))

        patch = spectrum.eval()[4:9, 4:9]
        self.assertAllClose(patch, expected_patch)
def test_frmpow(self):
    '''Check the first 50 flattened values of the frame_pow op.

    The output must be rank 1 and match the reference values.
    '''
    expected_head = np.array([
        0.000018, 0.000011, 0.000010, 0.000010, 0.000010, 0.000010,
        0.000008, 0.000009, 0.000009, 0.000009, 0.000009, 0.000011,
        0.090164, 0.133028, 0.156547, 0.053551, 0.056670, 0.097706,
        0.405659, 2.119505, 4.296845, 6.139090, 6.623638, 6.136467,
        7.595072, 7.904415, 7.655983, 6.771016, 5.706427, 4.220942,
        3.259599, 2.218259, 1.911394, 2.234246, 3.056905, 2.534153,
        0.464354, 0.013493, 0.021231, 0.148362, 0.364829, 0.627266,
        0.494912, 0.366029, 0.315408, 0.312441, 0.323796, 0.267505,
        0.152856, 0.045305
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        frame_power = py_x_ops.frame_pow(waveform, rate)

        self.assertEqual(tf.rank(frame_power).eval(), 1)

        head = frame_power.eval().flatten()[:50]
        self.assertAllClose(head, expected_head)
def test_pitch(self):
    '''Check the first 50 flattened values of the pitch op.

    The output must be rank 1 and match the reference values.
    '''
    expected_head = np.array([
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
        122.823532, 117.647057, 116.788322, 116.788322, 119.402985,
        119.402985, 119.402985, 119.402985, 119.402985, 123.076920,
        124.031006, 125.000000, 132.065216, 139.130432, 139.130432,
        137.931030, 126.108368, 114.285713, 115.107910, 122.070084,
        129.032257, 130.081299, 130.081299, 129.032257, 130.081299,
        131.147537, 129.032257, 125.000000, 120.300751, 115.107910
    ])

    with self.session():
        # Read the wave file, then run the op under test.
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        pitch = py_x_ops.pitch(waveform, rate)

        self.assertEqual(tf.rank(pitch).eval(), 1)

        head = pitch.eval().flatten()[:50]
        self.assertAllClose(head, expected_head)
def test_plp(self):
    '''Check the first 50 flattened values of the PLP op.

    The output must be rank 1 and match the reference values.
    '''
    # NOTE(review): this file contains more than one ``test_plp``; if they
    # share a class, only the last definition is collected -- confirm.
    expected_head = np.array([
        -0.000000, -0.959257, -0.095592, -0.219479, -0.104977,
        -0.185207, -0.153651, -0.081711, -0.156977, -0.072177,
        0.077400, 0.027594, 0.040156, -0.000000, -0.956464,
        -0.086729, -0.211084, -0.062403, -0.212304, -0.240348,
        -0.081032, -0.036527, -0.071906, 0.025969, 0.004119,
        0.003473, -0.000000, -0.952486, -0.094521, -0.143834,
        -0.133079, -0.244882, -0.175419, -0.040801, -0.071001,
        -0.134758, 0.061415, 0.085666, 0.012909, -0.000000,
        -0.928211, -0.108592, -0.249340, -0.141225, -0.199109,
        -0.081247, -0.044329, -0.140386, -0.174557, -0.045552
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        plp = py_x_ops.plp(waveform, rate)

        self.assertEqual(tf.rank(plp).eval(), 1)

        head = plp.eval().flatten()[:50]
        self.assertAllClose(head, expected_head)
def test_spectrum(self):
    '''Check the first 50 flattened values of the spectrum op.

    The output must be rank 1 and match the reference values.
    '''
    # NOTE(review): this file contains more than one ``test_spectrum``; if
    # they share a class, only the last definition is collected -- confirm.
    expected_head = np.array([
        -16.018925, -16.491777, -16.903442, -18.108875, -19.477205,
        -19.039738, -17.066263, -16.530647, -16.033670, -15.492795,
        -15.347169, -16.443783, -15.385968, -15.631793, -16.286760,
        -16.555447, -15.107640, -15.158586, -16.397518, -14.803325,
        -15.173873, -15.785010, -15.551179, -15.487743, -15.732930,
        -15.610220, -15.314099, -14.765355, -14.572725, -13.482535,
        -13.463938, -14.457010, -16.253452, -15.444997, -13.472414,
        -12.852523, -13.163157, -13.957175, -14.148843, -13.527264,
        -12.840333, -13.056757, -14.582790, -13.900843, -13.864534,
        -14.037180, -15.386706, -16.500109, -16.309618, -13.585808
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        spectrum = py_x_ops.spectrum(waveform, rate)

        self.assertEqual(tf.rank(spectrum).eval(), 1)

        head = spectrum.eval().flatten()[:50]
        self.assertAllClose(head, expected_head)
def test_zcr(self):
    '''Check the first 50 flattened values of the zero-crossing-rate op.

    The output must be rank 1 and match the reference values.
    '''
    expected_head = np.array([
        0.406250, 0.418750, 0.425000, 0.407500, 0.393750,
        0.392500, 0.388750, 0.417500, 0.427500, 0.456250,
        0.447500, 0.386250, 0.357500, 0.282500, 0.232500,
        0.262500, 0.282500, 0.295000, 0.220000, 0.157500,
        0.125000, 0.107500, 0.100000, 0.092500, 0.092500,
        0.095000, 0.097500, 0.105000, 0.100000, 0.112500,
        0.120000, 0.132500, 0.130000, 0.135000, 0.112500,
        0.120000, 0.090000, 0.080000, 0.070000, 0.080000,
        0.087500, 0.092500, 0.097500, 0.097500, 0.112500,
        0.090000, 0.065000, 0.087500, 0.175000, 0.240000
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        zcr = py_x_ops.zcr(waveform, rate)

        self.assertEqual(tf.rank(zcr).eval(), 1)
        logging.info('Shape of zero-cross-rate: {}'.format(zcr.eval().shape))

        head = zcr.eval().flatten()[:50]
        self.assertAllClose(head, expected_head)
def test_spectrum(self):
    '''Compare a 5x5 patch of the spectrum op output with reference values.

    Runs in a CPU-only cached session, checks that the loaded sample rate is
    16000, that the op output is rank 2, and that rows 4-8 / columns 4-8
    match the references.
    '''
    # NOTE(review): this file contains more than one ``test_spectrum``; if
    # they share a class, only the last definition is collected -- confirm.
    #pylint: disable=bad-whitespace
    expected_patch = np.array([
        [-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
        [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
        [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
        [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
        [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730],
    ])
    #pylint: enable=bad-whitespace

    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, input_data = feat_lib.load_wav(self.wavpath, sr=16000)
        logging.info(
            f"input shape: {input_data.shape}, sample rate dtype: {sample_rate.dtype}"
        )
        self.assertEqual(sample_rate, 16000)

        spectrum = py_x_ops.spectrum(input_data, sample_rate)
        self.assertEqual(tf.rank(spectrum).eval(), 2)
        logging.info('Shape of spectrum: {}'.format(spectrum.shape))

        patch = spectrum.eval()[4:9, 4:9]
        self.assertAllClose(patch, expected_patch)
def test_afb(self):
    '''Compare 10x10 patches of the analysis-filter-bank outputs.

    Both outputs of ``analyfiltbank`` (power and phase spectrum) must be
    rank 2; the top-left 10x10 patch of each transposed output is compared
    with the reference values.
    '''
    # NOTE(review): this file contains more than one ``test_afb``; if they
    # share a class, only the last definition is collected -- confirm.
    expected_power = np.array([
        [
            4.2182300e-04, 3.6964193e-04, 3.9906241e-05, 2.8196722e-05,
            3.3976138e-04, 3.7671626e-04, 2.2727624e-04, 7.2495081e-05,
            4.3451786e-05, 3.4654513e-06
        ],
        [
            1.4681223e-05, 2.8831255e-05, 3.5616580e-05, 3.9359711e-05,
            1.2714787e-04, 1.2794189e-04, 3.6509471e-05, 1.7578101e-05,
            5.9672035e-05, 2.9785692e-06
        ],
        [
            8.8715387e-05, 6.0998322e-05, 2.7695101e-05, 1.6866413e-04,
            4.6845453e-05, 3.3532990e-05, 5.7005627e-06, 5.1852752e-05,
            1.8390550e-05, 8.3459439e-05
        ],
        [
            1.1405386e-05, 1.8942148e-06, 1.6338145e-06, 1.8362705e-05,
            8.4106450e-06, 4.4174294e-06, 3.6533682e-05, 5.0541588e-05,
            1.6701326e-06, 1.8736981e-05
        ],
        [
            2.9108920e-05, 1.6862698e-05, 3.3437627e-05, 6.9332527e-05,
            5.0028186e-05, 5.9426224e-05, 2.1895030e-06, 2.3780794e-06,
            4.7786685e-05, 7.3811811e-05
        ],
        [
            1.6433882e-05, 9.5777386e-07, 2.0980822e-06, 4.8990279e-07,
            1.4232077e-05, 1.5986938e-05, 2.9042780e-05, 1.1719906e-05,
            2.4548817e-06, 5.3594176e-06
        ],
        [
            9.1289467e-06, 9.4249899e-06, 7.4781286e-07, 1.8923520e-05,
            6.5740237e-06, 4.3209452e-06, 3.9396346e-06, 1.2287317e-05,
            4.6807354e-06, 5.8512210e-06
        ],
        [
            1.6150383e-05, 2.6649790e-05, 1.8610657e-05, 2.2872716e-06,
            1.4209920e-05, 2.3279742e-06, 6.6038615e-06, 2.6169775e-05,
            2.8335158e-05, 1.7595910e-06
        ],
        [
            6.8095047e-05, 9.1859045e-05, 2.6713702e-05, 3.0580850e-05,
            1.4539381e-05, 4.2510033e-05, 2.2579852e-05, 1.4843822e-05,
            2.0883192e-05, 6.0624756e-05
        ],
        [
            1.6092306e-05, 1.4245335e-05, 2.4250150e-05, 6.0177539e-05,
            6.7926321e-06, 3.4922948e-07, 2.1843030e-06, 8.5554876e-07,
            2.6831965e-06, 2.0012436e-05
        ],
    ])
    expected_phase = np.array([
        [
            3.1415927, 3.1415927, 3.1415927, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            3.1415927
        ],
        [
            0.01752237, 1.6688037, 1.4971976, 1.4470094, 2.0516894,
            -2.3112175, -0.7115377, 2.9614341, -1.2494497, -0.7055688
        ],
        [
            2.614648, 0.63351387, -2.0660093, 1.7626916, -1.1257634,
            3.017448, -2.892095, -1.2209401, 1.7407895, -1.0281658
        ],
        [
            1.02424, -1.8967879, -0.6139833, 2.587602, 3.0070715,
            1.5781559, -1.899145, -1.1459525, -0.24284656, -0.8106653
        ],
        [
            -0.08220324, 0.5497215, 1.7031444, -2.8960562, -1.3680246,
            0.4349923, 2.0676146, 1.2389332, 2.6312854, -1.7511902
        ],
        [
            0.17763095, 2.7475302, -0.20671827, 1.0719725, -2.388657,
            1.189566, -1.0643665, 2.5955305, -0.69036585, -0.5287417
        ],
        [
            -0.9477449, -2.7059674, 0.53469753, 1.9289348, 0.24833842,
            0.03517391, -1.4778724, -0.16577117, -1.7509687, -0.46875867
        ],
        [
            1.5570146, -2.9596932, -0.7975963, 3.0060582, -1.038453,
            0.14911443, -1.5873562, 0.7229206, 2.679422, -1.1890441
        ],
        [
            -2.2543156, 0.47845784, -2.8412538, -0.5494534, 1.6583048,
            -1.4567885, 1.0724461, -2.70243, -0.2690962, 1.8831034
        ],
        [
            -0.32710192, 0.01503609, 0.29720783, -0.7409194, -2.183623,
            2.3637679, 0.6405145, 1.4975713, 0.18241015, 2.2659144
        ],
    ])

    with self.session():
        rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        power, phase = py_x_ops.analyfiltbank(waveform, rate)

        self.assertEqual(tf.rank(power).eval(), 2)
        self.assertEqual(tf.rank(phase).eval(), 2)
        logging.info('power_spc shape: {}'.format(power.shape))
        logging.info('phase_spc shape: {}'.format(phase.shape))

        power_patch = power.eval().transpose()[:10, :10]
        self.assertAllClose(power_patch, expected_power)
        phase_patch = phase.eval().transpose()[:10, :10]
        self.assertAllClose(phase_patch, expected_phase)
def generate_data(self):
    '''Generate training examples, one shuffled source file at a time.

    Increments the epoch counter, shuffles ``self.data_items`` in place and,
    for every ``(filename, examples)`` entry, builds the model input for each
    ``(label, seg, clip_id)`` example:

    * when ``self._file_suffix`` is ``'.wav'`` the input is a slice of the raw
      waveform returned by ``feat_lib.load_wav``;
    * otherwise the input is a slice of the features loaded with ``np.load``
      and expanded with delta features, with ``seg`` first converted through
      ``self.sample_to_frame``.

    ``seg`` is used as ``(start, end, pad)``: a non-zero ``pad`` zero-pads the
    end of the input before ``[start:end]`` is taken.

    When text is enabled only the clip with ``clip_id == 0`` of each file is
    emitted; otherwise every clip is emitted.  All examples of one file are
    built before the first of them is yielded.

    Yields:
        Tuples ``(inputs, text_ids, label_id, filename, clip_id, soft_label)``.

    Raises:
        AssertionError: if a sliced example does not have the expected length.
    '''
    use_text = self.taskconf['text']['enable']
    self._epoch += 1  # epochs are counted from 1

    def make_soft_label(inputs):
        # Teacher output when distilling, otherwise an all-zero placeholder
        # with one entry per class.
        if self.use_distilling:
            return self.teacher(inputs)
        return [0] * self.taskconf['classes']['num']

    np.random.shuffle(self.data_items)
    for filename, examples in self.data_items:
        # Convert the transcript to ids, or use an all-zero placeholder.
        if use_text:
            text = _load_text('.'.join(filename.split('.')[:-1]))
            text2id = self._word_table_lookup(text)
        else:
            text2id = np.array([0] * self._max_text_len)

        batch = []
        if self._file_suffix == '.wav':
            _, raw_samples = feat_lib.load_wav(filename)
            for label, seg, clip_id in examples:
                samples = raw_samples
                if seg[2]:
                    samples = np.pad(samples, [0, seg[2]], mode='constant')
                samples = samples[seg[0]:seg[1]]
                assert len(samples) == self.example_len, "{} {}".format(
                    filename, seg)

                labelid = self.class_id(label)
                # The original passed ``feat`` here, which is never assigned
                # on the wav path and therefore raised UnboundLocalError.
                # NOTE(review): confirm the teacher accepts raw samples.
                soft_label = make_soft_label(samples)

                # With text enabled only the first clip of a file is kept.
                if not use_text or clip_id == 0:
                    batch.append((samples, text2id, labelid, filename,
                                  clip_id, soft_label))
        else:
            feat = np.load(filename)

            # shape : [nframe, feat_size, 3]
            if self._feature_type:
                fbank = feat_lib.add_delta_delta(
                    feat, self._feature_size, order=2)
                if self._input_channels == 1:
                    fbank = fbank[:, :, 0:1]
            else:
                fbank = feat_lib.delta_delta(feat)

            for label, seg, clip_id in examples:
                feat = fbank
                seg = list(map(self.sample_to_frame, seg))
                if seg[2]:
                    # need padding
                    feat = np.pad(
                        feat, [(0, seg[2]), (0, 0), (0, 0)], mode='constant')
                feat = feat[seg[0]:seg[1], :, :]
                assert len(feat) == self.sample_to_frame(
                    self.example_len), "{} {} {} {} {} {}".format(
                        filename, seg, len(feat), self.example_len,
                        self.sample_to_frame(self.example_len), seg[2])

                soft_label = make_soft_label(feat)
                # convert string label to int label
                labelid = self.class_id(label)

                # With text enabled only the first clip of a file is kept.
                if not use_text or clip_id == 0:
                    batch.append((feat, text2id, labelid, filename, clip_id,
                                  soft_label))

        for example in batch:
            yield example

    logging.info("Out of range")
    # Falling off the end finishes the generator.  An explicit
    # ``raise StopIteration`` here is turned into RuntimeError by PEP 479
    # (Python 3.7+), so the generator simply returns.