def _create_and_save_stats(self, mat, output):
    # tmp_mean = np.mean(mat, axis=0)
    # tmp_std = np.std(mat, axis=0)

    # get global mean and std from the per-job rows [n, mean, var]
    num_samples = np.sum(mat[:, 0])
    dim = int((mat.shape[1] - 1) / 2)

    # global mean
    tmp_mean = np.sum((np.expand_dims(mat[:, 0], 1) * mat[:, 1:(dim + 1)]),
                      axis=0) / num_samples

    # global var
    tmp_std = np.sum(
        np.expand_dims(mat[:, 0], 1) *
        (mat[:, (dim + 1):] + np.square(mat[:, 1:(dim + 1)] - tmp_mean)),
        axis=0) / num_samples
    tmp_std = np.sqrt(tmp_std)

    # saving to stats matrix
    stats_dict = {
        'mean': np.expand_dims(tmp_mean, 1),
        'std': np.expand_dims(tmp_std, 1)
    }

    # change path
    p = Path(output)
    print(p.parent)
    with open(str(p.parent) + '/stats.mat', 'wb') as f:
        for key, mat in list(stats_dict.items()):
            kaldi_io.write_mat(f, mat.astype(np.float32, copy=False), key=key)

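# A minimal, self-contained sketch (an assumption, not part of the original class)
# showing that the pooling used above -- a weighted mean of the per-job means plus the
# law-of-total-variance combination of the per-job variances -- reproduces the global
# mean/std of the concatenated data. The per-job rows follow the [n, mean, var] layout
# produced by concat_data further below; names here are purely illustrative.
import numpy as np

def _check_pooled_stats():
    chunks = [np.random.randn(100, 5), np.random.randn(50, 5) + 1.0]
    dim = chunks[0].shape[1]

    stats = np.zeros([len(chunks), 2 * dim + 1])
    for i, c in enumerate(chunks):
        stats[i, 0] = c.shape[0]                    # number of samples
        stats[i, 1:(dim + 1)] = np.mean(c, axis=0)  # per-chunk mean
        stats[i, (dim + 1):] = np.var(c, axis=0)    # per-chunk variance

    n = np.sum(stats[:, 0])
    global_mean = np.sum(stats[:, [0]] * stats[:, 1:(dim + 1)], axis=0) / n
    global_var = np.sum(stats[:, [0]] * (stats[:, (dim + 1):] +
                        np.square(stats[:, 1:(dim + 1)] - global_mean)), axis=0) / n

    all_data = np.concatenate(chunks)
    assert np.allclose(global_mean, np.mean(all_data, axis=0))
    assert np.allclose(np.sqrt(global_var), np.std(all_data, axis=0))
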
def _save_weights(self):
    weights_dict = {'weights': self._weights}
    with open('weights_tmp.mat', 'wb') as f:
        for key, mat in list(weights_dict.items()):
            kaldi_io.write_mat(f, mat.astype(np.float32, copy=False), key=key)

def create_dataset(self, nj, frac, path_data, output_folder):
    dataset = DataIterator(nj, path_data)
    data = []
    misc = Misc()
    count_size = 0
    while True:
        try:
            data_path = dataset.next_file()
            print(data_path)
            for key, mat in kaldi_io.read_mat_ark(data_path):
                df_mat = pd.DataFrame(mat)
                np_mat = df_mat.sample(frac=frac).values
                # np_mat[:, 39] = misc.trans_vec_to_phones(np_mat[:, 39])
                data.append(np_mat)
        except StopIteration:
            data_sample = np.concatenate(data)
            print(data_sample.shape)
            data_dict = {}
            data_dict['data'] = data_sample
            with open(output_folder + '/dataset.mat', 'wb') as f:
                for key, mat in list(data_dict.items()):
                    kaldi_io.write_mat(f, mat.astype(np.float32, copy=False), key=key)
            break

def save_codebook(self, path):
    if not self._kaldi_formatting:
        raise TypeError

    # prepare codebook for saving
    path_new = str.split(path, '.')
    assert len(self._dict_codebook) > 0

    if len(self._dict_codebook) > 1:
        # prepare codebook for multiple vqs
        self.codebook = np.zeros([self._num_cluster, 39])  # 39 is the feature dimension
        keys = ['energy', 'raw', 'delta', 'dd']
        dict_indicies = {
            'energy': [0, 13, 26],
            'raw': range(1, 13, 1),
            'delta': range(14, 26, 1),
            'dd': range(27, 39, 1)
        }
        for key in keys:
            self.codebook[:, dict_indicies[key]] = self._dict_codebook[key].cluster_centers_
        path = path_new[0] + '_multiple.' + path_new[1]
    else:
        self.codebook = self._dict_codebook['simple'].cluster_centers_
        path = path_new[0] + '_single.' + path_new[1]

    with open(path, 'wb') as f:
        # print(self.codebook)
        kaldi_io.write_mat(f, self.codebook, key='cb')

def compute_wav_path(wav, feat_scp, feat_ark, utt2dur, utt2num_frames):
    feat, duration = Make_Spect(wav_path=wav[1], windowsize=0.02, stride=0.01, duration=True)
    # np_fbank = Make_Fbank(filename=uid2path[uid], use_energy=True, nfilt=c.TDNN_FBANK_FILTER)

    len_vec = len(feat.tobytes())
    key = wav[0]

    kaldi_io.write_mat(feat_ark, feat, key=key)
    # the scp offset points at the start of the binary record: kaldi_io.write_mat emits
    # 15 header bytes after the key for a float32 matrix ('\0B', 'FM ' and the two
    # size-prefixed int32 dimensions) before the raw data of len_vec bytes
    feat_scp.write(str(key) + ' ' + str(feat_ark.name) + ':' +
                   str(feat_ark.tell() - len_vec - 15) + '\n')

    utt2dur.write('%s %.6f\n' % (str(key), duration))
    utt2num_frames.write('%s %d\n' % (str(key), len(feat)))

def MakeFeatsProcess(out_dir, proid, t_queue, e_queue):
    # wav_scp = os.path.join(data_path, 'wav.scp')
    feat_scp = os.path.join(out_dir, 'feat.%d.scp' % proid)
    feat_ark = os.path.join(out_dir, 'feat.%d.ark' % proid)
    utt2dur = os.path.join(out_dir, 'utt2dur.%d' % proid)
    utt2num_frames = os.path.join(out_dir, 'utt2num_frames.%d' % proid)

    feat_scp = open(feat_scp, 'w')
    feat_ark = open(feat_ark, 'wb')
    utt2dur = open(utt2dur, 'w')
    utt2num_frames = open(utt2num_frames, 'w')

    while not t_queue.empty():
        wav = t_queue.get()
        pair = wav.split()
        try:
            feat, duration = Make_Spect(wav_path=pair[1], windowsize=0.02, stride=0.01, duration=True)
            # np_fbank = Make_Fbank(filename=uid2path[uid], use_energy=True, nfilt=c.TDNN_FBANK_FILTER)

            len_vec = len(feat.tobytes())
            key = pair[0]

            kaldi_io.write_mat(feat_ark, feat, key=key)
            feat_scp.write(str(key) + ' ' + str(feat_ark.name) + ':' +
                           str(feat_ark.tell() - len_vec - 15) + '\n')
            utt2dur.write('%s %.6f\n' % (str(key), duration))
            utt2num_frames.write('%s %d\n' % (str(key), len(feat)))
        except Exception:
            print("Error: %s" % pair[0])
            e_queue.put(pair[0])

        # if self.queue.qsize() % 1000 == 0:
        print('==> Process %s: %s left' % (str(proid), str(t_queue.qsize())))

    feat_scp.close()
    feat_ark.close()
    utt2dur.close()
    utt2num_frames.close()
    print('>> Process {} finished!'.format(proid))

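# Hedged usage sketch (not from the original repo): drive MakeFeatsProcess with a
# multiprocessing task queue filled from a Kaldi wav.scp ("utt-id wav-path" per line).
# The paths 'data/wav.scp' and 'data/feats', the worker count, and the helper name
# run_feature_extraction are placeholders chosen for illustration.
import multiprocessing as mp

def run_feature_extraction(wav_scp='data/wav.scp', out_dir='data/feats', num_workers=4):
    task_queue = mp.Queue()
    error_queue = mp.Queue()
    with open(wav_scp) as f:
        for line in f:
            task_queue.put(line.strip())

    workers = [mp.Process(target=MakeFeatsProcess,
                          args=(out_dir, proid, task_queue, error_queue))
               for proid in range(num_workers)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()

    # collect utterance ids that failed during extraction
    failed = []
    while not error_queue.empty():
        failed.append(error_queue.get())
    return failed
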
def create_p_s_m(self):
    self._feeder.init_train()
    # set model.train to False to avoid training
    # model.train = False

    while True:
        try:
            feat, labs = self._session.run(
                [self._input_train[0], self._input_train[1]])
            nom_vq, den_vq = self._session.run(
                self._train_dict['data_vq'],
                feed_dict={
                    self._placeholders['ph_train']: False,
                    self._placeholders['ph_features']: feat,
                    self._placeholders['ph_labels']: labs
                })
        except tf.errors.OutOfRangeError:
            # add-delta smoothing of the accumulated counts before normalizing
            nom_vq += Settings.delta
            den_vq += Settings.num_labels * Settings.delta
            prob = nom_vq / den_vq

            # saving matrix with kaldi_io
            save_dict = {'p_s_m': prob}
            print('Saving P(s_k|m_j)')
            with open('p_s_m.mat', 'wb') as f:
                for key, mat in list(save_dict.items()):
                    kaldi_io.write_mat(f, mat, key=key)

            # reset den and nom, set variable
            self._session.run([
                self._misc.reset_variable(self._variables['nominator']),
                self._misc.reset_variable(self._variables['denominator']),
                tf.assign(self._variables['conditioned_probability'], prob)
            ])
            break

def merge_data_phonemes(self, nj, path_data, path_phonemes, output_folder):
    assert type(path_data) == str and type(path_phonemes) == str

    # create Iterators
    dataset = DataIterator(self._nj, path_data, splice=self._splice, cmvn=self._cmvn)
    phonemes = AlignmentIterator(nj, path_phonemes)

    # iterate through data
    count = 1
    tmp_dict = {}
    while True:
        try:
            for (key_data, mat_data), (key_pho, mat_pho) in zip(
                    kaldi_io.read_mat_ark(dataset.next_file()),
                    kaldi_io.read_ali_ark(phonemes.next_file())):
                # check for same key
                if key_data == key_pho:
                    print(key_data)
                    tmp_dict[key_data] = pd.concat(
                        [pd.DataFrame(mat_data), pd.DataFrame(mat_pho)], axis=1)

            with open(output_folder + '/feats_vq_' + str(count), 'wb') as f:
                for key, mat in list(tmp_dict.items()):
                    kaldi_io.write_mat(f, mat.values.astype(np.float32, copy=False), key=key)
            tmp_dict = {}
            count += 1
        except StopIteration:
            break

def write_ark_scp(output_name, kmat_dict: dict):
    ark_scp_output = (f"ark:| copy-feats --compress=true "
                      f"ark:- ark,scp:{output_name}.ark,{output_name}.scp")
    with open_or_fd(ark_scp_output, "wb") as f:
        for k, mat in kmat_dict.items():
            write_mat(f, mat, key=k)

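# Hedged usage example for write_ark_scp: write two random float32 feature matrices
# through Kaldi's copy-feats into a compressed ark/scp pair. This assumes the Kaldi
# binaries are on PATH and that open_or_fd/write_mat were imported from kaldi_io;
# the output prefix 'data/train_feats' is a placeholder.
import numpy as np

feats = {
    'utt1': np.random.randn(200, 40).astype(np.float32),
    'utt2': np.random.randn(150, 40).astype(np.float32),
}
write_ark_scp('data/train_feats', feats)  # creates data/train_feats.ark and .scp
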
cv_feat_keys, cv_feats = featureUtils.read_feats(Config.CV_FEATS)
cv_ali_keys, cv_alis, _ = featureUtils.read_ali_and_compute_prior(
    Config.CV_ALIGNMENTS, None)
utils.verify_order(cv_feat_keys, cv_ali_keys)

eval_loss, eval_ce = dnn.eval(sess, cv_feats, cv_alis)
print('validation loss after model restore:', eval_loss, 'xent:', eval_ce)

print('Forward pass test data')
te_key, te_mat, te_feat_len = featureUtils.read_feats(Config.TEST_FEATS, True)
out = dnn.forwardPass(sess, te_mat)
out = np.array(out)

# loading prior
prior = np.loadtxt(
    '/speech1/DIT_PROJ/srini/PycharmProjects/tfkaldi-fork/prior.npy',
    dtype=np.float32)
out /= prior
out = np.where(out == 0, np.finfo(float).eps, out)
out = np.log(out)

prev = 0
with open('out2.ark', 'wb') as f:
    for ran in range(len(te_key)):
        temp = out[0][prev:prev + te_feat_len[ran]]
        prev = prev + te_feat_len[ran]
        kaldi_io.write_mat(f, temp, key=te_key[ran])

end_time = time.time()
print('seconds', (end_time - begin_time))

def concat_data(self, path_data, path_phonemes, output_folder):
    dataset = DataIterator(self._nj, path_data, splice=self._splice, cmvn=self._cmvn)

    create_stats = True
    if path_data in ['test', 'dev']:
        create_stats = False

    # set dim
    dim = self._dim * (2 * self._splice + 1)

    print('Loading alignment dict')
    alignment_dict = {}
    for key, mat in kaldi_io.read_ali_ark(path_phonemes):
        alignment_dict[key] = mat
    print('Loading done')

    count = 1
    tmp_dict = {}
    # gather_stats: n, mean, var (nj rows)
    gather_stats = np.zeros([self._nj, 2 * dim + 1])
    gather_data = []

    print('Creating filtered training data and merging it with the labels')
    while True:
        try:
            for key, mat in kaldi_io.read_mat_ark(dataset.next_file()):
                # we need to filter the training data because we don't have the alignments
                # for all the training data. Therefore, we have to avoid using this data
                # for training our HMMs
                # TODO Could also work with --> check performance difference
                if key in list(alignment_dict.keys()) and \
                        mat.shape[0] == alignment_dict[key].shape[0]:
                    tmp_dict[key] = pd.concat([
                        pd.DataFrame(mat),
                        pd.DataFrame(alignment_dict[key])
                    ], axis=1)
                    gather_data.append(mat)

            od = collections.OrderedDict(sorted(tmp_dict.items()))

            # write filtered training data and the labels to files
            with open(output_folder + '/feats_vq_' + str(count), 'wb') as f:
                for key, mat in list(od.items()):
                    kaldi_io.write_mat(f, mat.values.astype(np.float32, copy=False), key=key)

            # write the filtered training data
            with open(output_folder + '/features_' + str(count), 'wb') as f:
                for key, mat in list(od.items()):
                    kaldi_io.write_mat(
                        f, mat.values.astype(np.float32, copy=False)[:, :dim], key=key)
            tmp_dict = {}

            # save stats for single file
            tmp_data = np.concatenate(gather_data)
            gather_stats[count - 1, 0] = tmp_data.shape[0]                    # number of samples
            gather_stats[count - 1, 1:(dim + 1)] = np.mean(tmp_data, axis=0)  # mean of file
            gather_stats[count - 1, (dim + 1):] = np.var(tmp_data, axis=0)    # var of file
            # print(gather_stats)
            count += 1
            gather_data = []  # reset gather_data
        except StopIteration:
            if create_stats:
                print('Saving std and mean of data to stats.mat')
                self._create_and_save_stats(gather_stats, output_folder)
            break

def vq_data(self, nj, data_folder, output_folder):
    # VQ the training data
    assert self.codebook.shape[0] > 0
    print('VQing training data...')

    dataset = DataIterator(nj, data_folder)

    keys = []
    dict_vq, dict_indicies = {}, {}
    if self._multiple:
        keys = ['energy', 'raw', 'delta', 'dd']
        dict_indicies = {
            'energy': [0, 13, 26],
            'raw': range(1, 13, 1),
            'delta': range(14, 26, 1),
            'dd': range(27, 39, 1)
        }
    else:
        keys = ['simple']
        dict_indicies = {'simple': range(0, 39)}

    for key in keys:
        dict_vq[key] = self.codebook[:, dict_indicies[key]]

    tmp_dict = {}
    labels_all = []
    phoneme_all = []
    count = 1
    while True:
        try:
            data_path = dataset.next_file()
            print("Data path is in ", data_path)
            for key, mat in kaldi_io.read_mat_ark(data_path):
                if self._multiple:
                    # getting label for every vq
                    df = pd.DataFrame(
                        vq(whiten(mat[:, dict_indicies['energy']]),
                           dict_vq['energy'])[0][:, np.newaxis])
                    df = pd.concat([
                        df,
                        pd.DataFrame(vq(whiten(mat[:, dict_indicies['raw']]),
                                        dict_vq['raw'])[0][:, np.newaxis])
                    ], axis=1)
                    df = pd.concat([
                        df,
                        pd.DataFrame(vq(whiten(mat[:, dict_indicies['delta']]),
                                        dict_vq['delta'])[0][:, np.newaxis])
                    ], axis=1)
                    df = pd.concat([
                        df,
                        pd.DataFrame(vq(whiten(mat[:, dict_indicies['dd']]),
                                        dict_vq['dd'])[0][:, np.newaxis])
                    ], axis=1)
                else:
                    if self._whitening:
                        df = pd.DataFrame(
                            vq(whiten(mat[:, :39]), dict_vq['simple'])[0][:, np.newaxis])
                        labels_all.append(df.values)
                    else:
                        df = pd.DataFrame(
                            vq(mat[:, :39], dict_vq['simple'])[0][:, np.newaxis])
                        labels_all.append(df.values)

                if np.shape(mat)[1] > 39:
                    phoneme_all.append(mat[:, 39])

                # add to tmp_dict for later saving
                tmp_dict[key] = df

            # ordered dict
            od = collections.OrderedDict(sorted(tmp_dict.items()))

            # save label-stream from vq
            with open(output_folder + '/feats_vq_' + str(count), 'wb') as f:
                for key, mat in list(od.items()):
                    kaldi_io.write_mat(f, mat.values.astype(np.float32, copy=False), key=key)
            tmp_dict = {}
            count += 1
        except StopIteration:
            # calc MI
            if False:
                misc = Misc()
                labels_all = np.concatenate(labels_all)
                # labels_all = np.reshape(labels_all, [np.shape(labels_all)[0] * np.shape(labels_all)[1]],
                #                         np.shape(labels_all)[2])
                phoneme_all = np.concatenate(phoneme_all)
                # phoneme_all = np.reshape(phoneme_all, [np.shape(phoneme_all)[0] * np.shape(phoneme_all)[1]],
                #                          np.shape(phoneme_all)[2])
                print(misc.calculate_mi(labels_all, phoneme_all))
            break

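# Small hedged illustration (not from the original code) of the scipy.cluster.vq calls
# used above: vq() returns, per frame, the index of the nearest codebook row, which is
# exactly what vq_data stores as the discrete label stream. The toy data, dimensions
# and cluster count are arbitrary.
import numpy as np
from scipy.cluster.vq import kmeans, vq, whiten

frames = np.random.randn(500, 13)              # toy "feature" frames
codebook, _ = kmeans(whiten(frames), 8)        # 8 cluster centres on whitened data
labels, dists = vq(whiten(frames), codebook)   # labels.shape == (500,), one index per frame
print(labels[:10])
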
def do_inference(self, nj, input_folder, output_folder):
    """
    Does the inference of the model

    :param nj:            number of jobs (how the dataset is split in kaldi)
    :param input_folder:  path to the data folder to do the inference on
    :param output_folder: path to save the output of the inference
    """
    # create DataIterator to iterate through the split folder created by kaldi
    dataset = DataIterator(nj, input_folder, splice=self._splice, cmvn=self._cmvn)
    dim = self._dim * (2 * self._splice + 1)

    # number iterator for counting, necessary for writing the matrices later
    iterator = iter([i for i in range(1, dataset.get_size() + 1)])

    features_all = {}
    phoneme_all = {}
    inferenced_data = {}  # storing the inferenced data
    check_data = {}
    output_all = {}

    while True:
        try:
            data_path = dataset.next_file()  # get path to data
            # print(data_path)

            # iterate through data
            for key, mat in kaldi_io.read_mat_ark(data_path):
                # do inference for one batch
                inferenced_data[key] = self._do_single_inference(mat[:, :dim])
                tmp = self._do_single_inference(mat[:, :dim])
                # check_data[key] = [np.argmax(tmp[0], axis=1), np.argmax(tmp[1], axis=1),
                #                    np.argmax(tmp[2], axis=1), self._dev_alignments[key]]

                # get statistics for mi (only if we input data + labels), for debugging
                if np.shape(mat)[1] > dim:
                    phoneme_all[key] = mat[:, dim]

                # add for debugging, see below
                output_all[key] = tmp

            od = collections.OrderedDict(sorted(inferenced_data.items()))

            # write posteriors (inferenced data) to files
            with open(output_folder + '/feats_vq_' + str(next(iterator)), 'wb') as f:
                for key, mat in list(od.items()):
                    if self.transform:
                        kaldi_io.write_mat(f, mat, key=key)
                    else:
                        kaldi_io.write_mat(f, mat[:, np.newaxis], key=key)
            inferenced_data = {}  # reset dict

        except StopIteration:
            # debugging: commented-out analysis comparing the vanilla, VQ and combined
            # predictions (check_data / output_all) against self._dev_alignments,
            # accumulating per-phoneme right/wrong counts and plotting them as
            # matplotlib bar charts and entropy histograms

            if False:
                misc = Misc()
                features_all = np.concatenate(features_all)
                phoneme_all = np.expand_dims(np.concatenate(phoneme_all), 1)
                phoneme_all = misc.trans_vec_to_phones(phoneme_all)
                # print(misc.calculate_mi(features_all, phoneme_all))

                mi, test_py, test_pw, test_pyw = self._session.run(
                    ["mutual_info:0", "p_y:0", "p_w:0", "p_yw:0"],
                    feed_dict={
                        "is_train:0": False,
                        "ph_features:0": features_all,
                        "ph_labels:0": phoneme_all
                    })
                print(mi)

                tmp_pywtest = pd.DataFrame(test_py)
                tmp_pywtest.to_csv('py_inf.txt', header=False, index=False)
                tmp_pywtest = pd.DataFrame(test_pw)
                tmp_pywtest.to_csv('pw_inf.txt', header=False, index=False)
                tmp_pywtest = pd.DataFrame(test_pyw)
                tmp_pywtest.to_csv('pwy_inf.txt', header=False, index=False)
            break