Esempio n. 1
0
    def _set_probabilities(self, file, class_counts_file='../class.counts'):
        """
        Set P(s_k|m_j) and the prior P(s_k) from training.

        The conditional probabilities are stored transposed in
        ``self.cond_prob``; the prior is the class-count matrix divided by its
        total sum and is stored in ``self.prior``.  If a wanted key is missing
        from its archive a message is printed and the corresponding attribute
        is left untouched.

        :param file: path to P(s_k|m_j) file (kaldi-format, must contain the key 'p_s_m')
        :param class_counts_file: path to the class-counts file (kaldi-format,
            must contain the key 'class_counts'); defaults to '../class.counts'
        """
        # Set P(s_k|m_j)
        found_cond_prob = False
        for key, mat in kaldi_io.read_mat_ark(file):
            if key == 'p_s_m':
                print('Setting P(s_k|m_j)')
                # we transpose for later dot product
                self.cond_prob = np.transpose(mat)
                found_cond_prob = True
        # Report a missing key once, after the whole archive has been scanned,
        # instead of once per unrelated entry.
        if not found_cond_prob:
            print('No probability found')

        # Set prior P(s_k)
        found_prior = False
        for key, mat in kaldi_io.read_mat_ark(class_counts_file):
            if key == 'class_counts':
                print('Setting Prior')
                # normalise the raw counts so that they sum to one
                self.prior = mat / np.sum(mat)
                found_prior = True
        if not found_prior:
            print('No Prior found')
Esempio n. 2
0
    def create_dataset(self, nj, frac, path_data, output_folder):
        """
        Build a sub-sampled data set and store it in kaldi matrix format.

        Every matrix of every file delivered by the DataIterator is
        row-sampled with ``DataFrame.sample(frac=frac)``; all samples are
        stacked and written as float32 under the key 'data' to
        '<output_folder>/dataset.mat'.

        :param nj:            forwarded to DataIterator (presumably the number
                              of kaldi split jobs -- TODO confirm)
        :param frac:          fraction of rows drawn from each matrix
        :param path_data:     forwarded to DataIterator as the data location
        :param output_folder: folder that receives the file 'dataset.mat'
        :raises ValueError:   if not a single matrix was read
        """
        dataset = DataIterator(nj, path_data)

        sampled = []
        while True:
            # Only next_file() signals the end of the data, so it is the only
            # call guarded against StopIteration.
            try:
                data_path = dataset.next_file()
            except StopIteration:
                break

            print(data_path)
            for _, mat in kaldi_io.read_mat_ark(data_path):
                sampled.append(pd.DataFrame(mat).sample(frac=frac).values)

        if not sampled:
            # np.concatenate would raise a ValueError here as well, only with
            # a message that does not point at the actual cause.
            raise ValueError('no data found in ' + str(path_data))

        data_sample = np.concatenate(sampled)
        print(data_sample.shape)

        with open(output_folder + '/dataset.mat', 'wb') as f:
            kaldi_io.write_mat(f,
                               data_sample.astype(np.float32, copy=False),
                               key='data')
Esempio n. 3
0
    def create_codebook(self, nj, data_folder):
        """
        Train the k-means codebook(s) incrementally on the data folder.

        One MiniBatchKMeans model is created per feature stream: four
        ('energy', 'raw', 'delta', 'dd') when ``self._multiple`` is set,
        otherwise a single one ('simple').  Shuffled rows are accumulated
        until more than 1000 are available, then every model receives one
        ``partial_fit`` call.  After each such call the dictionary of models
        is stored in ``self._dict_codebook``.

        :param nj:          forwarded to DataIterator (presumably the number
                            of kaldi split jobs -- TODO confirm)
        :param data_folder: forwarded to DataIterator as the data location
        """
        # create keys for enumeration
        if self._multiple:
            keys = ['energy', 'raw', 'delta', 'dd']
        else:
            keys = ['simple']

        # one mini-batch k-means model per feature stream
        dict_kmeans = {
            key: MiniBatchKMeans(n_clusters=self._num_cluster,
                                 init='random',
                                 batch_size=200,
                                 verbose=1,
                                 reassignment_ratio=0.001,
                                 max_no_improvement=100,
                                 n_init=self._num_cluster)
            for key in keys
        }

        # Columns belonging to each stream in the multi-codebook case
        # (presumably 13 static, 13 delta and 13 delta-delta coefficients
        # with the energy at position 0 of each group -- TODO confirm).
        stream_columns = {
            'energy': [0, 13, 26],
            'raw': range(1, 13, 1),
            'delta': range(14, 26, 1),
            'dd': range(27, 39, 1),
        }

        # create dataiterator
        dataset = DataIterator(nj, data_folder)

        # Shuffled chunks waiting for the next partial_fit.  They are joined
        # with pd.concat because DataFrame.append no longer exists in
        # pandas >= 2.0.
        chunks = []
        num_rows = 0
        while True:
            # Only next_file() signals the end of the data.
            try:
                data_path = dataset.next_file()
            except StopIteration:
                break

            print(data_path)
            for _, mat in kaldi_io.read_mat_ark(data_path):
                tmp_df = pd.DataFrame(mat)
                # sampling all rows without replacement == shuffling them
                chunks.append(tmp_df.sample(tmp_df.shape[0]))
                num_rows += tmp_df.shape[0]

                if num_rows <= 1000:
                    continue

                batch = pd.concat(chunks).values
                if self._multiple:
                    # the multi-codebook path always whitens its input
                    for key in keys:
                        dict_kmeans[key].partial_fit(
                            whiten(batch[:, stream_columns[key]]))
                elif self._whitening:
                    dict_kmeans['simple'].partial_fit(whiten(batch))
                else:
                    dict_kmeans['simple'].partial_fit(batch)

                self._dict_codebook = dict_kmeans
                # clean up
                chunks = []
                num_rows = 0

        # NOTE(review): rows still pending here (at most 1000) are never
        # passed to partial_fit, and self._dict_codebook is not set at all if
        # the threshold was never exceeded -- confirm that this is intended.
Esempio n. 4
0
    def _set_global_stats(self, file):
        """
        Load the global statistics from a kaldi archive into the instance.

        Every entry of the archive is inspected.  The matrix stored under
        'mean' is transposed into ``self.global_mean`` and the matrix stored
        under 'std' is transposed into ``self.global_var``.  Any other key
        only produces a console message.

        :param file: path to stats file (kaldi-format, read for the keys 'mean' and 'std')
        :return: None
        """
        for name, stats in kaldi_io.read_mat_ark(file):
            if name not in ('mean', 'std'):
                print('No mean or var set!!!')
                continue

            if name == 'mean':
                print('Setting mean')
                self.global_mean = np.transpose(stats)
            else:
                # NOTE(review): the archive key is 'std' while the attribute
                # and the message say "var" -- confirm which one is meant.
                print('Setting var')
                self.global_var = np.transpose(stats)
Esempio n. 5
0
 def load_codebook(self, path):
     """
     Read a codebook from a kaldi archive into ``self.codebook``.

     All matrices of the archive are visited in order; each one overwrites
     the previous, so the last matrix of the archive is the one kept.

     :param path: path of the archive handed to ``kaldi_io.read_mat_ark``
     :raises TypeError: if ``self._kaldi_formatting`` is not set
     """
     if self._kaldi_formatting:
         for _, matrix in kaldi_io.read_mat_ark(path):
             self.codebook = matrix
     else:
         raise TypeError
Esempio n. 6
0
 def _load_dataset(self, path):
     """
     Read a data set from a kaldi archive into ``self._dataset``.

     Each matrix of the archive overwrites the previous one, so the last
     matrix is the one kept; an empty archive leaves the attribute untouched.

     :param path: path of the archive handed to ``kaldi_io.read_mat_ark``
     """
     for _, matrix in kaldi_io.read_mat_ark(path):
         self._dataset = matrix
Esempio n. 7
0
 def _load_weights(self, path):
     """
     Read weights from a kaldi archive into ``self._weights``.

     Each matrix of the archive overwrites the previous one, so the last
     matrix is the one kept; an empty archive leaves the attribute untouched.

     :param path: path of the archive handed to ``kaldi_io.read_mat_ark``
     """
     for _, matrix in kaldi_io.read_mat_ark(path):
         self._weights = matrix
Esempio n. 8
0
    def vq_data(self, nj, data_folder, output_folder):
        """
        Vector-quantise the data folder with the loaded codebook.

        For every file delivered by the DataIterator the matrices are mapped
        to codebook indices with ``scipy.cluster.vq.vq`` and written, sorted
        by key and cast to float32, to '<output_folder>/feats_vq_<n>' where
        ``n`` counts the input files starting at 1.

        With ``self._multiple`` set, four label columns are produced (streams
        'energy', 'raw', 'delta', 'dd', each whitened before quantisation).
        Otherwise a single column is produced from the first 39 feature
        columns, whitened only if ``self._whitening`` is set.

        :param nj:            forwarded to DataIterator (presumably the number
                              of kaldi split jobs -- TODO confirm)
        :param data_folder:   forwarded to DataIterator as the data location
        :param output_folder: folder that receives the 'feats_vq_<n>' files
        """
        # a codebook has to be present before anything can be quantised
        assert self.codebook.shape[0] > 0
        print('VQing training data...')

        dataset = DataIterator(nj, data_folder)

        # feature columns that belong to each stream
        if self._multiple:
            keys = ['energy', 'raw', 'delta', 'dd']
            dict_indicies = {
                'energy': [0, 13, 26],
                'raw': range(1, 13, 1),
                'delta': range(14, 26, 1),
                'dd': range(27, 39, 1)
            }
        else:
            keys = ['simple']
            dict_indicies = {'simple': range(0, 39)}

        # per-stream view of the codebook columns
        dict_vq = {
            name: self.codebook[:, dict_indicies[name]]
            for name in keys
        }

        labels_per_utt = {}
        labels_all = []
        phoneme_all = []
        file_index = 1
        while True:
            try:
                data_path = dataset.next_file()
                print("Data path is in ", data_path)
                for utt_id, feats in kaldi_io.read_mat_ark(data_path):
                    if self._multiple:
                        # one label column per stream, joined side by side
                        stream_labels = [
                            pd.DataFrame(
                                vq(whiten(feats[:, dict_indicies[name]]),
                                   dict_vq[name])[0][:, np.newaxis])
                            for name in keys
                        ]
                        df = pd.concat(stream_labels, axis=1)
                    else:
                        observations = feats[:, :39]
                        if self._whitening:
                            observations = whiten(observations)
                        df = pd.DataFrame(
                            vq(observations,
                               dict_vq['simple'])[0][:, np.newaxis])
                        labels_all.append(df.values)

                        # column 39, when present, is collected for the
                        # (currently disabled) statistics below
                        if np.shape(feats)[1] > 39:
                            phoneme_all.append(feats[:, 39])

                    # remember the labels until the whole file has been read
                    labels_per_utt[utt_id] = df

                # write the label streams of this file ordered by key
                ordered = collections.OrderedDict(
                    sorted(labels_per_utt.items()))
                out_path = output_folder + '/feats_vq_' + str(file_index)
                with open(out_path, 'wb') as f:
                    for utt_id, labels in list(ordered.items()):
                        kaldi_io.write_mat(f,
                                           labels.values.astype(np.float32,
                                                                copy=False),
                                           key=utt_id)

                labels_per_utt = {}
                file_index += 1

            except StopIteration:
                # mutual-information statistics, switched off on purpose
                if False:
                    misc = Misc()
                    labels_all = np.concatenate(labels_all)
                    phoneme_all = np.concatenate(phoneme_all)
                    print(misc.calculate_mi(labels_all, phoneme_all))
                break
Esempio n. 9
0
    def do_inference(self, nj, input_folder, output_folder):
        """
        Does the inference of the model

        For every file delivered by the DataIterator the first ``dim`` columns
        of each matrix are passed to ``self._do_single_inference``; the
        results are written, sorted by key, to '<output_folder>/feats_vq_<n>'
        with ``n`` running from 1 to ``dataset.get_size()``.  When
        ``self.transform`` is not set, a trailing axis is added to each result
        before writing.

        :param nj:              number of jobs (how the dataset is split in kaldi)
        :param input_folder:    path to the data folder to do the inference
        :param output_folder:   path to save the output of the inference
        """

        # create DataIterator for iterate through the split folder created by kaldi
        dataset = DataIterator(nj,
                               input_folder,
                               splice=self._splice,
                               cmvn=self._cmvn)

        # width of the (spliced) feature part; further columns are ignored
        dim = self._dim * (2 * self._splice + 1)

        # Running number of the output file.  Exhausting this iterator raises
        # StopIteration as well and therefore also ends the loop below.
        iterator = iter(range(1, dataset.get_size() + 1))

        inferenced_data = {}  # storing the inferenced data of the current file

        while True:
            try:
                data_path = dataset.next_file()  # get path to data

                # Run the model exactly once per matrix
                # (assumes _do_single_inference yields the same result for
                # the same input -- TODO confirm).
                for key, mat in kaldi_io.read_mat_ark(data_path):
                    inferenced_data[key] = self._do_single_inference(
                        mat[:, :dim])

                od = collections.OrderedDict(sorted(inferenced_data.items()))

                # write posteriors (inferenced data) to files
                with open(output_folder + '/feats_vq_' + str(next(iterator)),
                          'wb') as f:
                    for key, mat in od.items():
                        if self.transform:
                            kaldi_io.write_mat(f, mat, key=key)
                        else:
                            kaldi_io.write_mat(f, mat[:, np.newaxis], key=key)

                # reset dict so that memory use stays bounded by one file
                inferenced_data = {}

            except StopIteration:
                break