Ejemplo n.º 1
0
    def create_dataset(self, nj, frac, path_data, output_folder):
        """
        Draw a random sample of rows from every matrix in the data set and
        save all sampled rows as one matrix.

        :param nj: number of jobs, passed on to DataIterator
        :param frac: fraction of rows to draw from each matrix (forwarded to
            ``DataFrame.sample``)
        :param path_data: data location, passed on to DataIterator
        :param output_folder: folder in which ``dataset.mat`` is written
        :raises ValueError: raised by ``np.concatenate`` when no matrix was
            read at all
        """
        dataset = DataIterator(nj, path_data)

        sampled = []
        while True:
            # StopIteration from next_file() ends the loop; the try is limited
            # to that call so errors raised while reading are not mistaken
            # for the end of the data.
            try:
                data_path = dataset.next_file()
            except StopIteration:
                break

            print(data_path)
            for _, mat in kaldi_io.read_mat_ark(data_path):
                sampled.append(pd.DataFrame(mat).sample(frac=frac).values)

        data_sample = np.concatenate(sampled)
        print(data_sample.shape)

        # the whole sample is stored under the single key 'data'
        with open(output_folder + '/dataset.mat', 'wb') as f:
            kaldi_io.write_mat(f,
                               data_sample.astype(np.float32, copy=False),
                               key='data')
Ejemplo n.º 2
0
    def create_codebook(self, nj, data_folder):
        """
        Fit the k-means model(s) of the codebook on the data in data_folder.

        With ``self._multiple`` set, four MiniBatchKMeans models are created
        (energy, raw, delta and delta-delta features); otherwise a single
        model named 'simple' is used. Rows are buffered until more than 1000
        are available and are then handed to ``partial_fit``. The dict of
        models is stored in ``self._dict_codebook`` after every fit.

        :param nj: number of jobs, passed on to DataIterator
        :param data_folder: data location, passed on to DataIterator
        """
        # columns of each feature stream, only used when self._multiple is set
        multi_stream_columns = {
            'energy': [0, 13, 26],
            'raw': range(1, 13, 1),
            'delta': range(14, 26, 1),
            'dd': range(27, 39, 1),
        }
        keys = list(multi_stream_columns) if self._multiple else ['simple']

        # one minibatch k-means per key
        dict_kmeans = {}
        for key in keys:
            dict_kmeans[key] = MiniBatchKMeans(n_clusters=self._num_cluster,
                                               init='random',
                                               batch_size=200,
                                               verbose=1,
                                               reassignment_ratio=0.001,
                                               max_no_improvement=100,
                                               n_init=self._num_cluster)

        dataset = DataIterator(nj, data_folder)

        # Frames are collected in a list and concatenated once per batch;
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # buffer on every call.
        pending = []
        pending_rows = 0
        while True:
            try:
                data_path = dataset.next_file()
            except StopIteration:
                # NOTE(review): rows still buffered here (at most 1000) are
                # not fitted, same as before - confirm this is intended.
                break

            print(data_path)
            for _, mat in kaldi_io.read_mat_ark(data_path):
                frame = pd.DataFrame(mat)
                # drawing all rows without replacement shuffles the frame
                pending.append(frame.sample(n=frame.shape[0]))
                pending_rows += frame.shape[0]
                if pending_rows <= 1000:
                    continue

                batch = pd.concat(pending).values
                if self._multiple:
                    for key, columns in multi_stream_columns.items():
                        dict_kmeans[key].partial_fit(
                            whiten(batch[:, columns]))
                elif self._whitening:
                    dict_kmeans['simple'].partial_fit(whiten(batch))
                else:
                    dict_kmeans['simple'].partial_fit(batch)

                self._dict_codebook = dict_kmeans
                pending, pending_rows = [], 0  # clean up
Ejemplo n.º 3
0
    def create_tfrecords(self, stats, path_input, path_output):
        """
        Create the TFRecord files, one per input file.

        The stats archive is read first: the matrix stored under the key
        'mean' sets ``self.global_mean`` and the one under 'std' sets
        ``self.global_std``. Afterwards all matrices of each input file are
        stacked and handed to ``self._convert_npy_to_tfrecords``.

        :param stats: stats-file to normalize data (archive with the keys
            'mean' and 'std')
        :param path_input: path to the folder where the features + phonemes are
        :param path_output: output path to save the tfrecords
        :return: None
        """
        # TODO refactor to KaldiMiscHelper ???
        assert isinstance(path_input, str) and isinstance(path_output, str)

        # TODO hard-coded
        for key, mat in kaldi_io.read_mat_ark(stats):
            if key == 'mean':
                print('Setting mean')
                self.global_mean = np.transpose(mat)[0, :]
            elif key == 'std':
                print('Setting std')
                self.global_std = np.transpose(mat)[0, :]
            else:
                print('No mean or var set!!!')

        dataset = DataIterator(self._nj,
                               path_input,
                               splice=self._splice,
                               cmvn=self._cmvn)

        count = 1
        while True:
            # only next_file() is expected to signal the end of the data
            try:
                data_path = dataset.next_file()
            except StopIteration:
                break

            # collect first, concatenate once: concatenating inside the loop
            # copies everything read so far on every iteration
            frames = [
                pd.DataFrame(mat)
                for _, mat in kaldi_io.read_mat_ark(data_path)
            ]
            # pd.concat rejects an empty list; an input file without any
            # matrix still yields an empty frame, as before
            merged = pd.concat(frames) if frames else pd.DataFrame()

            self._convert_npy_to_tfrecords(
                merged.values,
                path_output + '/data_' + str(count) + '.tfrecords')
            print('/data_' + str(count) + '.tfrecords created')

            count += 1
Ejemplo n.º 4
0
    def merge_data_phonemes(self, nj, path_data, path_phonemes, output_folder):
        """
        Append the alignment of every utterance to its feature matrix and
        write the merged matrices to ``feats_vq_<n>`` files, one per input
        file pair.

        Features and alignments are matched by utterance key. Utterances
        without an alignment in the corresponding alignment file are left
        out.

        :param nj: number of jobs, passed on to AlignmentIterator
        :param path_data: path to the feature data (must be a str)
        :param path_phonemes: path to the alignments (must be a str)
        :param output_folder: folder in which the merged files are written
        """
        assert isinstance(path_data, str) and isinstance(path_phonemes, str)

        # create Iterators
        # NOTE(review): the feature iterator uses self._nj while the
        # alignment iterator uses the nj argument - confirm both agree.
        dataset = DataIterator(self._nj,
                               path_data,
                               splice=self._splice,
                               cmvn=self._cmvn)
        phonemes = AlignmentIterator(nj, path_phonemes)

        count = 1
        while True:
            try:
                data_file = dataset.next_file()
                phoneme_file = phonemes.next_file()
            except StopIteration:
                break

            # Look alignments up by key instead of pairing the two archives
            # position by position: with positional pairing one missing
            # entry shifts every following pair, so all later utterances
            # would be dropped.
            alignments = {
                key: ali
                for key, ali in kaldi_io.read_ali_ark(phoneme_file)
            }

            merged = {}
            for key, mat in kaldi_io.read_mat_ark(data_file):
                if key not in alignments:
                    continue
                print(key)
                merged[key] = pd.concat(
                    [pd.DataFrame(mat),
                     pd.DataFrame(alignments[key])],
                    axis=1)

            with open(output_folder + '/feats_vq_' + str(count),
                      'wb') as f:
                for key, frame in merged.items():
                    kaldi_io.write_mat(f,
                                       frame.values.astype(np.float32,
                                                           copy=False),
                                       key=key)

            count += 1
Ejemplo n.º 5
0
    def concat_data(self, path_data, path_phonemes, output_folder):
        """
        Filter the features by the available alignments, merge both and
        write them to disk, gathering per-file statistics on the way.

        For every input file two files are written to output_folder:
        ``feats_vq_<n>`` (features with the alignment appended as extra
        column(s)) and ``features_<n>`` (the first ``dim`` columns only).
        Utterances are written sorted by key. Unless path_data is 'test' or
        'dev', the gathered statistics are finally passed to
        ``self._create_and_save_stats``.

        :param path_data: data location, passed on to DataIterator
        :param path_phonemes: alignment archive read with
            ``kaldi_io.read_ali_ark``
        :param output_folder: folder for the written files
        :raises ValueError: raised by ``np.concatenate`` when a file contains
            no utterance with a matching alignment
        """
        dataset = DataIterator(self._nj,
                               path_data,
                               splice=self._splice,
                               cmvn=self._cmvn)

        # NOTE(review): this compares the whole path with the literal names
        # 'test' and 'dev' - confirm callers really pass these values.
        create_stats = path_data not in ['test', 'dev']

        # set dim
        dim = self._dim * (2 * self._splice + 1)

        print('Loading alignment dict')
        alignment_dict = {
            key: ali
            for key, ali in kaldi_io.read_ali_ark(path_phonemes)
        }
        print('Loading done')

        # gather_stats: n, mean, var (nj rows)
        gather_stats = np.zeros([self._nj, 2 * dim + 1])
        print('Creating filtered training data and merge them with the labels')
        count = 1
        while True:
            try:
                data_path = dataset.next_file()
            except StopIteration:
                break

            kept = {}
            kept_feats = []
            for key, mat in kaldi_io.read_mat_ark(data_path):
                # We do not have alignments for all of the training data, so
                # utterances without one (or with a differing number of
                # frames) are left out. The membership test is done on the
                # dict itself; building a key list per utterance made every
                # lookup linear in the number of alignments.
                if key in alignment_dict and \
                        mat.shape[0] == alignment_dict[key].shape[0]:
                    kept[key] = pd.concat([
                        pd.DataFrame(mat),
                        pd.DataFrame(alignment_dict[key])
                    ],
                                          axis=1)
                    kept_feats.append(mat)

            # utterances are written in key order
            ordered = [(key, kept[key].values.astype(np.float32, copy=False))
                       for key in sorted(kept)]

            # write filtered training data and the labels to files
            with open(output_folder + '/feats_vq_' + str(count),
                      'wb') as f:
                for key, merged in ordered:
                    kaldi_io.write_mat(f, merged, key=key)
            # write the filtered training data
            with open(output_folder + '/features_' + str(count),
                      'wb') as f:
                for key, merged in ordered:
                    kaldi_io.write_mat(f, merged[:, :dim], key=key)

            # save stats for single file
            file_data = np.concatenate(kept_feats)
            row = count - 1
            gather_stats[row, 0] = file_data.shape[0]  # number of samples
            gather_stats[row, 1:(dim + 1)] = np.mean(file_data, axis=0)
            gather_stats[row, (dim + 1):] = np.var(file_data, axis=0)
            count += 1

        if create_stats:
            print('Saving std and mean of data to stats.mat')
            self._create_and_save_stats(gather_stats, output_folder)
Ejemplo n.º 6
0
    def vq_data(self, nj, data_folder, output_folder):
        """
        Vector-quantise the data with ``self.codebook`` and write the label
        streams to ``feats_vq_<n>`` files, one per input file.

        With ``self._multiple`` set, the energy, raw, delta and delta-delta
        columns are whitened and quantised separately, giving four label
        columns per frame. Otherwise the first 39 columns are quantised as a
        whole (whitened first when ``self._whitening`` is set), giving one
        label column. Utterances are written sorted by key.

        :param nj: number of jobs, passed on to DataIterator
        :param data_folder: data location, passed on to DataIterator
        :param output_folder: folder in which the label files are written
        """
        # vqing training data
        assert self.codebook.shape[0] > 0
        print('VQing training data...')

        dataset = DataIterator(nj, data_folder)

        # codebook columns belonging to each stream
        if self._multiple:
            dict_indicies = {
                'energy': [0, 13, 26],
                'raw': range(1, 13, 1),
                'delta': range(14, 26, 1),
                'dd': range(27, 39, 1)
            }
        else:
            dict_indicies = {'simple': range(0, 39)}

        dict_vq = {
            name: self.codebook[:, columns]
            for name, columns in dict_indicies.items()
        }

        count = 1
        while True:
            try:
                data_path = dataset.next_file()
            except StopIteration:
                break

            print("Data path is in ", data_path)
            labels_by_key = {}
            for key, mat in kaldi_io.read_mat_ark(data_path):
                if self._multiple:
                    # the multi-stream path always whitens its input
                    streams = [(name, whiten(mat[:, columns]))
                               for name, columns in dict_indicies.items()]
                elif self._whitening:
                    streams = [('simple', whiten(mat[:, :39]))]
                else:
                    streams = [('simple', mat[:, :39])]

                # vq() returns (codes, distortion); only the codes are kept,
                # one column per stream
                labels_by_key[key] = np.column_stack(
                    [vq(feats, dict_vq[name])[0] for name, feats in streams])

            # save label-stream from vq, ordered by key
            with open(output_folder + '/feats_vq_' + str(count),
                      'wb') as f:
                for key in sorted(labels_by_key):
                    kaldi_io.write_mat(f,
                                       labels_by_key[key].astype(np.float32,
                                                                 copy=False),
                                       key=key)

            count += 1
Ejemplo n.º 7
0
    def do_inference(self, nj, input_folder, output_folder):
        """
        Does the inference of the model

        Every matrix of every input file is cut to its first ``dim`` columns
        and passed to ``self._do_single_inference``. The results of one input
        file are written, sorted by key, to ``feats_vq_<n>`` in
        output_folder.

        :param nj:              number of jobs (how the dataset is split in kaldi)
        :param input_folder:    path to the data folder to do the inference
        :param output_folder:   path to save the output of the inference
        """
        # create DataIterator for iterate through the split folder created by kaldi
        dataset = DataIterator(nj,
                               input_folder,
                               splice=self._splice,
                               cmvn=self._cmvn)

        dim = self._dim * (2 * self._splice + 1)

        # file numbers 1..get_size(), necessary for naming the output files
        file_numbers = iter(range(1, dataset.get_size() + 1))

        while True:
            try:
                data_path = dataset.next_file()  # get path to data
                # as before, the loop also ends when the numbers run out
                file_number = next(file_numbers)
            except StopIteration:
                break

            # Inference is run once per utterance; results are kept only
            # until the file has been written.
            inferenced_data = {}
            for key, mat in kaldi_io.read_mat_ark(data_path):
                inferenced_data[key] = self._do_single_inference(
                    mat[:, :dim])

            # write posteriors (inferenced data) to files
            with open(output_folder + '/feats_vq_' + str(file_number),
                      'wb') as f:
                for key in sorted(inferenced_data):
                    result = inferenced_data[key]
                    if self.transform:
                        kaldi_io.write_mat(f, result, key=key)
                    else:
                        # presumably a 1-D result that needs a second axis
                        # to be written as a matrix - TODO confirm
                        kaldi_io.write_mat(f, result[:, np.newaxis], key=key)