Example 1
0
    def process_audio(self, wav_path, label_file, char_file, h5_file):
        """Extract spectrogram features, pair them with labels, save to HDF5.

        Args:
            wav_path:   text file whose lines are "<utt-id> <audio-path>".
            label_file: label file consumed by ``process_label_file``.
            char_file:  char/phone map file consumed by ``process_map_file``.
            h5_file:    path of the HDF5 file to write.

        Side effects:
            Sets ``self.char_map``, ``self.int2phone`` and
            ``self.features_label``; writes the whole dataset to *h5_file*.

        Raises:
            ValueError: if the number of audio files and labels differ.
        """
        # Read the char/phone map file.
        self.char_map, self.int2phone = process_map_file(char_file)

        # Read the label file.
        label_dict = process_label_file(label_file, self.out_type,
                                        self.char_map)

        # Extract the spectrum for each utterance listed in wav_path.
        spec_dict = {}
        # Context manager guarantees the handle is closed even on error,
        # and iterating the file lazily avoids readlines()'s full copy.
        with open(wav_path, 'r') as wav_list:
            for line in wav_list:
                utt, path = line.strip().split()
                spect = parse_audio(path,
                                    audio_conf,
                                    windows,
                                    normalize=self.normalize)
                spec_dict[utt] = spect.numpy()

        # Explicit error instead of `assert`, which is silently stripped
        # when Python runs with -O.
        if len(spec_dict) != len(label_dict):
            raise ValueError(
                "%s data: The num of wav and text are not the same!" %
                self.data_set)

        self.features_label = []
        # Save the data as an h5 file.  The dict views must be materialized
        # with list(): h5py cannot serialize dict_keys/dict_values objects
        # on Python 3.  The `with` also fixes the previously-leaked handle.
        with h5py.File(h5_file, 'w') as f:
            f.create_dataset("phone_map_key",
                             data=list(self.char_map.keys()))
            f.create_dataset("phone_map_value",
                             data=list(self.char_map.values()))
            for utt in spec_dict:
                grp = f.create_group(utt)
                self.features_label.append(
                    (torch.FloatTensor(spec_dict[utt]),
                     label_dict[utt].tolist()))
                grp.create_dataset('data', data=spec_dict[utt])
                grp.create_dataset('label', data=label_dict[utt])
        print("Saved the %s data to h5py file" % self.data_set)
Example 2
0
    def process_audio(self, wav_path, label_file, char_file, h5_file):
        """Extract spectrograms, optionally mean/std-normalize them globally,
        pair them with labels, and save everything to HDF5.

        Args:
            wav_path:   text file whose lines are "<utt-id> <audio-path>".
            label_file: label file consumed by ``process_label_file``.
            char_file:  char/phone map file consumed by ``process_map_file``.
            h5_file:    path of the HDF5 file to write.

        Side effects:
            Sets ``self.char_map``, ``self.int2phone`` and
            ``self.features_label``; writes the whole dataset to *h5_file*.
            Exits the process (status 1) if audio/label counts differ,
            matching the original script behavior.
        """
        # Read the char/phone map file.
        self.char_map, self.int2phone = process_map_file(char_file)

        # Read the label file.
        label_dict = process_label_file(label_file, self.out_type,
                                        self.char_map)

        # Extract the spectrum for each utterance listed in wav_path.
        spec_dict = {}
        # Context manager guarantees the handle is closed even on error,
        # and iterating the file lazily avoids readlines()'s full copy.
        with open(wav_path, 'r') as wav_list:
            for line in wav_list:
                utt, path = line.strip().split()
                spect = parse_audio(path, audio_conf, windows)
                spec_dict[utt] = spect.numpy()

        if self.normalize:
            # Single torch.cat over a list instead of the original running
            # cat inside the loop, which re-copied all previous frames on
            # every iteration (quadratic in total frame count).
            spec_all = torch.cat(
                [torch.FloatTensor(spec_dict[utt]) for utt in spec_dict], 0)
            mean = torch.mean(spec_all, 0, True)
            std = torch.std(spec_all, 0, True)
            for utt in spec_dict:
                # Plain broadcasting subtraction/division replaces the
                # deprecated torch.add(x, -1, mean) alpha overload.
                centered = torch.FloatTensor(spec_dict[utt]) - mean
                spec_dict[utt] = torch.div(centered, std).numpy()

        if len(spec_dict) != len(label_dict):
            print("%s data: The num of wav and text are not the same!" %
                  self.data_set)
            sys.exit(1)

        self.features_label = []
        # Save the data as an h5 file.  The dict views must be materialized
        # with list(): h5py cannot serialize dict_keys/dict_values objects
        # on Python 3.  The `with` also fixes the previously-leaked handle.
        with h5py.File(h5_file, 'w') as f:
            f.create_dataset("phone_map_key",
                             data=list(self.char_map.keys()))
            f.create_dataset("phone_map_value",
                             data=list(self.char_map.values()))
            for utt in spec_dict:
                grp = f.create_group(utt)
                self.features_label.append(
                    (torch.FloatTensor(spec_dict[utt]),
                     label_dict[utt].tolist()))
                grp.create_dataset('data', data=spec_dict[utt])
                grp.create_dataset('label', data=label_dict[utt])
        print("Saved the %s data to h5py file" % self.data_set)
Example 3
0
 def __getitem__(self, idx):
     """Return the (features, label) pair for sample *idx*.

     Features are computed on the fly by ``parse_audio`` from the audio
     path stored at ``self.path[idx]``; the label is ``self.label[idx]``.
     """
     features = parse_audio(self.path[idx], audio_conf, windows,
                            normalize=self.normalize)
     return features, self.label[idx]
Example 4
0
 def __getitem__(self, idx):
     """Return the (features, label) pair for sample *idx*.

     The features come from ``parse_audio`` applied to the stored audio
     path; the label is taken straight from ``self.label``.
     """
     sample_path = self.path[idx]
     spectrogram = parse_audio(sample_path, audio_conf, windows,
                               normalize=self.normalize)
     return spectrogram, self.label[idx]