def process_audio(self, wav_path, label_file, char_file, h5_file):
    """Extract spectrograms for every utterance and persist them to an HDF5 file.

    Args:
        wav_path:   text file, one "utt_id wav_path" pair per line.
        label_file: transcript file consumed by process_label_file.
        char_file:  mapping file consumed by process_map_file.
        h5_file:    output path for the HDF5 archive.

    Side effects:
        Sets self.char_map, self.int2phone and self.features_label;
        writes one HDF5 group per utterance with 'data' and 'label' datasets.

    Raises:
        ValueError: if the number of wavs and transcripts disagree.
    """
    # Read the phone/char map file.
    self.char_map, self.int2phone = process_map_file(char_file)
    # Read the label (transcript) file.
    label_dict = process_label_file(label_file, self.out_type, self.char_map)

    # Extract a spectrogram per utterance; 'with' guarantees the list
    # file is closed even if parse_audio raises.
    spec_dict = dict()
    with open(wav_path, 'r') as f:
        for line in f:
            utt, path = line.strip().split()
            spect = parse_audio(path, audio_conf, windows, normalize=self.normalize)
            spec_dict[utt] = spect.numpy()

    # Validate with a real exception: 'assert' is stripped under -O.
    if len(spec_dict) != len(label_dict):
        raise ValueError(
            "%s data: the number of wavs (%d) and transcripts (%d) differ"
            % (self.data_set, len(spec_dict), len(label_dict)))

    self.features_label = []
    # Save the data as an h5 file.  dict.keys()/.values() are view objects
    # in Python 3 and h5py cannot serialize them -- materialize as lists.
    with h5py.File(h5_file, 'w') as f:
        f.create_dataset("phone_map_key", data=list(self.char_map.keys()))
        f.create_dataset("phone_map_value", data=list(self.char_map.values()))
        for utt in spec_dict:
            grp = f.create_group(utt)
            self.features_label.append(
                (torch.FloatTensor(spec_dict[utt]), label_dict[utt].tolist()))
            grp.create_dataset('data', data=spec_dict[utt])
            grp.create_dataset('label', data=label_dict[utt])
    print("Saved the %s data to h5py file" % self.data_set)
def process_audio(self, wav_path, label_file, char_file, h5_file):
    """Extract spectrograms, optionally normalize them globally, and save to HDF5.

    Args:
        wav_path:   text file, one "utt_id wav_path" pair per line.
        label_file: transcript file consumed by process_label_file.
        char_file:  mapping file consumed by process_map_file.
        h5_file:    output path for the HDF5 archive.

    Side effects:
        Sets self.char_map, self.int2phone and self.features_label;
        writes one HDF5 group per utterance with 'data' and 'label' datasets.
        Exits the process (sys.exit) if wav/transcript counts disagree,
        preserving the original script behavior.
    """
    # Read the phone/char map file.
    self.char_map, self.int2phone = process_map_file(char_file)
    # Read the label (transcript) file.
    label_dict = process_label_file(label_file, self.out_type, self.char_map)

    # Extract a spectrogram per utterance; 'with' guarantees the list
    # file is closed even if parse_audio raises.
    spec_dict = dict()
    with open(wav_path, 'r') as f:
        for line in f:
            utt, path = line.strip().split()
            spect = parse_audio(path, audio_conf, windows)
            spec_dict[utt] = spect.numpy()

    if self.normalize:
        # Global cepstral mean/variance normalization over all frames.
        # Concatenate once from a list instead of torch.cat inside the
        # loop, which re-copied the growing tensor every iteration (O(n^2)).
        spec_all = torch.cat(
            [torch.FloatTensor(spec_dict[utt]) for utt in spec_dict], 0)
        mean = torch.mean(spec_all, 0, True)
        std = torch.std(spec_all, 0, True)
        for utt in spec_dict:
            # (x - mean) / std; replaces the deprecated
            # torch.add(tensor, -1, mean) alpha-overload form.
            tmp = torch.FloatTensor(spec_dict[utt]) - mean
            spec_dict[utt] = torch.div(tmp, std).numpy()

    if len(spec_dict) != len(label_dict):
        print("%s data: The num of wav and text are not the same!" % self.data_set)
        sys.exit(1)

    self.features_label = []
    # Save the data as an h5 file.  dict.keys()/.values() are view objects
    # in Python 3 and h5py cannot serialize them -- materialize as lists.
    with h5py.File(h5_file, 'w') as f:
        f.create_dataset("phone_map_key", data=list(self.char_map.keys()))
        f.create_dataset("phone_map_value", data=list(self.char_map.values()))
        for utt in spec_dict:
            grp = f.create_group(utt)
            self.features_label.append(
                (torch.FloatTensor(spec_dict[utt]), label_dict[utt].tolist()))
            grp.create_dataset('data', data=spec_dict[utt])
            grp.create_dataset('label', data=label_dict[utt])
    print("Saved the %s data to h5py file" % self.data_set)
def __getitem__(self, idx):
    """Return the (spectrogram, label) pair for the utterance at *idx*.

    The spectrogram is computed on the fly by parse_audio from the stored
    wav path; the label comes from the parallel self.label list.
    """
    wav_file = self.path[idx]
    feature = parse_audio(wav_file, audio_conf, windows, normalize=self.normalize)
    target = self.label[idx]
    return feature, target