def __init__(self, data, set_type, config):
    """Set up the dataset from a ``{audio_path: label}`` mapping.

    Args:
        data: Mapping whose keys become ``audio_files`` and whose values
            become ``audio_labels`` (same order).
        set_type: Stored as ``self.set_type``.
        config: Mapping of settings. ``bg_noise_files`` is optional; every
            other key read below is required. Note that the filtered
            ``bg_noise_files`` list is written back into ``config``.
    """
    super().__init__()
    self.config = config
    self.set_type = set_type
    self.audio_files = list(data.keys())
    self.audio_labels = list(data.values())

    # Only entries ending in "wav" are kept; the filtered list is stored
    # back into the config before the clips are decoded at 16 kHz.
    config["bg_noise_files"] = [
        path for path in config.get("bg_noise_files", [])
        if path.endswith("wav")
    ]
    self.bg_noise_audio = []
    for path in config["bg_noise_files"]:
        samples = librosa.core.load(path, sr=16000)[0]
        self.bg_noise_audio.append(samples)

    self.unknown_prob = config["unknown_prob"]
    self.silence_prob = config["silence_prob"]
    self.noise_prob = config["noise_prob"]
    self.input_length = config["input_length"]
    self.timeshift_ms = config["timeshift_ms"]

    # Two independent caches of the same configured size.
    self._audio_cache = SimpleCache(config["cache_size"])
    self._file_cache = SimpleCache(config["cache_size"])

    # Labels equal to 1 are excluded from the silence quota.
    n_unk = sum(1 for label in self.audio_labels if label == 1)
    n_counted = len(self.audio_labels) - n_unk
    self.n_silence = int(self.silence_prob * n_counted)

    self.audio_processor = AudioPreprocessor(
        n_mels=config["n_mels"],
        n_dct_filters=config["n_dct_filters"],
        hop_ms=10,
        config=config,
    )
    self.audio_preprocess_type = config["audio_preprocess_type"]
    self.n_mels = config["n_mels"]
def __init__(self, onnx_filename, labels):
    """Load an ONNX graph and prepare it with the Caffe2 backend.

    Args:
        onnx_filename: Path handed to ``onnx.load``; also kept as
            ``self.model_filename``.
        labels: Stored as ``self.labels``.
    """
    self.labels = labels
    self.model_filename = onnx_filename
    self.audio_processor = AudioPreprocessor()

    graph = onnx.load(onnx_filename)
    self._graph = graph
    # The name of the graph's first declared input is remembered so that
    # inference can key its input dict by it.
    first_input = graph.graph.input[0]
    self._in_name = first_input.name
    self.model = onnx_caffe2.backend.prepare(graph)
def __init__(self, model_filename, no_cuda=False, labels=None):
    """Store the service settings and load the model via ``self.reload()``.

    Args:
        model_filename: Stored as ``self.model_filename``.
        no_cuda: Stored as ``self.no_cuda``.
        labels: List of label names. When omitted (``None``), a fresh
            ``["_silence_", "_unknown_", "command", "random"]`` list is
            created for this instance.
    """
    # A list literal used as the default would be created once and shared
    # by every instance; build a new list per call instead.
    if labels is None:
        labels = ["_silence_", "_unknown_", "command", "random"]
    self.labels = labels
    self.model_filename = model_filename
    self.no_cuda = no_cuda
    self.audio_processor = AudioPreprocessor()
    self.reload()
class TorchLabelService(LabelService):
    """Label service backed by a ``model.SpeechEffModel`` checkpoint."""

    def __init__(self, model_filename, no_cuda=False, labels=None):
        """Store the service settings and load the model.

        Args:
            model_filename: Checkpoint path passed to ``self.model.load``.
            no_cuda: When false, the model and its inputs are moved to CUDA.
            labels: List of label names. When omitted (``None``), a fresh
                ``["_silence_", "_unknown_", "command", "random"]`` list is
                created for this instance.
        """
        # A list literal used as the default would be created once and
        # shared by every instance; build a new list per call instead.
        if labels is None:
            labels = ["_silence_", "_unknown_", "command", "random"]
        self.labels = labels
        self.model_filename = model_filename
        self.no_cuda = no_cuda
        self.audio_processor = AudioPreprocessor()
        self.reload()

    def reload(self):
        """Rebuild the model from the EFFB0 config and load its weights."""
        config = model.find_config(model.ConfigType.EFFB0)
        config["n_labels"] = len(self.labels)
        self.model = model.SpeechEffModel(config)
        if not self.no_cuda:
            self.model.cuda()
        self.model.load(self.model_filename)
        self.model.eval()
        # Print the module itself. Calling it with no arguments, as the
        # previous code did, runs a forward pass without the input tensor
        # that label() shows the model expects.
        print(self.model)

    def label(self, wav_data):
        """Score audio data against every trained label.

        Args:
            wav_data: Buffer read as 16-bit integer samples; the samples
                are scaled by 1/32768 before feature extraction.

        Returns:
            A ``(labels, predictions)`` tuple: the service's label list and
            a numpy array holding the softmax of the model output.
        """
        wav_data = np.frombuffer(wav_data, dtype=np.int16) / 32768.
        features = self.audio_processor.compute_mfccs(wav_data).squeeze(2)
        model_in = torch.from_numpy(features).unsqueeze(0)
        model_in = torch.autograd.Variable(model_in, requires_grad=False)
        if not self.no_cuda:
            model_in = model_in.cuda()
        scores = self.model(model_in).squeeze(0).cpu()
        # Explicit axis: calling softmax without `dim` is deprecated. The
        # last axis is what the implicit rule selects for 1-D and 2-D
        # inputs, so results are unchanged for those shapes.
        predictions = F.softmax(scores, dim=-1).data.numpy()
        return self.labels, predictions
class Caffe2LabelService(LabelService):
    """Label service that runs an ONNX graph through the Caffe2 backend."""

    def __init__(self, onnx_filename, labels):
        """Load the ONNX graph and prepare the backend model.

        Args:
            onnx_filename: Path handed to ``onnx.load``; also kept as
                ``self.model_filename``.
            labels: Label names, indexed by the model's output position.
        """
        self.labels = labels
        self.model_filename = onnx_filename
        self.audio_processor = AudioPreprocessor()

        graph = onnx.load(onnx_filename)
        self._graph = graph
        # Inference feeds the model through its first declared input.
        first_input = graph.graph.input[0]
        self._in_name = first_input.name
        self.model = onnx_caffe2.backend.prepare(graph)

    def label(self, wav_data):
        """Return the best-scoring label and its score for ``wav_data``.

        Args:
            wav_data: Buffer read as 16-bit integer samples; the samples
                are scaled by 1/32768 before feature extraction.

        Returns:
            A ``(label, score)`` tuple taken from the softmax of the first
            model output.
        """
        samples = np.frombuffer(wav_data, dtype=np.int16) / 32768.
        features = self.audio_processor.compute_mfccs(samples).squeeze(2)

        # Two leading singleton axes are added before the float32 cast.
        batched = np.expand_dims(features, 0)
        batched = np.expand_dims(batched, 0)
        feed = {self._in_name: batched.astype(np.float32)}

        outputs = self.model.run(feed)
        predictions = _softmax(outputs[0])
        best_index = np.argmax(predictions)
        return (self.labels[best_index], np.max(predictions))
class HiKoovLabelService(LabelService):
    """Label service with a fixed label set and a fixed checkpoint path."""

    def __init__(self):
        """Set the built-in labels and checkpoint path, then load the model."""
        self.labels = [
            "_silence_",
            "_unknown_",
            "hi_koov",
        ]
        self.model_filename = 'model/model-0355-best.pt'
        self.audio_processor = AudioPreprocessor()
        self.reload()

    def reload(self):
        """Rebuild the model from the RES8_KOOV config and load its weights."""
        res_config = model.find_config(model.ConfigType.RES8_KOOV)
        net = model.SpeechResModel(res_config)
        self.model = net
        net.load(self.model_filename)
        net.eval()

    def label(self, wav_data):
        """Return the best-scoring label and its score for ``wav_data``.

        Args:
            wav_data: Audio passed unchanged to
                ``self.audio_processor.compute_mfccs``.

        Returns:
            A ``(label, score)`` tuple taken from the softmax of the model
            output.
        """
        features = self.audio_processor.compute_mfccs(wav_data).squeeze(2)
        batch = torch.from_numpy(features).unsqueeze(0)
        batch = torch.autograd.Variable(batch, requires_grad=False)

        scores = self.model(batch).squeeze(0).cpu()
        predictions = F.softmax(scores, dim=0).data.numpy()

        best_index = np.argmax(predictions)
        return (self.labels[best_index], np.max(predictions))
def __init__(self):
    """Set the built-in labels and checkpoint path, then load the model."""
    # Both the label set and the checkpoint location are fixed here.
    self.labels = [
        "_silence_",
        "_unknown_",
        "hi_koov",
    ]
    self.model_filename = 'model/model-0355-best.pt'
    self.audio_processor = AudioPreprocessor()
    # reload() is invoked last, after model_filename has been set.
    self.reload()
class SpeechDataset(data.Dataset):
    """Dataset of ``(audio samples, integer label)`` pairs for keyword spotting.

    Label 0 is silence, label 1 is "unknown", and the configured
    ``wanted_words`` are numbered from 2 upwards (see ``splits``).
    """

    LABEL_SILENCE = "__silence__"
    LABEL_UNKNOWN = "__unknown__"

    def __init__(self, data, set_type, config):
        """Set up the dataset from a ``{audio_path: label}`` mapping.

        Args:
            data: Mapping whose keys become ``audio_files`` and whose values
                become ``audio_labels`` (same order).
            set_type: A ``DatasetType`` member; time shifting in
                ``load_audio`` is applied only for ``DatasetType.TRAIN``.
            config: Mapping of settings. ``bg_noise_files`` is optional;
                every other key read below is required. The filtered
                ``bg_noise_files`` list is written back into ``config``.
        """
        super().__init__()
        self.config = config
        self.audio_files = list(data.keys())
        self.set_type = set_type
        self.audio_labels = list(data.values())
        # Only entries ending in "wav" are kept; the clips are decoded once,
        # at 16 kHz, and held in memory.
        config["bg_noise_files"] = [
            path for path in config.get("bg_noise_files", [])
            if path.endswith("wav")
        ]
        self.bg_noise_audio = [
            librosa.core.load(path, sr=16000)[0]
            for path in config["bg_noise_files"]
        ]
        self.unknown_prob = config["unknown_prob"]
        self.silence_prob = config["silence_prob"]
        self.noise_prob = config["noise_prob"]
        self.input_length = config["input_length"]
        self.timeshift_ms = config["timeshift_ms"]
        self._audio_cache = SimpleCache(config["cache_size"])
        self._file_cache = SimpleCache(config["cache_size"])
        # Label 1 is the "unknown" class; it is excluded from the silence
        # quota. n_silence is computed here but __len__ does not add it.
        n_unk = sum(1 for label in self.audio_labels if label == 1)
        self.n_silence = int(
            self.silence_prob * (len(self.audio_labels) - n_unk))
        self.audio_processor = AudioPreprocessor(
            n_mels=config["n_mels"],
            n_dct_filters=config["n_dct_filters"],
            hop_ms=10,
            config=config)
        self.audio_preprocess_type = config["audio_preprocess_type"]
        self.n_mels = config["n_mels"]

    @staticmethod
    def default_config():
        """Return a new dict of default settings.

        The dict does not contain ``cache_size``, ``add_noise``,
        ``feature_type`` or ``type``, which other methods of this class read
        from the config; those must be supplied by the caller.
        """
        return {
            "group_speakers_by_id": True,
            "silence_prob": 0.1,
            "noise_prob": 0.8,
            "n_dct_filters": 40,
            "input_length": 16000,
            "n_mels": 40,
            "timeshift_ms": 100,
            "unknown_prob": 0.1,
            "train_pct": 80,
            "dev_pct": 10,
            "test_pct": 10,
            "wanted_words": ["command", "random"],
            "data_folder": "data/speech_dataset",
            "audio_preprocess_type": "MFCCs",
        }

    def collate_fn(self, data):
        """Turn a list of ``(audio, label)`` pairs into a feature batch.

        Args:
            data: Iterable of ``(audio_data, label)`` pairs.

        Returns:
            ``(x, y)`` where ``x`` is the per-item features concatenated
            along axis 0 (``None`` when no item produced features) and ``y``
            is ``torch.tensor`` of the labels.
        """
        # log_mel features use three times as many rows as n_mels.
        mult = 3 if self.config["feature_type"] == "log_mel" else 1
        features = []
        y = []
        for audio_data, label in data:
            if self.audio_preprocess_type == "MFCCs":
                # Each item is reshaped to (1, n_mels * mult, 101).
                mfccs = self.audio_processor.compute_mfccs(audio_data)
                features.append(torch.from_numpy(
                    mfccs.reshape(1, self.config["n_mels"] * mult, 101)))
            elif self.audio_preprocess_type == "PCEN":
                audio_tensor = torch.from_numpy(
                    np.expand_dims(audio_data, axis=0))
                features.append(
                    self.audio_processor.compute_pcen(audio_tensor))
            y.append(label)
        # Concatenate once instead of re-copying the growing batch for
        # every item.
        x = torch.cat(features, 0) if features else None
        return x, torch.tensor(y)

    def _timeshift_audio(self, data):
        """Shift ``data`` by a random offset, zero-filling the gap.

        The offset is drawn uniformly from ``±timeshift_ms`` converted to
        samples at 16 kHz. The returned array has the same length as the
        input.
        """
        max_shift = (16000 * self.timeshift_ms) // 1000
        shift = random.randint(-max_shift, max_shift)
        pad_left = -min(0, shift)
        pad_right = max(0, shift)
        padded = np.pad(data, (pad_left, pad_right), "constant")
        if pad_left:
            # Negative shift: zeros were prepended, drop the tail.
            return padded[:len(padded) - pad_left]
        # Positive (or zero) shift: zeros were appended, drop the head.
        return padded[pad_right:]

    def load_audio(self, example, silence=False):
        """Load one example, optionally time-shifted and mixed with noise.

        Args:
            example: Path of the audio file (ignored when ``silence``).
            silence: When true, an all-zero clip is produced and cached under
                the key ``"__silence__"``.

        Returns:
            The audio samples as a numpy array.
        """
        if silence:
            example = "__silence__"
        # With probability 0.7 a previously produced clip is reused, if one
        # is cached for this example.
        if random.random() < 0.7:
            try:
                return self._audio_cache[example]
            except KeyError:
                pass
        in_len = self.input_length
        if self.bg_noise_audio:
            bg_noise = random.choice(self.bg_noise_audio)
            # Clamp the upper bound: a noise clip no longer than in_len would
            # otherwise give randint an empty range and raise ValueError.
            max_start = max(0, len(bg_noise) - in_len - 1)
            start = random.randint(0, max_start)
            bg_noise = bg_noise[start:start + in_len]
            if len(bg_noise) < in_len:
                # Short clips are zero-padded so they can be mixed below.
                bg_noise = np.pad(
                    bg_noise, (0, in_len - len(bg_noise)), "constant")
        else:
            bg_noise = np.zeros(in_len)

        if silence:
            data = np.zeros(in_len)
        else:
            file_data = self._file_cache.get(example)
            if file_data is None:
                data = librosa.core.load(example, sr=16000)[0]
            else:
                data = file_data
            self._file_cache[example] = data
        # Clips shorter than in_len are zero-padded at the end; longer clips
        # are left as they are.
        data = np.pad(data, (0, max(0, in_len - len(data))), "constant")
        if self.set_type == DatasetType.TRAIN:
            data = self._timeshift_audio(data)
        if self.config["add_noise"]:
            if random.random() < self.noise_prob or silence:
                gain = random.random() * 0.1
                data = np.clip(gain * bg_noise + data, -1, 1)
        self._audio_cache[example] = data
        return data

    @classmethod
    def splits(cls, config):
        """Scan ``config["data_folder"]`` and build train/dev/test datasets.

        Each sub-folder is one class: folders named in ``wanted_words`` get
        labels 2, 3, ...; ``_background_noise_`` supplies noise clips; any
        other folder is treated as "unknown". Files are assigned to a split
        by hashing their name, so the assignment is stable across runs.

        Args:
            config: Mapping with at least ``data_folder``, ``wanted_words``,
                ``unknown_prob``, ``dev_pct``, ``test_pct``,
                ``group_speakers_by_id`` and ``type`` (plus everything
                ``__init__`` reads).

        Returns:
            A ``(train, dev, test)`` tuple of datasets.
        """
        folder = config["data_folder"]
        wanted_words = config["wanted_words"]
        unknown_prob = config["unknown_prob"]
        dev_pct = config["dev_pct"]
        test_pct = config["test_pct"]

        words = {word: i + 2 for i, word in enumerate(wanted_words)}
        words.update({cls.LABEL_SILENCE: 0, cls.LABEL_UNKNOWN: 1})
        unknown_label = words[cls.LABEL_UNKNOWN]
        sets = [{}, {}, {}]
        bg_noise_files = []
        unknown_files = []
        max_no_wavs = 2**27 - 1

        for folder_name in os.listdir(folder):
            path_name = os.path.join(folder, folder_name)
            if os.path.isfile(path_name):
                continue
            is_bg_noise = False
            # Reset per folder so the noise folder never sees a label left
            # over from the previous folder.
            label = None
            if folder_name in words:
                label = words[folder_name]
            elif folder_name == "_background_noise_":
                is_bg_noise = True
            else:
                label = unknown_label

            for filename in os.listdir(path_name):
                wav_name = os.path.join(path_name, filename)
                if is_bg_noise:
                    # Non-file entries in the noise folder are skipped; they
                    # have no label to be filed under.
                    if os.path.isfile(wav_name):
                        bg_noise_files.append(wav_name)
                    continue
                if label == unknown_label:
                    unknown_files.append(wav_name)
                    continue
                if config["group_speakers_by_id"]:
                    # Dropping the "_nohash_..." suffix makes files that
                    # share a prefix hash to the same split.
                    hashname = re.sub(r"_nohash_.*$", "", filename)
                else:
                    # Without grouping, hash the full file name (previously
                    # hashname was undefined on this path).
                    hashname = filename
                bucket = int(hashlib.sha1(hashname.encode()).hexdigest(), 16)
                bucket = (bucket % (max_no_wavs + 1)) * (100. / max_no_wavs)
                if bucket < dev_pct:
                    tag = DatasetType.DEV
                elif bucket < test_pct + dev_pct:
                    tag = DatasetType.TEST
                else:
                    tag = DatasetType.TRAIN
                if config["type"] == "eval":
                    # In eval mode every file goes to the third set.
                    sets[2][wav_name] = label
                elif config["type"] == "train":
                    sets[tag.value][wav_name] = label

        # Each split receives unknown_prob * (its size) unknown examples,
        # taken from consecutive slices of the shuffled unknown files.
        unknowns = [int(unknown_prob * len(subset)) for subset in sets]
        random.shuffle(unknown_files)
        start = 0
        for dataset, count in zip(sets, unknowns):
            stop = start + count
            dataset.update(
                {path: unknown_label for path in unknown_files[start:stop]})
            start = stop

        train_cfg = ChainMap(dict(bg_noise_files=bg_noise_files), config)
        test_cfg = ChainMap(
            dict(bg_noise_files=bg_noise_files, noise_prob=0), config)
        # NOTE(review): the test split is built from the raw `config`, not
        # `test_cfg`, so it does not get the noise files gathered here and
        # __init__ writes "bg_noise_files" into the caller's config. Kept
        # as-is; confirm whether `test_cfg` was intended.
        datasets = (cls(sets[0], DatasetType.TRAIN, train_cfg),
                    cls(sets[1], DatasetType.DEV, test_cfg),
                    cls(sets[2], DatasetType.TEST, config))
        return datasets

    def __getitem__(self, index):
        """Return ``(audio, label)`` for ``index``.

        Indices at or beyond the number of labelled files yield a silence
        clip with label 0.
        """
        if index >= len(self.audio_labels):
            return self.load_audio(None, silence=True), 0
        return (self.load_audio(self.audio_files[index]),
                self.audio_labels[index])

    def __len__(self):
        """Return the number of labelled files (silence clips not counted)."""
        return len(self.audio_labels)