Code example #1
File: service.py Project: Davezqq/honk-dingdang
class TorchLabelService(LabelService):
    def __init__(self, model_filename, no_cuda=False, labels=["_silence_", "_unknown_", "command", "random"]):
        self.labels = labels
        self.model_filename = model_filename
        self.no_cuda = no_cuda
        self.audio_processor = AudioPreprocessor()
        self.reload()

    def reload(self):
        config = model.find_config(model.ConfigType.EFFB0)
        config["n_labels"] = len(self.labels)
        self.model = model.SpeechEffModel(config)
        if not self.no_cuda:
            self.model.cuda()
        self.model.load(self.model_filename)
        self.model.eval()
        print(self.model)  # log the loaded model architecture

    def label(self, wav_data):
        """Labels audio data as one of the specified trained labels

        Args:
            wav_data: The WAVE to label

        Returns:
            A (most likely label, probability) tuple
        """
        wav_data = np.frombuffer(wav_data, dtype=np.int16) / 32768.
        model_in = torch.from_numpy(self.audio_processor.compute_mfccs(wav_data).squeeze(2)).unsqueeze(0)
        model_in = torch.autograd.Variable(model_in, requires_grad=False)
        if not self.no_cuda:
            model_in = model_in.cuda()
        predictions = F.softmax(self.model(model_in).squeeze(0).cpu(), dim=0).data.numpy()
        # return (self.labels[np.argmax(predictions)], np.max(predictions))
        return self.labels, predictions
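A minimal usage sketch for the class above (not part of the original project): the checkpoint path and WAV filename are hypothetical placeholders, and the file is assumed to be 16 kHz mono 16-bit PCM, which is what the frombuffer/32768 normalization in label() expects.

import wave

import numpy as np

def label_wav_file(path, service):
    # Read the raw 16-bit PCM frames; label() does the int16-to-float scaling.
    with wave.open(path, "rb") as f:
        wav_data = f.readframes(f.getnframes())
    return service.label(wav_data)

service = TorchLabelService("model/model.pt", no_cuda=True)
labels, probabilities = label_wav_file("example.wav", service)
print(labels[np.argmax(probabilities)], np.max(probabilities))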
Code example #2
File: service.py Project: Davezqq/honk-dingdang
class Caffe2LabelService(LabelService):
    def __init__(self, onnx_filename, labels):
        self.labels = labels
        self.model_filename = onnx_filename
        self.audio_processor = AudioPreprocessor()
        self._graph = onnx.load(onnx_filename)
        self._in_name = self._graph.graph.input[0].name
        self.model = onnx_caffe2.backend.prepare(self._graph)

    def label(self, wav_data):
        # Normalize 16-bit PCM samples to floats in [-1, 1].
        wav_data = np.frombuffer(wav_data, dtype=np.int16) / 32768.
        # Reshape the MFCCs to the NCHW layout the ONNX graph expects.
        model_in = np.expand_dims(self.audio_processor.compute_mfccs(wav_data).squeeze(2), 0)
        model_in = np.expand_dims(model_in, 0)
        model_in = model_in.astype(np.float32)
        predictions = _softmax(self.model.run({self._in_name: model_in})[0])
        return (self.labels[np.argmax(predictions)], np.max(predictions))
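label() above relies on a module-level _softmax helper that is not shown in this excerpt. A minimal, numerically stable sketch of what such a helper typically looks like (an assumption, not necessarily the project's actual implementation):

import numpy as np

def _softmax(x):
    # Subtract the max before exponentiating to avoid overflow.
    e = np.exp(x - np.max(x))
    return e / np.sum(e)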
Code example #3
File: service.py Project: dzungtx/honk
class HiKoovLabelService(LabelService):
    def __init__(self):
        self.labels = ["_silence_", "_unknown_", "hi_koov"]
        self.model_filename = 'model/model-0355-best.pt'
        self.audio_processor = AudioPreprocessor()
        self.reload()

    def reload(self):
        config = model.find_config(model.ConfigType.RES8_KOOV)
        self.model = model.SpeechResModel(config)
        self.model.load(self.model_filename)
        self.model.eval()

    def label(self, wav_data):
        mfccs = self.audio_processor.compute_mfccs(wav_data)
        model_in = torch.from_numpy(mfccs.squeeze(2)).unsqueeze(0)
        model_in = torch.autograd.Variable(model_in, requires_grad=False)
        model_out = self.model(model_in).squeeze(0).cpu()
        predictions = F.softmax(model_out, dim=0).data.numpy()
        return (self.labels[np.argmax(predictions)], np.max(predictions))
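Note that, unlike the two services above, this label() passes wav_data straight into compute_mfccs without an int16-to-float conversion, so the caller must supply an already-normalized float waveform. A minimal calling sketch (the filename is a hypothetical placeholder):

import librosa

service = HiKoovLabelService()
# librosa.load returns float32 samples in [-1, 1] at the requested rate.
samples, _ = librosa.load("hi_koov_sample.wav", sr=16000)
label, probability = service.label(samples)
print(label, probability)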
Code example #4
File: Dataset.py Project: genty1314/KWS
class SpeechDataset(data.Dataset):
    LABEL_SILENCE = "__silence__"  # class-level label constants
    LABEL_UNKNOWN = "__unknown__"

    def __init__(self, data, set_type, config):
        super().__init__()
        self.config = config
        self.audio_files = list(data.keys())
        self.set_type = set_type
        self.audio_labels = list(data.values())
        config["bg_noise_files"] = list(
            filter(lambda x: x.endswith("wav"),
                   config.get("bg_noise_files", [])))
        self.bg_noise_audio = [
            librosa.core.load(file, sr=16000)[0]
            for file in config["bg_noise_files"]
        ]
        self.unknown_prob = config["unknown_prob"]
        self.silence_prob = config["silence_prob"]
        self.noise_prob = config["noise_prob"]
        self.input_length = config["input_length"]
        self.timeshift_ms = config["timeshift_ms"]
        self._audio_cache = SimpleCache(config["cache_size"])
        self._file_cache = SimpleCache(config["cache_size"])
        n_unk = len(list(filter(lambda x: x == 1, self.audio_labels)))
        self.n_silence = int(self.silence_prob *
                             (len(self.audio_labels) - n_unk))
        self.audio_processor = AudioPreprocessor(
            n_mels=config["n_mels"],
            n_dct_filters=config["n_dct_filters"],
            hop_ms=10,
            config=config)
        self.audio_preprocess_type = config["audio_preprocess_type"]
        self.n_mels = config["n_mels"]

    @staticmethod
    def default_config():
        config = {}
        config["group_speakers_by_id"] = True
        config["silence_prob"] = 0.1
        config["noise_prob"] = 0.8
        config["n_dct_filters"] = 40
        config["input_length"] = 16000
        config["n_mels"] = 40
        config["timeshift_ms"] = 100
        config["unknown_prob"] = 0.1
        config["train_pct"] = 80
        config["dev_pct"] = 10
        config["test_pct"] = 10
        config["wanted_words"] = ["command", "random"]
        config["data_folder"] = "data/speech_dataset"
        config["audio_preprocess_type"] = "MFCCs"
        return config

    def collate_fn(self, data):
        x = None
        y = []
        mult = 1
        if self.config["feature_type"] == "log_mel":
            mult = 3
        # print("collate star:", time.time())
        for audio_data, label in data:  # (waveform, label) pairs
            if self.audio_preprocess_type == "MFCCs":
                audio_tensor = torch.from_numpy(
                    self.audio_processor.compute_mfccs(audio_data).reshape(
                        1, self.config["n_mels"] * mult,
                        101))  # shape: (1, n_mels * mult, 101 time frames)
                x = audio_tensor if x is None else torch.cat(
                    (x, audio_tensor), 0)
            elif self.audio_preprocess_type == "PCEN":
                audio_tensor = torch.from_numpy(
                    np.expand_dims(audio_data, axis=0))
                audio_tensor = self.audio_processor.compute_pcen(audio_tensor)
                x = audio_tensor if x is None else torch.cat(
                    (x, audio_tensor), 0)
            y.append(label)
        # print("collate end:", time.time())
        return x, torch.tensor(y)

    def _timeshift_audio(self, data):
        shift = (16000 * self.timeshift_ms) // 1000
        shift = random.randint(-shift, shift)
        a = -min(0, shift)
        b = max(0, shift)
        data = np.pad(data, (a, b), "constant")
        return data[:len(data) - a] if a else data[b:]
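    # Worked example of the shift arithmetic above (illustrative numbers):
    # with timeshift_ms=100 at 16 kHz the maximum shift is 1600 samples. A
    # draw of shift=-400 gives a=400, b=0, so 400 zeros are prepended and the
    # last 400 samples dropped (audio delayed); a positive shift advances it.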

    def load_audio(self, example, silence=False):
        if silence:
            example = "__silence__"
        if random.random() < 0.7:  # serve a cached clip 70% of the time
            try:
                return self._audio_cache[example]
            except KeyError:
                pass
        in_len = self.input_length
        if self.bg_noise_audio:
            bg_noise = random.choice(self.bg_noise_audio)
            a = random.randint(0, len(bg_noise) - in_len - 1)
            bg_noise = bg_noise[a:a + in_len]
        else:
            bg_noise = np.zeros(in_len)

        if silence:
            data = np.zeros(in_len)
        else:
            # augmenter = Compose([
            #     AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            #     # TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
            #     PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            #     Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
            # ])
            file_data = self._file_cache.get(example)
            data = librosa.core.load(
                example, sr=16000)[0] if file_data is None else file_data
            # data = augmenter(samples=data, sample_rate=16000)
            self._file_cache[example] = data
        data = np.pad(data, (0, max(0, in_len - len(data))), "constant")
        if self.set_type == DatasetType.TRAIN:
            data = self._timeshift_audio(data)
        if self.config["add_noise"]:
            if random.random() < self.noise_prob or silence:
                a = random.random() * 0.1
                data = np.clip(a * bg_noise + data, -1, 1)
        # data = np.clip(data, -1, 1)

        self._audio_cache[example] = data
        return data

    @classmethod
    def splits(cls, config):
        folder = config["data_folder"]  # data/speech_dataset
        wanted_words = config[
            "wanted_words"]  # ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
        unknown_prob = config["unknown_prob"]  # 0.1
        train_pct = config["train_pct"]  # 80
        dev_pct = config["dev_pct"]  # 10
        test_pct = config["test_pct"]  # 10

        words = {word: i + 2 for i, word in enumerate(wanted_words)}
        # {'yes': 2, 'no': 3, 'up': 4, 'down': 5, 'left': 6, 'right': 7, 'on': 8, 'off': 9, 'stop': 10, 'go': 11}
        words.update({cls.LABEL_SILENCE: 0, cls.LABEL_UNKNOWN: 1})
        sets = [{}, {}, {}]
        unknowns = [0] * 3
        bg_noise_files = []
        unknown_files = []

        for folder_name in os.listdir(folder):
            path_name = os.path.join(folder,
                                     folder_name)  # data/speech_dataset/yes
            is_bg_noise = False
            if os.path.isfile(path_name):
                continue
            if folder_name in words:
                label = words[folder_name]
            elif folder_name == "_background_noise_":
                is_bg_noise = True
            else:
                label = words[cls.LABEL_UNKNOWN]

            for filename in os.listdir(path_name):
                wav_name = os.path.join(
                    path_name,
                    filename)  # data/speech_dataset/down/00b01445_nohash_1.wav
                if is_bg_noise and os.path.isfile(wav_name):
                    bg_noise_files.append(wav_name)
                    continue
                elif label == words[cls.LABEL_UNKNOWN]:
                    # any folder outside wanted_words is treated as unknown
                    unknown_files.append(wav_name)
                    continue
                if config["group_speakers_by_id"]:
                    hashname = re.sub(r"_nohash_.*$", "", filename)
                max_no_wavs = 2**27 - 1
                bucket = int(hashlib.sha1(hashname.encode()).hexdigest(), 16)
                # hash values  hexdigest() return 16 jinzhi
                bucket = (bucket % (max_no_wavs + 1)) * (100. / max_no_wavs)
                if bucket < dev_pct:
                    tag = DatasetType.DEV  # TRAIN = 0, DEV = 1, TEST = 2
                elif bucket < test_pct + dev_pct:  # dev_pct = 10, test_pct = 10, train_pct = 80
                    tag = DatasetType.TEST
                else:
                    tag = DatasetType.TRAIN
                if config["type"] == "eval":
                    sets[2][wav_name] = label
                elif config["type"] == "train":
                    sets[tag.value][wav_name] = label
                #  sets = [
                # train  {'00b01445_nohash_1': 1, },  length = 16696
                # dev    {'00b01443_nohash_1': 2, },  length = 2316
                # test   {'00b01441_nohash_1': 3, }   length = 2311
                #  ]

        for tag in range(len(sets)):
            # number of unknown examples per split, proportional to split size
            unknowns[tag] = int(unknown_prob * len(sets[tag]))
        random.shuffle(unknown_files)
        a = 0
        # carve consecutive slices of the shuffled unknown_files into the
        # train/dev/test splits
        for i, dataset in enumerate(sets):
            b = a + unknowns[i]
            unk_dict = {
                u: words[cls.LABEL_UNKNOWN]
                for u in unknown_files[a:b]
            }
            dataset.update(unk_dict)
            a = b
        train_cfg = ChainMap(dict(bg_noise_files=bg_noise_files), config)
        test_cfg = ChainMap(dict(bg_noise_files=bg_noise_files, noise_prob=0),
                            config)
        # print(test_cfg)
        datasets = (cls(sets[0], DatasetType.TRAIN, train_cfg),
                    cls(sets[1], DatasetType.DEV, test_cfg),
                    cls(sets[2], DatasetType.TEST, config))
        return datasets

    def __getitem__(self, index):
        if index >= len(self.audio_labels):
            return self.load_audio(None, silence=True), 0
        return self.load_audio(
            self.audio_files[index]), self.audio_labels[index]

    def __len__(self):
        # silence examples are disabled here; with them enabled this would be
        # len(self.audio_labels) + self.n_silence
        return len(self.audio_labels)
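A minimal sketch of wiring SpeechDataset into a PyTorch DataLoader via splits() and collate_fn. The class also reads the keys "type", "feature_type", "add_noise", and "cache_size", which default_config() does not set, so the values below are assumptions, and "data_folder" is assumed to point at an existing speech_commands-style directory.

from torch.utils import data

config = SpeechDataset.default_config()
# Keys consumed by the class but absent from default_config(); assumed values.
config["type"] = "train"
config["feature_type"] = "MFCCs"
config["add_noise"] = True
config["cache_size"] = 32768

train_set, dev_set, test_set = SpeechDataset.splits(config)
train_loader = data.DataLoader(train_set,
                               batch_size=64,
                               shuffle=True,
                               collate_fn=train_set.collate_fn)
for batch_x, batch_y in train_loader:
    pass  # feed batches to the model's training step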