def _perform_speech_activity_detection(self):
    """
    Pick the speech activity detection source from the config and run it.

    The sources are tried in this order:
      1. NeMo VAD model (``self.has_vad_model``): the diarizer manifest is
         passed through ``prepare_manifest`` and then fed to the VAD model.
      2. External VAD manifest (``vad.external_vad_manifest``): used as the
         speaker manifest as-is.
      3. Oracle VAD (``oracle_vad``): a manifest is written from
         ``self.AUDIO_RTTM_MAP`` via ``write_rttm2manifest``.

    Raises:
        ValueError: if none of the three sources is configured.
    """
    if self.has_vad_model:
        # Auto-splitting is always switched on here, with a fixed split_duration of 50.
        self._dont_auto_split = False
        self._split_duration = 50
        vad_manifest = self._diarizer_params.manifest_filepath

        if self._dont_auto_split:
            logging.warning(
                "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
            )
        else:
            logging.info("Split long audio file to avoid CUDA memory issue")
            logging.debug("Try smaller split_duration if you still have CUDA memory issue")
            vad_manifest = prepare_manifest(
                {
                    'manifest_filepath': vad_manifest,
                    'time_length': self._vad_window_length_in_sec,
                    'split_duration': self._split_duration,
                    'num_workers': self._cfg.num_workers,
                }
            )

        self._setup_vad_test_data(vad_manifest)
        self._run_vad(vad_manifest)
        return

    if self._diarizer_params.vad.external_vad_manifest is not None:
        self._speaker_manifest_path = self._diarizer_params.vad.external_vad_manifest
        return

    if self._diarizer_params.oracle_vad:
        # The target path is stored first, then replaced by whatever the writer returns.
        self._speaker_manifest_path = os.path.join(self._speaker_dir, 'oracle_vad_manifest.json')
        self._speaker_manifest_path = write_rttm2manifest(self.AUDIO_RTTM_MAP, self._speaker_manifest_path)
        return

    raise ValueError(
        "Only one of diarizer.oracle_vad, vad.model_path or vad.external_vad_manifest must be passed"
    )
def diarize(self, paths2audio_files: List[str] = None, batch_size: int = 1):
    """
    Run the diarization pipeline: resolve the audio list, obtain speech
    segments (VAD model or an existing speaker manifest), extract speaker
    embeddings, and write predicted RTTMs under ``<out_dir>/pred_rttms``.

    Args:
        paths2audio_files: either a list of audio file paths, or the path to
            a text file with one audio path per line. When empty or None,
            ``cfg.diarizer.paths2audio_files`` is used instead.
        batch_size: accepted for interface compatibility; not used in this method.

    Raises:
        ValueError: if no audio paths are given by argument or config, or if
            the resolved value is neither a list nor a path to an existing file.
        NotFoundError: if no VAD model is used and the speaker manifest is missing.
    """
    if paths2audio_files:
        self.paths2audio_files = paths2audio_files
    elif self._cfg.diarizer.paths2audio_files is None:
        raise ValueError("Pass path2audio files either through config or to diarize method")
    else:
        self.paths2audio_files = self._cfg.diarizer.paths2audio_files

    if isinstance(self.paths2audio_files, str) and os.path.isfile(self.paths2audio_files):
        with open(self.paths2audio_files, 'r') as path2file:
            stripped_lines = (line.strip() for line in path2file)
            # Skip blank lines (e.g. a trailing newline) so they do not become empty audio paths.
            paths2audio_files = [audiofile for audiofile in stripped_lines if audiofile]
    elif isinstance(self.paths2audio_files, (list, ListConfig)):
        paths2audio_files = list(self.paths2audio_files)
    else:
        raise ValueError("paths2audio_files must be of type list or path to file containing audio files")

    self.AUDIO_RTTM_MAP = audio_rttm_map(paths2audio_files, self._cfg.diarizer.path2groundtruth_rttm_files)

    if self.has_vad_model:
        logging.info("Performing VAD")
        mfst_file = self.path2audio_files_to_manifest(paths2audio_files)
        # Auto-splitting is always switched on here, with a fixed split_duration of 50.
        self._dont_auto_split = False
        self._split_duration = 50
        manifest_vad_input = mfst_file
        if not self._dont_auto_split:
            logging.info("Split long audio file to avoid CUDA memory issue")
            logging.debug("Try smaller split_duration if you still have CUDA memory issue")
            config = {
                'manifest_filepath': mfst_file,
                'time_length': self._vad_window_length_in_sec,
                'split_duration': self._split_duration,
                'num_workers': self._cfg.num_workers,
            }
            manifest_vad_input = prepare_manifest(config)
        else:
            logging.warning(
                "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
            )
        self._setup_vad_test_data(manifest_vad_input)
        self._run_vad(manifest_vad_input)
    elif not os.path.exists(self._speaker_manifest_path):
        # NOTE(review): NotFoundError is not a Python builtin; confirm it is imported in this module.
        raise NotFoundError("Oracle VAD based manifest file not found")

    # presumably _run_vad sets self._speaker_manifest_path in the VAD branch; verify against its definition
    self._extract_embeddings(self._speaker_manifest_path)

    out_rttm_dir = os.path.join(self._out_dir, 'pred_rttms')
    os.makedirs(out_rttm_dir, exist_ok=True)

    perform_diarization(
        embeddings_file=self._embeddings_file,
        reco2num=self._num_speakers,
        manifest_path=self._speaker_manifest_path,
        sample_rate=self._cfg.sample_rate,
        window=self._cfg.diarizer.speaker_embeddings.window_length_in_sec,
        shift=self._cfg.diarizer.speaker_embeddings.shift_length_in_sec,
        audio_rttm_map=self.AUDIO_RTTM_MAP,
        out_rttm_dir=out_rttm_dir,
        max_num_speakers=self.max_num_speakers,
    )
def _str_to_bool(value):
    """Convert a command-line string such as 'true'/'False'/'1' to a bool for argparse."""
    # Imported locally so this helper does not rely on the file's top-level imports.
    from argparse import ArgumentTypeError

    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', 'on', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', 'off', '0', ''):
        return False
    raise ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def main():
    """
    Command-line entry point: run streaming VAD inference over a manifest and
    append frame-level speech probabilities to ``<out_dir>/<audio name>.frame``.

    The input manifest is optionally split into shorter entries (see
    ``--split_duration`` / ``--dont_auto_split``) before inference. Each
    dataloader batch is matched by position to one line of the manifest that
    was actually fed to the model.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--vad_model", type=str, default="MatchboxNet-VAD-3x2", required=False, help="Pass: '******'"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Path of json file of evaluation data. Audio files should have unique names.",
    )
    parser.add_argument("--out_dir", type=str, default="vad_frame", help="Dir of your vad outputs")
    parser.add_argument("--time_length", type=float, default=0.63)
    parser.add_argument("--shift_length", type=float, default=0.01)
    # type=bool would turn any non-empty string (including "False") into True.
    parser.add_argument("--normalize_audio", type=_str_to_bool, default=False)
    # A worker count has to be an integer; type=float produced e.g. 4.0 for "--num_workers 4".
    parser.add_argument("--num_workers", type=int, default=20)
    parser.add_argument("--split_duration", type=float, default=400)
    parser.add_argument(
        "--dont_auto_split",
        default=False,
        action='store_true',
        help="Whether to automatically split manifest entry by split_duration to avoid potential CUDA out of memory issue.",
    )
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    if args.vad_model.endswith('.nemo'):
        logging.info(f"Using local VAD model from {args.vad_model}")
        vad_model = EncDecClassificationModel.restore_from(restore_path=args.vad_model)
    else:
        logging.info(f"Using NGC cloud VAD model {args.vad_model}")
        vad_model = EncDecClassificationModel.from_pretrained(model_name=args.vad_model)

    # makedirs also handles nested output paths, which os.mkdir does not.
    os.makedirs(args.out_dir, exist_ok=True)

    # Prepare manifest for streaming VAD
    manifest_vad_input = args.dataset
    if not args.dont_auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug("Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'manifest_filepath': manifest_vad_input,
            'time_length': args.time_length,
            'split_duration': args.split_duration,
            'num_workers': args.num_workers,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
        )

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': ['infer',],
            'num_workers': args.num_workers,
            'shuffle': False,
            'time_length': args.time_length,
            'shift_length': args.shift_length,
            'trim_silence': False,
            'normalize_audio': args.normalize_audio,
        }
    )

    # NOTE(review): `device` and `autocast` are expected to be defined at module level.
    vad_model = vad_model.to(device)
    vad_model.eval()

    time_unit = int(args.time_length / args.shift_length)
    trunc = int(time_unit / 2)
    trunc_l = time_unit - trunc
    all_len = 0

    # The dataloader iterates manifest_vad_input, so the per-batch names and
    # statuses must come from that same manifest. Reading args.dataset here
    # misaligned data[i]/status[i] whenever auto-split produced extra entries.
    # presumably prepare_manifest returns a manifest path with 'audio_filepath' entries — verify
    data = []
    with open(manifest_vad_input, 'r') as manifest:
        for line in manifest:
            file = json.loads(line)['audio_filepath'].split("/")[-1]
            data.append(file.split(".wav")[0])
    logging.info(f"Inference on {len(data)} audio files/json lines!")

    status = get_vad_stream_status(data)
    for i, test_batch in enumerate(vad_model.test_dataloader()):
        test_batch = [x.to(device) for x in test_batch]
        with autocast():
            log_probs = vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1])
            probs = torch.softmax(log_probs, dim=-1)
            pred = probs[:, 1]

            # Trim the edges according to where this entry sits within its audio file.
            if status[i] == 'start':
                to_save = pred[:-trunc]
            elif status[i] == 'next':
                to_save = pred[trunc:-trunc_l]
            elif status[i] == 'end':
                to_save = pred[trunc_l:]
            else:
                to_save = pred
            all_len += len(to_save)

            # Append mode: consecutive entries of the same audio file share one output file.
            outpath = os.path.join(args.out_dir, data[i] + ".frame")
            with open(outpath, "a") as fout:
                for frame_prob in to_save:
                    fout.write('{0:0.4f}\n'.format(frame_prob))

        del test_batch
        if status[i] == 'end' or status[i] == 'single':
            logging.debug(f"Overall length of prediction of {data[i]} is {all_len}!")
            all_len = 0
def main(cfg):
    """
    Config-driven VAD pipeline: frame-level prediction, optional overlap
    smoothing, optional segment-table generation, and optional manifest output.

    Args:
        cfg: configuration object. This function reads ``dataset``,
            ``prepare_manifest``, ``vad``, ``num_workers``, ``frame_out_dir``,
            ``gen_overlap_seq``, ``smoothing_out_dir``, ``gen_seg_table``,
            ``table_out_dir``, ``write_to_manifest``, ``out_manifest_filepath``
            and ``prepared_manfiest_vad_input`` from it.

    Raises:
        ValueError: if ``cfg.dataset`` is empty, if ``write_to_manifest`` is
            requested without ``gen_seg_table``, or if two manifest lines
            share the same audio file name (without extension).
    """
    if not cfg.dataset:
        raise ValueError("You must input the path of json file of evaluation data")

    # The manifest entries point at files inside the segment-table directory,
    # which only exists when gen_seg_table runs. Fail before any inference
    # instead of hitting an unbound `table_out_dir` at the very end.
    if cfg.write_to_manifest and not cfg.gen_seg_table:
        raise ValueError("write_to_manifest requires gen_seg_table to be enabled")

    # Every manifest line must have a distinct audio file name (basename without
    # extension), because output files are keyed by that name.
    key_meta_map = {}
    with open(cfg.dataset, 'r') as manifest:
        for line in manifest:
            audio_filepath = json.loads(line.strip())['audio_filepath']
            uniq_audio_name = audio_filepath.split('/')[-1].rsplit('.', 1)[0]
            if uniq_audio_name in key_meta_map:
                raise ValueError("Please make sure each line is with different audio_filepath! ")
            key_meta_map[uniq_audio_name] = {'audio_filepath': audio_filepath}

    # Prepare manifest for streaming VAD
    manifest_vad_input = cfg.dataset
    if cfg.prepare_manifest.auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug("Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'input': manifest_vad_input,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'split_duration': cfg.prepare_manifest.split_duration,
            'num_workers': cfg.num_workers,
            'prepared_manfiest_vad_input': cfg.prepared_manfiest_vad_input,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
        )

    torch.set_grad_enabled(False)
    vad_model = init_vad_model(cfg.vad.model_path)

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': ['infer',],
            'num_workers': cfg.num_workers,
            'shuffle': False,
            'window_length_in_sec': cfg.vad.parameters.window_length_in_sec,
            'shift_length_in_sec': cfg.vad.parameters.shift_length_in_sec,
            'trim_silence': False,
            'normalize_audio': cfg.vad.parameters.normalize_audio,
        }
    )

    # NOTE(review): `device` is expected to be defined at module level.
    vad_model = vad_model.to(device)
    vad_model.eval()

    if not os.path.exists(cfg.frame_out_dir):
        # makedirs also handles nested output paths, which os.mkdir does not.
        os.makedirs(cfg.frame_out_dir, exist_ok=True)
    else:
        logging.warning(
            "Note frame_out_dir exists. If new file has same name as file inside existing folder, it will append result to existing file and might cause mistakes for next steps."
        )

    logging.info("Generating frame level prediction ")
    pred_dir = generate_vad_frame_pred(
        vad_model=vad_model,
        window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
        shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
        manifest_vad_input=manifest_vad_input,
        out_dir=cfg.frame_out_dir,
    )
    logging.info(
        f"Finish generating VAD frame level prediction with window_length_in_sec={cfg.vad.parameters.window_length_in_sec} and shift_length_in_sec={cfg.vad.parameters.shift_length_in_sec}"
    )

    # overlap smoothing filter
    if cfg.gen_overlap_seq:
        # Generate predictions with overlapping input segments. Then a smoothing filter is applied
        # to decide the label for a frame spanned by multiple segments.
        # smoothing_method would be either in majority vote (median) or average (mean)
        logging.info("Generating predictions with overlapping input segments")
        smoothing_pred_dir = generate_overlap_vad_seq(
            frame_pred_dir=pred_dir,
            smoothing_method=cfg.vad.parameters.smoothing,
            overlap=cfg.vad.parameters.overlap,
            window_length_in_sec=cfg.vad.parameters.window_length_in_sec,
            shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.smoothing_out_dir,
        )
        logging.info(
            f"Finish generating predictions with overlapping input segments with smoothing_method={cfg.vad.parameters.smoothing} and overlap={cfg.vad.parameters.overlap}"
        )
        # Later stages consume the smoothed predictions instead of the raw frames.
        pred_dir = smoothing_pred_dir

    # postprocessing and generate speech segments
    if cfg.gen_seg_table:
        logging.info("Converting frame level prediction to speech/no-speech segment in start and end times format.")
        table_out_dir = generate_vad_segment_table(
            vad_pred_dir=pred_dir,
            postprocessing_params=cfg.vad.parameters.postprocessing,
            shift_length_in_sec=cfg.vad.parameters.shift_length_in_sec,
            num_workers=cfg.num_workers,
            out_dir=cfg.table_out_dir,
        )
        logging.info(
            f"Finish generating speech semgents table with postprocessing_params: {cfg.vad.parameters.postprocessing}"
        )

    if cfg.write_to_manifest:
        # table_out_dir is guaranteed to be bound here by the up-front validation.
        for audio_name, meta in key_meta_map.items():
            meta['rttm_filepath'] = os.path.join(table_out_dir, audio_name + ".txt")

        if not cfg.out_manifest_filepath:
            out_manifest_filepath = "vad_out.json"
        else:
            out_manifest_filepath = cfg.out_manifest_filepath
        out_manifest_filepath = write_rttm2manifest(key_meta_map, out_manifest_filepath)
        logging.info(f"Writing VAD output to manifest: {out_manifest_filepath}")
def main():
    """
    Command-line entry point: collect ``.wav`` files and build a VAD input
    manifest via ``prepare_manifest``.

    Audio files are taken either from a recursive walk of ``--inp_dir`` or,
    when ``--inp_list`` is given, from the names listed in that file (each
    resolved as ``<inp_dir>/<name>.wav``). The manifest is written to
    ``<out_dir>/<manifest_name>.json``.
    """
    parser = ArgumentParser()
    parser.add_argument("--inp_dir", type=str, required=True, help="(full path) folder of files to be processed")
    parser.add_argument(
        "--inp_list", type=str, help="(full path) a file contains NAME of files inside inp_dir to be processed"
    )
    parser.add_argument(
        "--out_dir", type=str, default=".", help="(full path) location to store generated json file"
    )
    parser.add_argument("--manifest_name", type=str, default="generated_manifest", help="name of generated json file")
    parser.add_argument("--split_duration", type=int, required=True, help="max duration of each audio clip/line")
    parser.add_argument(
        "--window_length_in_sec",
        type=float,
        default=0.63,
        help="window length in sec for VAD context input , default is 0.63s",
    )
    parser.add_argument("--num_workers", type=int, default=4, help="number of workers for multiprocessing")
    args = parser.parse_args()

    if not args.inp_list:
        input_audios = [
            os.path.join(root, basename)
            for root, _dirs, files in os.walk(args.inp_dir)
            for basename in files
            if basename.endswith('.wav')
        ]
    else:
        # ndmin=1 keeps a one-line list file iterable; without it np.loadtxt
        # returns a 0-d array and the loop below raises TypeError.
        name_list = np.loadtxt(args.inp_list, dtype='str', ndmin=1)
        input_audios = [os.path.join(args.inp_dir, name + ".wav") for name in name_list]

    input_list = [{'audio_filepath': path, "offset": 0, "duration": None} for path in input_audios]

    logging.info(f"Number of wav files to be processed: {len(input_audios)}")

    # Create the output directory up front so a missing --out_dir cannot fail the final write.
    # presumably prepare_manifest writes to 'prepared_manfiest_vad_input' — verify against its definition
    os.makedirs(args.out_dir, exist_ok=True)
    output_path = os.path.join(args.out_dir, args.manifest_name + '.json')

    logging.info("Split long audio file to avoid CUDA memory issue")
    logging.debug("Try smaller split_duration if you still have CUDA memory issue")
    config = {
        'input': input_list,
        'window_length_in_sec': args.window_length_in_sec,
        'split_duration': args.split_duration,
        'num_workers': args.num_workers,
        'prepared_manfiest_vad_input': output_path,
    }
    manifest_vad_input = prepare_manifest(config)
    logging.info(f"Done! Save to {manifest_vad_input}")