def extract_ivecs(self):
    """ Extract normalization i-vectors and average them per speaker.

        File names are read from `self.norm_list`; a line may carry a second
        whitespace-separated token (number of speakers) which is dropped
        before processing. The per-speaker i-vectors accumulated by
        `self.process_file` are mean-pooled, optionally dumped as h5 files
        into `self.out_ivec_dir`, and returned as one matrix.

        Returns:
            np.array: averaged normalization i-vectors, one row per speaker
    """
    speakers_dict = {}
    with open(self.norm_list, 'r') as f:
        for line in f:
            if len(line.split()) > 1:  # number of speakers is defined
                line = line.split()[0]
            # NOTE(review): a line without a speaker count keeps its
            # trailing newline here — confirm process_file tolerates that
            speakers_dict = self.process_file(line, speakers_dict)

    # average all i-vectors collected for each speaker
    for dict_key in speakers_dict:
        speakers_dict[dict_key] = np.mean(speakers_dict[dict_key], axis=0)

    if self.out_ivec_dir:
        for speaker in speakers_dict:
            Utils.mkdir_p(
                os.path.join(self.out_ivec_dir, os.path.dirname(speaker)))
            h5file = h5py.File(
                '{}.{}'.format(os.path.join(self.out_ivec_dir, speaker),
                               'h5'), 'w')
            h5file.create_dataset(speaker, data=speakers_dict[speaker])
            h5file.close()

    # BUG FIX: dict.values() is a view on Python 3; np.array() on a view
    # yields a 0-d object array, so materialize it into a list first
    return np.array(list(speakers_dict.values()))
def dump_rttm(self, scores, out_dir):
    """ Write one RTTM file per i-vector set with the winning speaker index
        for every window.

        Args:
            scores (dict): per-file score matrices, keyed by i-vector set name
            out_dir (str): directory the `.rttm` files are written into
    """
    for ivecset in self.ivecs:
        if ivecset.size() <= 0:
            logwarning(
                '[Diarization.dump_rttm] No i-vectors to dump in {}.'.
                format(ivecset.name))
            continue
        name = ivecset.name
        # recording name is everything before the first '/'
        reg_name = re.sub('/.*', '', ivecset.name)
        Utils.mkdir_p(os.path.join(out_dir, os.path.dirname(name)))
        with open(os.path.join(out_dir, name + '.rttm'), 'w') as f:
            for i, ivec in enumerate(ivecset.ivecs):
                start = ivec.window_start
                end = ivec.window_end
                # speaker label = argmax over this window's scores
                idx = np.argmax(scores[name].T[i])
                f.write(
                    'SPEAKER {} 1 {} {} <NA> <NA> {}_spkr_{} <NA>\n'.
                    format(reg_name, float(start / 1000.0),
                           float((end - start) / 1000.0), reg_name, idx))
def score(self, test, enroll):
    """ Score i-vectors against each other using the PLDA model.

        Both sides are mean-centered by `self.mu` and l2-normalized, then
        turned into sufficient statistics; scoring is batched over each
        unique combination of session counts.

        Args:
            test (np.array): 2D array of test i-vectors, one per row
            enroll (np.array): 2D array of enrollment i-vectors, one per row

        Returns:
            np.array: 2D score matrix, transposed so that rows follow
                `enroll` and columns follow `test` (assuming prepare_stats
                keeps one stats row per input row — confirm)
    """
    # center and length-normalize both sides
    test = test - self.mu
    enroll = enroll - self.mu
    test = Utils.l2_norm(test)
    enroll = Utils.l2_norm(enroll)

    Tstats = self.prepare_stats(test)
    tstats = self.prepare_stats(enroll)

    # create scores; removed unused `(a, b) = scores.shape` from original
    scores = np.zeros((len(Tstats.N), len(tstats.N)), 'f')

    # score each unique combination of N enroll and M test sessions at once
    for n_enroll_sessions in np.unique(Tstats.N):
        idxs_T = np.where(Tstats.N == n_enroll_sessions)[0]
        for n_test_sessions in np.unique(tstats.N):
            idxs_t = np.where(tstats.N == n_test_sessions)[0]
            scores[np.ix_(idxs_T, idxs_t)] = self.score_with_constant_n(
                n_enroll_sessions, Tstats.F[idxs_T, :].T, n_test_sessions,
                tstats.F[idxs_t, :].T)
    return scores.T
def process_file(wav_dir, vad_dir, out_dir, file_name, fea2ivec_obj, max_size,
                 tolerance, wav_suffix='wav', vad_suffix='lab.gz'):
    """ Process single audio file.

        Args:
            wav_dir (str): directory with wav files
            vad_dir (str): directory with vad files
            out_dir (str): output directory; if None, the extracted i-vector
                set is returned instead of being pickled to disk
            file_name (str): name of the file; may carry the number of
                speakers as a second whitespace-separated token
            fea2ivec_obj (Fea2Ivec): input models for i-vector extraction
            max_size (int): maximal size of window in ms
            tolerance (int): accept given number of frames as speech even
                when it is marked as silence
            wav_suffix (str): suffix of wav files
            vad_suffix (str): suffix of vad files

        Returns:
            IvecSet|None: the extracted i-vector set when `out_dir` is None,
                otherwise None (the set is saved to disk instead)

        Raises:
            ValueError: if the audio is not mono or a VAD segment exceeds
                the feature matrix length
    """
    loginfo('Processing file {} ...'.format(file_name.split()[0]))
    num_speakers = None
    if len(file_name.split()) > 1:  # number of speakers is defined
        file_name, num_speakers = file_name.split()[0], int(
            file_name.split()[1])
    wav = '{}.{}'.format(os.path.join(wav_dir, file_name), wav_suffix)
    rate, sig = read(wav)
    if len(sig.shape) != 1:
        raise ValueError('Expected mono as input audio.')
    if rate != RATE:
        logwarning(
            'The input file is expected to be in 8000 Hz, got {} Hz instead, resampling.'
            .format(rate))
        # BUG FIX: scipy.signal.resample takes the target *number of
        # samples*, not the target rate — passing RATE squashed any file
        # longer than one second. Scale the sample count by the rate ratio
        # so the duration is preserved.
        sig = signal.resample(sig, int(round(len(sig) * RATE / float(rate))))

    fea_extractor = Features()
    fea = fea_extractor(sig)
    vad, n_regions, n_frames = get_vad(
        '{}.{}'.format(os.path.join(vad_dir, file_name), vad_suffix),
        len(fea))

    ivec_set = IvecSet()
    ivec_set.name = file_name
    ivec_set.num_speakers = num_speakers
    for seg in get_segments(vad, max_size, tolerance):
        start, end = get_num_segments(seg[0]), get_num_segments(seg[1])
        if seg[0] > fea.shape[0] - 1 or seg[1] > fea.shape[0] - 1:
            raise ValueError(
                'Unexpected features dimensionality - check VAD input or audio.'
            )
        w = fea2ivec_obj.get_ivec(fea[seg[0]:seg[1]])
        ivec_set.add(w, start, end, mfccs=fea)
    if out_dir is not None:
        Utils.mkdir_p(os.path.join(out_dir, os.path.dirname(file_name)))
        ivec_set.save(os.path.join(out_dir, '{}.pkl'.format(file_name)))
    else:
        return ivec_set
def score_ivec(self, min_length, max_num_speakers, num_threads):
    """ Cluster i-vectors per file and score them against the centroids.

        Args:
            min_length (int): minimal length of segment used for clustering
                in miliseconds
            max_num_speakers (int): maximal number of speakers
            num_threads (int): number of threads to use

        Returns:
            dict: dictionary with scores for each file
    """
    scores_dict = {}
    for ivecset in self.ivecs:
        name = os.path.normpath(ivecset.name)
        ivecs_all = ivecset.get_all()
        ivecs_long = ivecset.get_longer(min_length)
        loginfo('Scoring {} ...'.format(name))
        if ivecset.size() <= 0:
            logwarning('No i-vectors to score in {}.'.format(ivecset.name))
            continue

        # determine speaker count: given a priori, or estimated via x-means
        known_speakers = ivecset.num_speakers is not None
        if known_speakers:
            num_speakers = ivecset.num_speakers
        else:
            xm = xmeans(ivecs_long, kmax=max_num_speakers)
            xm.process()
            num_speakers = len(xm.get_clusters())

        sklearnkmeans = sklearnKMeans(
            n_clusters=num_speakers, n_init=100,
            n_jobs=num_threads).fit(ivecs_long)
        centroids = sklearnkmeans.cluster_centers_
        # PLDA-based refinement only when the speaker count was given
        if known_speakers and self.plda is not None:
            centroids = PLDAKMeans(sklearnkmeans.cluster_centers_,
                                   num_speakers, self.plda).fit(ivecs_long)

        if self.norm is not None:
            ivecs_all = Utils.l2_norm(ivecs_all)
            centroids = Utils.l2_norm(centroids)
            scores_dict[name] = self.norm.s_norm(ivecs_all, centroids)
        elif self.plda is None:
            ivecs_all = Utils.l2_norm(ivecs_all)
            centroids = Utils.l2_norm(centroids)
            scores_dict[name] = cosine_similarity(ivecs_all, centroids).T
        else:
            scores_dict[name] = self.plda.score(ivecs_all, centroids)
    return scores_dict
def __init__(self, input_list, embeddings, embeddings_mean=None, lda=None,
             use_l2_norm=True, norm=None, plda=None):
    """ Initialize diarization class.

        Args:
            input_list (string_types): path to list of input files
            embeddings (string_types|List[EmbeddingSet]): path to directory
                containing embeddings or list of EmbeddingSet instances
            embeddings_mean (np.ndarray): mean subtracted from every embedding
            lda (np.ndarray): linear discriminant analysis - dimensionality
                reduction
            use_l2_norm (bool): do l2 normalization
            norm (Normalization): instance of class Normalization
            plda (GPLDA): instance of class GPLDA
    """
    self.input_list = input_list
    if isinstance(embeddings, str):
        # a directory path was given: load the sets from disk
        self.embeddings_dir = embeddings
        self.embeddings = list(self.load_embeddings())
    else:
        self.embeddings = embeddings
    self.lda = lda
    self.use_l2_norm = use_l2_norm
    self.norm = norm
    self.plda = plda

    # apply mean subtraction, LDA projection and l2-norm to each embedding
    for embedding in (emb for emb_set in self.embeddings
                      for emb in emb_set):
        if embeddings_mean is not None:
            embedding.data = embedding.data - embeddings_mean
        if lda is not None:
            embedding.data = embedding.data.dot(lda)
        if use_l2_norm:
            embedding.data = Utils.l2_norm(
                embedding.data[np.newaxis, :]).flatten()

    # the normalization set must undergo the same transformations
    if self.norm:
        assert embeddings_mean is not None, 'Expecting usage of mean from normalization set.'
        self.norm.embeddings = self.norm.embeddings - embeddings_mean
        if lda is not None:
            self.norm.embeddings = self.norm.embeddings.dot(lda)
        if use_l2_norm:
            self.norm.embeddings = Utils.l2_norm(self.norm.embeddings)
def process_files(fns, wav_dir, vad_dir, out_dir, features_extractor,
                  embedding_extractor, min_size, max_size, overlap, tolerance,
                  wav_suffix='.wav', vad_suffix='.lab.gz', n_jobs=1):
    """ Process all files from list, optionally in parallel.

        Args:
            fns (list): name of files to process
            wav_dir (str): directory with wav files
            vad_dir (str): directory with vad files
            out_dir (str|None): output directory
            features_extractor (Any): intialized object for feature extraction
            embedding_extractor (Any): initialized object for embedding
                extraction
            max_size (int): maximal size of window in ms
            min_size (int): minimal size of window in ms
            overlap (int): size of window overlap in ms
            tolerance (int): accept given number of frames as speech even
                when it is marked as silence
            wav_suffix (str): suffix of wav files
            vad_suffix (str): suffix of vad files
            n_jobs (int): number of jobs to run in parallel

        Returns:
            List[EmbeddingSet]
    """
    shared_kwargs = {
        'wav_dir': wav_dir,
        'vad_dir': vad_dir,
        'out_dir': out_dir,
        'features_extractor': features_extractor,
        'embedding_extractor': embedding_extractor,
        'tolerance': tolerance,
        'min_size': min_size,
        'max_size': max_size,
        'overlap': overlap,
        'wav_suffix': wav_suffix,
        'vad_suffix': vad_suffix,
    }
    if n_jobs == 1:
        partial_results = _process_files((fns, shared_kwargs))
    else:
        pool = multiprocessing.Pool(n_jobs)
        partial_results = pool.map(
            _process_files,
            ((chunk, shared_kwargs)
             for chunk in Utils.partition(fns, n_jobs)))
    # flatten per-chunk results into a single list
    return [item for group in partial_results for item in group]
def load_ivecs(self):
    """ Load normalization i-vectors from h5 files in `self.in_ivec_dir`.

        Every dataset in every `.h5` file is flattened and collected.

        :returns: i-vectors
        :rtype: numpy.array
    """
    ivecs_list = []
    for f in Utils.list_directory_by_suffix(self.in_ivec_dir, 'h5'):
        loginfo('Loading h5 normalization file {} ...'.format(f))
        # BUG FIX: the file handle was never closed; use the h5py.File
        # context manager so it is released even on error
        with h5py.File(os.path.join(self.in_ivec_dir, f), 'r') as h5file:
            for h5_key in h5file.keys():
                ivecs_list.append(h5file[h5_key][:].flatten())
    return np.array(ivecs_list)
def process_files(fns, speakers_dict, features_extractor, embedding_extractor,
                  audio_dir, wav_suffix, in_rttm_dir, rttm_suffix, min_length,
                  n_jobs=1):
    """ Process all files from list, optionally in parallel.

        Args:
            fns (list): names of files to process
            speakers_dict (dict): per-speaker accumulator passed through to
                the worker
            features_extractor (Any): initialized feature extractor
            embedding_extractor (Any): initialized embedding extractor
            audio_dir (str): directory with audio files
            wav_suffix (str): suffix of wav files
            in_rttm_dir (str): directory with rttm files
            rttm_suffix (str): suffix of rttm files
            min_length (int): minimal length in ms
            n_jobs (int): number of jobs to run in parallel

        Returns:
            result of `_process_files`; NOTE(review): with n_jobs > 1 this
            is a list of per-chunk results — confirm callers handle both
            shapes
    """
    shared_kwargs = dict(speakers_dict=speakers_dict,
                         features_extractor=features_extractor,
                         embedding_extractor=embedding_extractor,
                         audio_dir=audio_dir,
                         wav_suffix=wav_suffix,
                         in_rttm_dir=in_rttm_dir,
                         rttm_suffix=rttm_suffix,
                         min_length=min_length)
    if n_jobs == 1:
        return _process_files((fns, shared_kwargs))
    pool = multiprocessing.Pool(n_jobs)
    return pool.map(_process_files,
                    ((chunk, shared_kwargs)
                     for chunk in Utils.partition(fns, n_jobs)))
def process_files(fns, wav_dir, vad_dir, out_dir, fea2ivec_obj, max_size,
                  tolerance, wav_suffix='.wav', vad_suffix='.lab.gz',
                  n_jobs=1):
    """ Process all files from list.

        Args:
            fns (list): name of files to process
            wav_dir (str): directory with wav files
            vad_dir (str): directory with vad files
            out_dir (str): output directory
            fea2ivec_obj (Fea2Ivec): input models for i-vector extraction
            max_size (int): maximal size of window in ms
            tolerance (int): accept given number of frames as speech even
                when it is marked as silence
            wav_suffix (str): suffix of wav files
            vad_suffix (str): suffix of vad files
            n_jobs (int): number of jobs to run in parallel

        Returns:
            result of `_process_files`; NOTE(review): with n_jobs > 1 this
            is a list of per-chunk results — confirm callers handle both
            shapes
    """
    shared_kwargs = dict(wav_dir=wav_dir,
                         vad_dir=vad_dir,
                         out_dir=out_dir,
                         fea2ivec_obj=fea2ivec_obj,
                         max_size=max_size,
                         wav_suffix=wav_suffix,
                         vad_suffix=vad_suffix,
                         tolerance=tolerance)
    if n_jobs == 1:
        return _process_files((fns, shared_kwargs))
    pool = multiprocessing.Pool(n_jobs)
    return pool.map(_process_files,
                    ((chunk, shared_kwargs)
                     for chunk in Utils.partition(fns, n_jobs)))
def save(self, path):
    """ Pickle this object to `path`, creating parent directories first.

        Args:
            path (str): destination file path
    """
    Utils.mkdir_p(os.path.dirname(path))
    with open(path, 'wb') as handle:
        pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
type=int, required=False) parser.set_defaults(num_cores=1) parser.set_defaults(max_num_speakers=10) parser.set_defaults(wav_suffix='wav') parser.set_defaults(vad_suffix='lab.gz') parser.set_defaults(rttm_suffix='rttm') parser.set_defaults(min_window_size=2500) parser.set_defaults(max_window_size=3000) parser.set_defaults(vad_tolerance=0) args = parser.parse_args() set_mkl(1) # initialize extractor config = Utils.read_config(args.configuration) fea2ivec = Fea2Ivec(config['GMM']['model_path'], config['Extractor']['model_path']) files = [line.rstrip('\n') for line in open(args.input_list)] # extract i-vectors if args.in_ivec_dir is None: ivec = process_files(files, args.audio_dir, args.vad_dir, args.in_ivec_dir, fea2ivec, args.max_window_size, args.vad_tolerance, args.wav_suffix, args.vad_suffix, args.num_threads) if args.out_ivec_dir: for ivecset in ivec: ivecset.save('{}.{}'.format( os.path.join(args.out_ivec_dir, ivecset.name), 'pkl')) else: