class SpeakerDiarizationPreStages(object):
    '''Speaker diarization with affinity propagation clustering'''

    def __init__(self, feature_extraction, sad__pre, scd__pre, emb__pre,
                 sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
                 scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8, cls__preference=-20, cls__metric='cosine'):

        super(SpeakerDiarizationPreStages, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)
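# --- usage sketch (added for illustration, not part of the original code) ---
# The '/path/to/...' directories of precomputed SAD / SCD / embedding scores
# are hypothetical, and the sketch assumes the class exposes a __call__
# similar to SpeakerDiarizationHACPre below.
pipeline = SpeakerDiarizationPreStages(
    feature_extraction,
    sad__pre='/path/to/precomputed/sad',
    scd__pre='/path/to/precomputed/scd',
    emb__pre='/path/to/precomputed/emb',
    cls__damping=0.8, cls__preference=-20)

for current_file in getattr(protocol, 'development')():
    hypothesis = pipeline(current_file, annotated=True)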
def objective_function(parameters, beta=1.0):

    epoch, alpha = parameters

    weights_h5 = WEIGHTS_H5.format(epoch=epoch)
    sequence_embedding = SequenceEmbedding.from_disk(
        architecture_yml, weights_h5)

    segmentation = Segmentation(
        sequence_embedding, feature_extraction,
        duration=duration, step=0.100)

    if epoch not in predictions:
        predictions[epoch] = {}

    purity = SegmentationPurity()
    coverage = SegmentationCoverage()

    f, n = 0., 0
    for dev_file in getattr(protocol, subset)():

        uri = get_unique_identifier(dev_file)
        reference = dev_file['annotation']
        n += 1

        if uri in predictions[epoch]:
            prediction = predictions[epoch][uri]
        else:
            prediction = segmentation.apply(dev_file)
            predictions[epoch][uri] = prediction

        peak = Peak(alpha=alpha)
        hypothesis = peak.apply(prediction)

        p = purity(reference, hypothesis)
        c = coverage(reference, hypothesis)
        f += f_measure(c, p, beta=beta)

    return 1 - (f / n)
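# --- hypothetical driver (added for illustration) ---
# objective_function closes over WEIGHTS_H5, architecture_yml, protocol,
# predictions, etc. from its enclosing scope; once those exist, a plain grid
# search over (epoch, alpha) is the simplest way to minimize it. The epoch
# range below is made up.
import numpy as np

candidates = [(epoch, alpha)
              for epoch in range(0, 100, 10)
              for alpha in np.linspace(0., 1., 11)]
best_epoch, best_alpha = min(candidates, key=objective_function)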
class NeuralSegmentation(Pipeline):

    def __init__(self, sad=None, scd=None, **kwargs):
        super().__init__()
        self.sad = Path(sad).expanduser().resolve(strict=True)
        self.scd = Path(scd).expanduser().resolve(strict=True)
        self.with_params(**kwargs)

    def get_tune_space(self):
        return {
            'sad_onset': chocolate.uniform(0., 1.),
            'sad_offset': chocolate.uniform(0., 1.),
            'scd_alpha': chocolate.uniform(0., 1.),
            'scd_min_duration': chocolate.uniform(0., 5.),
        }

    def get_tune_metric(self):
        raise NotImplementedError()

    def with_params(self, sad_onset=0.7, sad_offset=0.7,
                    scd_alpha=0.5, scd_min_duration=1.):

        # initialize speech activity detection
        self.sad_ = Precomputed(self.sad)
        self.sad_onset = sad_onset
        self.sad_offset = sad_offset
        self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

        # initialize speaker change detection
        self.scd_ = Precomputed(self.scd)
        self.scd_alpha = scd_alpha
        self.scd_min_duration = scd_min_duration
        self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

        return self

    def apply(self, current_file):

        # Speech Activity Detection

        # get raw SAD scores
        soft_sad = self.sad_(current_file)

        # check once and for all whether SAD scores are log-scaled
        if not hasattr(self, 'sad_log_scale_'):
            if np.nanmean(soft_sad.data) < 0:
                self.sad_log_scale_ = True
            else:
                self.sad_log_scale_ = False

        # get SAD probability
        prob_sad = np.exp(soft_sad.data) if self.sad_log_scale_ \
                   else soft_sad.data

        # support both non-speech/speech & non-speech/single/overlap
        prob_sad = 1. - prob_sad[:, 0]
        prob_sad = SlidingWindowFeature(prob_sad, soft_sad.sliding_window)

        # binarization
        hard_sad = self.sad_binarize_.apply(prob_sad)

        # Speaker Change Detection

        # get raw SCD scores
        soft_scd = self.scd_(current_file)

        # check once and for all whether SCD scores are log-scaled
        if not hasattr(self, 'scd_log_scale_'):
            if np.nanmean(soft_scd.data) < 0:
                self.scd_log_scale_ = True
            else:
                self.scd_log_scale_ = False

        # get SCD probability
        prob_scd = np.exp(soft_scd.data) if self.scd_log_scale_ \
                   else soft_scd.data

        # take the final dimension
        # (in order to support both classification and regression scores)
        prob_scd = prob_scd[:, -1]
        prob_scd = SlidingWindowFeature(prob_scd, soft_scd.sliding_window)

        # peak detection
        hard_scd = self.scd_peak_.apply(prob_scd)

        speech_turns = hard_scd.crop(hard_sad)

        # only process the annotated part
        speech_turns = speech_turns.crop(get_annotated(current_file))

        return speech_turns
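# --- usage sketch (added for illustration; paths and protocol are hypothetical) ---
# NeuralSegmentation consumes directories of precomputed SAD / SCD scores and
# returns speech turns as a pyannote.core timeline.
pipeline = NeuralSegmentation(sad='/path/to/precomputed/sad',
                              scd='/path/to/precomputed/scd',
                              sad_onset=0.7, sad_offset=0.7,
                              scd_alpha=0.5, scd_min_duration=1.)

current_file = next(getattr(protocol, 'development')())
speech_turns = pipeline.apply(current_file)
for segment in speech_turns:
    print(segment.start, segment.end)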
def test(protocol, tune_dir, apply_dir, subset='test', beta=1.0):

    os.makedirs(apply_dir)

    train_dir = os.path.dirname(os.path.dirname(os.path.dirname(tune_dir)))
    duration = float(os.path.basename(train_dir))

    config_dir = os.path.dirname(os.path.dirname(os.path.dirname(train_dir)))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- HYPER-PARAMETERS --
    tune_yml = tune_dir + '/tune.yml'
    with open(tune_yml, 'r') as fp:
        tune = yaml.load(fp, Loader=yaml.SafeLoader)

    architecture_yml = train_dir + '/architecture.yml'
    WEIGHTS_H5 = train_dir + '/weights/{epoch:04d}.h5'
    weights_h5 = WEIGHTS_H5.format(epoch=tune['epoch'])

    sequence_embedding = SequenceEmbedding.from_disk(
        architecture_yml, weights_h5)

    segmentation = Segmentation(
        sequence_embedding, feature_extraction,
        duration=duration, step=0.100)

    peak = Peak(alpha=tune['alpha'])

    HARD_JSON = apply_dir + '/{uri}.hard.json'
    SOFT_PKL = apply_dir + '/{uri}.soft.pkl'
    eval_txt = apply_dir + '/eval.txt'

    TEMPLATE = '{uri} {purity:.5f} {coverage:.5f} {f_measure:.5f}\n'
    purity = SegmentationPurity()
    coverage = SegmentationCoverage()
    fscore = []

    for test_file in getattr(protocol, subset)():

        soft = segmentation.apply(test_file)
        hard = peak.apply(soft)

        uri = get_unique_identifier(test_file)

        path = SOFT_PKL.format(uri=uri)
        mkdir_p(os.path.dirname(path))
        # pickle needs a binary file handle
        with open(path, 'wb') as fp:
            pickle.dump(soft, fp)

        path = HARD_JSON.format(uri=uri)
        mkdir_p(os.path.dirname(path))
        with open(path, 'w') as fp:
            pyannote.core.json.dump(hard, fp)

        try:
            reference = test_file['annotation']
            uem = test_file['annotated']
        except KeyError:
            # skip files without groundtruth
            continue

        p = purity(reference, hard)
        c = coverage(reference, hard)
        f = f_measure(c, p, beta=beta)
        fscore.append(f)

        line = TEMPLATE.format(uri=uri, purity=p, coverage=c, f_measure=f)
        with open(eval_txt, 'a') as fp:
            fp.write(line)

    p = abs(purity)
    c = abs(coverage)
    f = np.mean(fscore)
    line = TEMPLATE.format(uri='ALL', purity=p, coverage=c, f_measure=f)
    with open(eval_txt, 'a') as fp:
        fp.write(line)
# (fragment: this line runs inside a loop over test files,
#  where `uri` and `wav` come from the corpus being processed)
predictions[uri] = segmentation.apply(wav)

# tested thresholds
alphas = np.linspace(0, 1, 50)

# evaluation metrics (purity and coverage)
from pyannote.metrics.segmentation import SegmentationPurity
from pyannote.metrics.segmentation import SegmentationCoverage
purity = [SegmentationPurity() for alpha in alphas]
coverage = [SegmentationCoverage() for alpha in alphas]

# peak detection
from pyannote.audio.signal import Peak

for i, alpha in enumerate(alphas):

    # initialize peak detection algorithm
    peak = Peak(alpha=alpha, min_duration=1.0)

    for uri, reference in groundtruth.items():

        # apply peak detection
        hypothesis = peak.apply(predictions[uri])

        # compute purity and coverage
        purity[i](reference, hypothesis)
        coverage[i](reference, hypothesis)

# print the results in three columns:
# threshold, purity, coverage
TEMPLATE = '{alpha:.3f} {purity:.1f}% {coverage:.1f}%'
for i, a in enumerate(alphas):
    p = 100 * abs(purity[i])
    c = 100 * abs(coverage[i])
    print(TEMPLATE.format(alpha=a, purity=p, coverage=c))
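# --- follow-up sketch (added for illustration) ---
# The sweep above prints one (purity, coverage) pair per threshold; reusing
# f_measure (already used by objective_function, with coverage first) collapses
# each pair into a single number so the best alpha can be picked automatically.
from pyannote.metrics.base import f_measure

fscores = [f_measure(abs(coverage[i]), abs(purity[i]))
           for i in range(len(alphas))]
best = int(np.argmax(fscores))
print('best alpha = {0:.3f} (f = {1:.3f})'.format(alphas[best], fscores[best]))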
class SpeakerDiarizationHACPre(object):
    '''Speaker diarization with hierarchical agglomerative clustering'''

    def __init__(self, feature_extraction, sad__pre, scd__pre, emb__pre,
                 sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
                 scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
                 emb__internal=False,
                 cls__method='average', cls__threshold=5, cls__metric='cosine'):

        super(SpeakerDiarizationHACPre, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__method = cls__method
        self.cls__threshold = cls__threshold
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                             method=self.cls__method,
                                             threshold=self.cls__threshold)

    def __call__(self, current_file, annotated=False):

        # speech activity detection
        soft_sad = self.sad_(current_file)
        hard_sad = self.sad_binarize_.apply(
            soft_sad, dimension=self.sad__dimension)

        # speaker change detection
        soft_scd = self.scd_(current_file)
        hard_scd = self.scd_peak_.apply(
            soft_scd, dimension=self.scd__dimension)

        # speech turns
        speech_turns = hard_scd.crop(hard_sad)

        if annotated:
            speech_turns = speech_turns.crop(get_annotated(current_file))

        # remove small speech turns
        emb = self.emb_(current_file)
        speech_turns = [
            speech_turn for speech_turn in speech_turns
            if len(emb.crop(speech_turn, mode='loose')) > 0
        ]

        # speech turns embedding
        to_stack = [
            np.sum(emb.crop(speech_turn, mode='loose'), axis=0)
            for speech_turn in speech_turns
        ]
        if len(to_stack) < 1:
            return None
        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for speech_turn, label in zip(speech_turns, cluster_labels):
            hypothesis[speech_turn] = label

        return hypothesis
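# --- usage sketch (added for illustration; paths and protocol are hypothetical) ---
pipeline = SpeakerDiarizationHACPre(
    feature_extraction,
    sad__pre='/path/to/precomputed/sad',
    scd__pre='/path/to/precomputed/scd',
    emb__pre='/path/to/precomputed/emb',
    cls__method='average', cls__threshold=5)

for current_file in getattr(protocol, 'development')():
    hypothesis = pipeline(current_file, annotated=True)
    if hypothesis is None:  # no speech turn long enough to embed
        continue
    print(hypothesis.uri, hypothesis.labels())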
def annotate_speakers(self, filename, gui, visualization=True):

    test_file = {'uri': 'filename', 'audio': filename}

    # speech activity detection
    sad_scores = self.sad(test_file)
    binarize = Binarize(offset=0.5, onset=0.70, log_scale=True)
    speech = binarize.apply(sad_scores, dimension=1)

    # speaker change detection
    scd_scores = self.scd(test_file)
    peak = Peak(alpha=0.1, min_duration=1, log_scale=True)
    partition = peak.apply(scd_scores, dimension=1)

    speech_turns = partition.crop(speech)

    # speech turn embedding (only keep turns long enough to embed reliably)
    embeddings = self.emb(test_file)
    long_turns = Timeline(
        segments=[s for s in speech_turns if s.duration > 1.1])

    res = []
    for segment in long_turns:
        x = embeddings.crop(segment, mode='strict')
        if x.size == 0:
            continue
        n_sample = x.shape[0]
        x = np.mean(x, axis=0)
        if np.any(np.isnan(x)):
            continue
        res.append((segment, x, n_sample))

    if visualization:

        # pairwise embedding distances, sorted, as a quick sanity check
        dist = []
        for i in range(len(res)):
            for j in range(i + 1, len(res)):
                dist.append(l2_dist(res[i][1], res[j][1]))
        fig, ax = plt.subplots()
        ax.scatter(np.arange(len(dist)), np.array(sorted(dist)))
        fig.show()

        # let's visualize SAD and SCD results
        # using pyannote.core visualization API

        # helper function to make visualization prettier
        plot_ready = lambda scores: SlidingWindowFeature(
            np.exp(scores.data[:, 1:]), scores.sliding_window)

        # create a figure with 6 rows with matplotlib
        nrows = 6
        fig, ax = plt.subplots(nrows=nrows, ncols=1)
        fig.set_figwidth(20)
        fig.set_figheight(nrows * 2)

        # 1st row: reference annotation
        # notebook.plot_annotation(test_file['annotation'], ax=ax[0])
        # ax[0].text(notebook.crop.start + 0.5, 0.1, 'reference', fontsize=14)

        # 2nd row: SAD raw scores
        notebook.plot_feature(plot_ready(sad_scores), ax=ax[1])
        ax[1].text(notebook.crop.start + 0.5, 0.6, 'SAD\nscores', fontsize=14)
        ax[1].set_ylim(-0.1, 1.1)

        # 3rd row: SAD result
        notebook.plot_timeline(speech, ax=ax[2])
        ax[2].text(notebook.crop.start + 0.5, 0.1, 'SAD', fontsize=14)

        # 4th row: SCD raw scores
        notebook.plot_feature(plot_ready(scd_scores), ax=ax[3])
        ax[3].text(notebook.crop.start + 0.5, 0.3, 'SCD\nscores', fontsize=14)
        ax[3].set_ylim(-0.1, 0.6)

        # 5th row: SCD result
        notebook.plot_timeline(partition, ax=ax[4])
        ax[4].text(notebook.crop.start + 0.5, 0.1, 'SCD', fontsize=14)

        # 6th row: combination of SAD and SCD
        notebook.plot_timeline(speech_turns, ax=ax[5])
        ax[5].text(notebook.crop.start + 0.5, 0.1, 'speech turns', fontsize=14)

        fig.show()

    res, num_people = self.min_spanning_tree(res)
    gui.append_line('There are {} people in this audio'.format(num_people))
    return res
def apply(protocol, train_dir, store_dir, threshold,
          subset='development', epoch=None, min_duration=1.0):

    # -- LOAD MODEL --
    # count completed epochs by probing saved weight files
    nb_epoch = 0
    while True:
        weights_h5 = LoggingCallback.WEIGHTS_H5.format(
            log_dir=train_dir, epoch=nb_epoch)
        if not os.path.isfile(weights_h5):
            break
        nb_epoch += 1

    config_dir = os.path.dirname(os.path.dirname(train_dir))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- SEQUENCE GENERATOR --
    duration = config['sequences']['duration']
    step = config['sequences']['step']

    def saveSeg(filepath, filename, segmentation):
        with open(filepath, 'w') as f:
            for idx, val in enumerate(segmentation):
                line = filename + ' ' + str(idx) + ' 1 ' + \
                    str(int(val[0] * 100)) + ' ' + \
                    str(int(val[1] * 100 - val[0] * 100)) + '\n'
                f.write(line)

    filepath = store_dir + '/' + str(threshold) + '/'
    mkdir_p(filepath)

    # -- CHOOSE MODEL --
    # default to the last completed epoch
    # (check for None first, so the comparison below cannot fail)
    if epoch is None:
        epoch = nb_epoch - 1
    elif epoch > nb_epoch:
        raise ValueError('Epoch should be less than ' + str(nb_epoch))
    sequence_labeling = SequenceLabeling.from_disk(train_dir, epoch)

    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extraction,
        duration=duration, step=step)

    # -- PREDICTION --
    predictions = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        predictions[uri] = aggregation.apply(dev_file)

    # initialize peak detection algorithm
    peak = Peak(alpha=threshold, min_duration=min_duration)

    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        hypothesis = peak.apply(predictions[uri])
        filepath = store_dir + '/' + str(threshold) + '/' + uri + '.0.seg'
        saveSeg(filepath, uri, hypothesis)
def evaluate(protocol, train_dir, store_dir,
             subset='development', epoch=None, min_duration=1.0):

    mkdir_p(store_dir)

    # -- LOAD MODEL --
    # count completed epochs by probing saved weight files
    nb_epoch = 0
    while True:
        weights_h5 = LoggingCallback.WEIGHTS_H5.format(
            log_dir=train_dir, epoch=nb_epoch)
        if not os.path.isfile(weights_h5):
            break
        nb_epoch += 1

    config_dir = os.path.dirname(os.path.dirname(train_dir))
    config_yml = config_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- SEQUENCE GENERATOR --
    duration = config['sequences']['duration']
    step = config['sequences']['step']

    groundtruth = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        groundtruth[uri] = dev_file['annotation']

    # -- CHOOSE MODEL --
    # default to the last completed epoch
    # (check for None first, so the comparison below cannot fail)
    if epoch is None:
        epoch = nb_epoch - 1
    elif epoch > nb_epoch:
        raise ValueError('Epoch should be less than ' + str(nb_epoch))
    sequence_labeling = SequenceLabeling.from_disk(train_dir, epoch)

    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extraction,
        duration=duration, step=step)

    # -- PREDICTION --
    predictions = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        predictions[uri] = aggregation.apply(dev_file)

    alphas = np.linspace(0, 1, 20)
    purity = [SegmentationPurity(parallel=False) for alpha in alphas]
    coverage = [SegmentationCoverage(parallel=False) for alpha in alphas]

    # -- SAVE RESULTS --
    for i, alpha in enumerate(alphas):

        # initialize peak detection algorithm
        peak = Peak(alpha=alpha, min_duration=min_duration)

        for uri, reference in groundtruth.items():

            # apply peak detection
            hypothesis = peak.apply(predictions[uri])

            # compute purity and coverage
            purity[i](reference, hypothesis)
            coverage[i](reference, hypothesis)

    TEMPLATE = '{alpha:g} {purity:.3f}% {coverage:.3f}%'
    with open(store_dir + '/res.txt', 'a') as fp:
        for i, a in enumerate(alphas):
            p = 100 * abs(purity[i])
            c = 100 * abs(coverage[i])
            print(TEMPLATE.format(alpha=a, purity=p, coverage=c))
            fp.write(TEMPLATE.format(alpha=a, purity=p, coverage=c) + '\n')
class SpeakerDiarizationWeighted(object):

    def __init__(self, feature_extraction, sad__pre, scd__pre,
                 weight__pre, emb__pre,
                 sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
                 scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
                 emb__internal=False,
                 cls__damping=0.8, cls__preference=-20, cls__metric='cosine'):

        super(SpeakerDiarizationWeighted, self).__init__()

        self.feature_extraction = feature_extraction

        # speech activity detection hyper-parameters
        self.sad__onset = sad__onset
        self.sad__offset = sad__offset
        self.sad__dimension = sad__dimension

        # speaker change detection hyper-parameters
        self.scd__alpha = scd__alpha
        self.scd__min_duration = scd__min_duration
        self.scd__dimension = scd__dimension

        # embedding hyper-parameters
        self.emb__internal = emb__internal

        # clustering hyper-parameters
        self.cls__damping = cls__damping
        self.cls__preference = cls__preference
        self.cls__metric = cls__metric

        step = self.feature_extraction.sliding_window().step

        # initialize speech activity detection module
        self.sad_ = Precomputed(sad__pre)
        self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                      offset=self.sad__offset)

        # initialize speaker change detection module
        self.scd_ = Precomputed(scd__pre)
        self.scd_peak_ = Peak(alpha=self.scd__alpha,
                              min_duration=self.scd__min_duration,
                              percentile=False)

        # initialize weights
        self.weight_ = Precomputed(weight__pre)

        # initialize speech turn embedding module
        self.emb_ = Precomputed(emb__pre)

        # initialize clustering module
        self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                            damping=self.cls__damping,
                                            preference=self.cls__preference)

    def __call__(self, current_file, annotated=False):

        # speech activity detection
        soft_sad = self.sad_(current_file)
        hard_sad = self.sad_binarize_.apply(
            soft_sad, dimension=self.sad__dimension)

        # speaker change detection
        soft_scd = self.scd_(current_file)
        hard_scd = self.scd_peak_.apply(
            soft_scd, dimension=self.scd__dimension)

        # speech turns
        speech_turns = hard_scd.crop(hard_sad)

        if annotated:
            speech_turns = speech_turns.crop(get_annotated(current_file))

        # remove small speech turns
        emb = self.emb_(current_file)
        speech_turns = [
            speech_turn for speech_turn in speech_turns
            if len(emb.crop(speech_turn, mode='loose')) > 0
        ]

        # weights
        weight = self.weight_(current_file)

        # speech turns embedding
        to_stack = [
            np.mean(emb.crop(speech_turn, mode='loose')
                    * (1 - weight.crop(speech_turn, mode='loose')),
                    axis=0)
            for speech_turn in speech_turns
        ]
        if len(to_stack) < 1:
            return None
        fX = l2_normalize(np.vstack(to_stack))

        # speech turn clustering
        cluster_labels = self.cls_.apply(fX)

        # build hypothesis from clustering results
        hypothesis = Annotation(uri=current_file['uri'])
        for speech_turn, label in zip(speech_turns, cluster_labels):
            hypothesis[speech_turn] = label

        return hypothesis
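# --- usage sketch (added for illustration; paths are hypothetical) ---
# Identical in spirit to the pipelines above, except for the extra directory
# of precomputed weights used to down-weight frames before averaging
# embeddings.
pipeline = SpeakerDiarizationWeighted(
    feature_extraction,
    sad__pre='/path/to/precomputed/sad',
    scd__pre='/path/to/precomputed/scd',
    weight__pre='/path/to/precomputed/weights',
    emb__pre='/path/to/precomputed/emb')

hypothesis = pipeline(current_file, annotated=True)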
# peak detection
min_duration = 1.0
from pyannote.audio.signal import Peak

# alpha / min_duration are tunable parameters
# (and should be tuned for better performance)
# we use log_scale = True because of the final
# log-softmax in the StackedRNN model
alphas = np.linspace(0, 1, 20)

purity_list = []
coverage_list = []

for alpha in alphas:

    peak = Peak(alpha=alpha, min_duration=min_duration, log_scale=True)

    # evaluation metric (one fresh instance per threshold,
    # so that scores do not accumulate across alphas)
    metric = SegmentationPurityCoverageFMeasure()

    # loop on test files
    for test_file in protocol.test():

        # load reference annotation
        reference = test_file['annotation']
        uem = get_annotated(test_file)

        # load precomputed change scores
        # as pyannote.core.SlidingWindowFeature
        scd_scores = precomputed(test_file)

        # detect peaks to obtain speaker-homogeneous segments
        # as pyannote.core.Timeline
        hypothesis = peak.apply(scd_scores, dimension=1)

        # accumulate purity and coverage over the whole subset
        metric(reference, hypothesis.to_annotation(), uem=uem)

    purity, coverage, _ = metric.compute_metrics()
    purity_list.append(purity)
    coverage_list.append(coverage)
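# --- follow-up sketch (added for illustration) ---
# plot the purity/coverage trade-off gathered by the sweep above;
# matplotlib usage mirrors the visualization code elsewhere in this file.
from matplotlib import pyplot as plt

fig, ax = plt.subplots()
ax.plot(coverage_list, purity_list, marker='o')
ax.set_xlabel('coverage')
ax.set_ylabel('purity')
fig.show()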
binarize = Binarize(offset=0.94, onset=0.70, log_scale=True)
speech = binarize.apply(sad_scores, dimension=1)

# iterate over speech segments (as `pyannote.core.Segment` instances)
for segment in speech:
    print(segment.start, segment.end)

# obtain raw SCD scores (as `pyannote.core.SlidingWindowFeature` instance)
scd_scores = scd(test_file)

# detect peaks and return speaker homogeneous segments
# (as `pyannote.core.Annotation` instance)
# NOTE: both alpha/min_duration values were tuned on AMI dataset.
# you might need to use different values for better results.
from pyannote.audio.signal import Peak
peak = Peak(alpha=0.08, min_duration=0.40, log_scale=True)
partition = peak.apply(scd_scores, dimension=1)

for segment in partition:
    print(segment.start, segment.end)

# speech_turns = partition.crop(speech)

# # let's visualize SAD and SCD results using pyannote.core visualization API
# from matplotlib import pyplot as plt
# from pyannote.core import Segment, notebook
#
# # only plot one minute (between t=120s and t=180s)
# notebook.crop = Segment(120, 180)
#
# # helper function to make visualization prettier
def validate_epoch(self, epoch, protocol_name,
                   subset='development', validation_data=None):

    target_purity = self.purity

    # load model for current epoch
    model = self.load_model(epoch).to(self.device)
    model.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_,
        duration=duration, step=step,
        batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # extract predictions for all files.
    predictions = {}
    for current_file in getattr(protocol, subset)():
        uri = get_unique_identifier(current_file)
        predictions[uri] = sequence_labeling.apply(current_file)

    # dichotomic search to find the alpha that maximizes coverage
    # while keeping purity at least equal to `target_purity`
    lower_alpha = 0.
    upper_alpha = 1.
    best_alpha = .5 * (lower_alpha + upper_alpha)
    best_coverage = 0.

    for _ in range(10):

        current_alpha = .5 * (lower_alpha + upper_alpha)
        peak = Peak(alpha=current_alpha, min_duration=0.0,
                    log_scale=model.logsoftmax)
        metric = DiarizationPurityCoverageFMeasure()

        # NOTE -- embarrassingly parallel
        # TODO -- parallelize this
        for current_file in getattr(protocol, subset)():
            reference = current_file['annotation']
            uri = get_unique_identifier(current_file)
            hypothesis = peak.apply(predictions[uri], dimension=1)
            hypothesis = hypothesis.to_annotation()
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)

        purity, coverage, _ = metric.compute_metrics()

        if purity < target_purity:
            upper_alpha = current_alpha
        else:
            lower_alpha = current_alpha
            if coverage > best_coverage:
                best_coverage = coverage
                best_alpha = current_alpha

    task = 'speaker_change_detection'
    metric_name = f'{task}/coverage@{target_purity:.2f}purity'
    return {
        metric_name: {'minimize': False, 'value': best_coverage},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}