def gecko(args):
    """Convert a hypothesis for one file into Gecko JSON and dump it to disk.

    The hypothesis comes either from an RTTM file (when
    ``args['<hypotheses_path>']`` exists on disk) or from the annotation of
    the protocol named by that same argument.

    Parameters
    ----------
    args : dict
        Command-line arguments. Keys read here: ``<hypotheses_path>``,
        ``<uri>``, ``--database.task.protocol``, ``--embeddings``, ``--map``
        and ``--tag_na``.

    Raises
    ------
    ValueError
        If ``--tag_na`` is requested but no reference is available to tell
        which regions are annotated.
    """
    hypotheses_path = args['<hypotheses_path>']
    uri = args['<uri>']
    colors = get_colors(uri)
    distances = {}

    # `reference` / `annotated` are only available on some paths; track them
    # explicitly so that `--tag_na` cannot hit an unbound local.
    reference = None
    annotated = None

    if Path(hypotheses_path).exists():
        hypotheses = load_rttm(hypotheses_path)
        hypothesis = hypotheses[uri]
    else:
        # <hypotheses_path> is not a file: interpret it as a protocol name
        protocol = get_protocol(args['<hypotheses_path>'])
        reference = get_file(protocol, uri)
        hypothesis = reference['annotation']
        annotated = get_annotated(reference)

    hypotheses_path = Path(hypotheses_path)

    protocol = args['--database.task.protocol']
    features = None
    if protocol:
        protocol = get_protocol(protocol)
        embeddings = args['--embeddings']
        reference, features = get_file(protocol, uri, embeddings=embeddings)
        if args['--map']:
            print(f"mapping {uri} with {protocol}")
            diarizationErrorRate = DiarizationErrorRate()
            annotated = get_annotated(reference)
            optimal_mapping = diarizationErrorRate.optimal_mapping(
                reference['annotation'], hypothesis, annotated)
            hypothesis = hypothesis.rename_labels(mapping=optimal_mapping)

    # tag unsure clusters
    hypothesis = update_labels(hypothesis, distances)

    distances_per_speaker = get_distances_per_speaker(
        features, hypothesis) if features else {}

    if args['--tag_na']:
        if annotated is None:
            if reference is None:
                raise ValueError(
                    "--tag_na needs a reference: pass a protocol so that "
                    "annotated regions can be determined.")
            annotated = get_annotated(reference)
        whole_file = Segment(0., annotated.segments_boundaries_[-1])
        not_annotated = annotated.gaps(whole_file).to_annotation(na())
        hypothesis = hypothesis.crop(annotated).update(not_annotated)

    gecko_json = annotation_to_GeckoJSON(hypothesis, distances_per_speaker,
                                         colors)

    # write next to the RTTM file when there is one, else in the working dir
    if hypotheses_path.exists():
        dir_path = hypotheses_path.parent
    else:
        dir_path = Path(".")

    json_path = os.path.join(dir_path, f'{uri}.json')
    with open(json_path, 'w') as file:
        json.dump(gecko_json, file)
    print(f"succefully dumped {json_path}")
def __init__(self, protocol=None, subset='train', db_yml=None, snr_min=5,
             snr_max=20):
    """Collect the files of ``subset`` with a 'gaps' preprocessor attached.

    Parameters
    ----------
    protocol : str or protocol instance
        Protocol name (loaded through `get_protocol`) or an already
        instantiated protocol. An instance gets a 'gaps' entry added to its
        `preprocessors` in place.
    subset : str, optional
        Name of the protocol method that yields the files.
        Defaults to 'train'.
    db_yml : optional
        Forwarded to `FileFinder(config_yml=...)` when `protocol` is a name.
    snr_min, snr_max : optional
        Stored as-is on the instance. Default to 5 and 20.

    Raises
    ------
    ValueError
        If `protocol` is None.
    """
    super().__init__()

    self.protocol = protocol
    self.subset = subset
    self.db_yml = db_yml
    self.snr_min = snr_min
    self.snr_max = snr_max

    # the None default cannot be used: fail with an explicit message rather
    # than an AttributeError on `None.preprocessors`
    if protocol is None:
        raise ValueError(
            "`protocol` must be a protocol name or a protocol instance, "
            "got None.")

    # returns gaps in annotation as pyannote.core.Timeline instance
    get_gaps = lambda f: f['annotation'].get_timeline().gaps(
        support=get_annotated(f))

    if isinstance(protocol, str):
        preprocessors = {
            'audio': FileFinder(config_yml=db_yml),
            'duration': get_audio_duration,
            'gaps': get_gaps
        }
        protocol = get_protocol(self.protocol, preprocessors=preprocessors)
    else:
        protocol.preprocessors['gaps'] = get_gaps

    self.files_ = list(getattr(protocol, self.subset)())
def update_distances(args):
    """Recompute in-cluster distances of a Gecko JSON file and dump a copy.

    Loads the user annotation from the JSON path and converts it to a
    pyannote `Annotation` using region timings. Distances are then computed
    from precomputed embeddings of the file whose uri matches, looked up in
    the 'test' subset of the protocol. The updated JSON is written next to
    the input as ``<stem>.<TIMESTAMP>.json``.

    Parameters
    ----------
    args : dict
        Command-line arguments. Keys read here: ``<json_path>``, ``<uri>``
        and ``<database.task.protocol>``.

    Raises
    ------
    ValueError
        If no file of the protocol's test subset has the requested uri.
    """
    json_path = Path(args['<json_path>'])
    uri = args['<uri>']

    with open(json_path, 'r') as file:
        gecko_json = json.load(file)
    hypothesis, _, _, _ = gecko_JSON_to_Annotation(gecko_json, uri, 'speaker')
    colors = get_colors(uri)

    # NOTE(review): `embeddings` is not bound anywhere in this function;
    # presumably a module-level name -- confirm, or read it from `args`.
    precomputed = Precomputed(embeddings)

    protocol_name = args['<database.task.protocol>']
    protocol = get_protocol(protocol_name)
    for reference in protocol.test():
        if reference['uri'] == uri:
            features = precomputed(reference)
            break
    else:
        # previously fell through to an unbound `features`
        raise ValueError(
            f"uri '{uri}' not found in the test subset of {protocol_name}")

    distances_per_speaker = get_distances_per_speaker(features, hypothesis)
    gecko_json = annotation_to_GeckoJSON(hypothesis, distances_per_speaker,
                                         colors)

    name = f"{json_path.stem}.{TIMESTAMP}.json"
    updated_path = Path(json_path.parent, name)
    with open(updated_path, 'w') as file:
        json.dump(gecko_json, file)
    print(f"succefully dumped {updated_path}")
def validate_init(self, protocol_name, subset='development'):
    """Prepare the data used for validation.

    Parameters
    ----------
    protocol_name : `str`
        Name handed to `get_protocol`.
    subset : {'train', 'development', 'test'}
        Defaults to 'development'.

    Returns
    -------
    validation_data : list of dict
        One plain dict per file of the subset. Unless the feature extraction
        is `Precomputed` or `RawAudio`, each dict also carries its extracted
        'features'.
    """
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # multiprocessing needs regular dicts, not lazy ProtocolFile objects
    validation_data = list(map(dict, getattr(protocol, subset)()))

    needs_extraction = not isinstance(self.feature_extraction_,
                                      (Precomputed, RawAudio))
    if needs_extraction:
        for entry in tqdm(validation_data, desc='Feature extraction'):
            entry['features'] = self.feature_extraction_(entry)

    return validation_data
def _validate_init_turn(self, protocol_name, subset='development'):
    """Draw one fixed batch of speech-turn sub-segments for validation.

    The numpy random seed is fixed so that the batch is reproducible.

    Returns
    -------
    dict
        'X' and 'z' are the stacked batch entries; 'y' is the output of
        `pdist(..., metric='equal')` over one label per group of `z`.
    """
    np.random.seed(1337)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    generator = SpeechTurnSubSegmentGenerator(
        self.feature_extraction_, self.duration,
        per_label=10, per_turn=5)
    first_batch = next(generator(protocol, subset=subset))

    X, y, z = (np.stack(first_batch[key]) for key in ('X', 'y', 'z'))

    # keep one label per run of identical z values:
    #   z   0 0 0 1 1 1 2 2 2 2 3 3 3 3
    #   y   A A A A A A B B B B B B B B
    # becomes
    #   z   0 0 0 1 1 1 2 2 2 2 3 3 3 3
    #   y   A     B
    pairs = np.vstack([y, z]).T
    labels = [np.stack(rows)[0, 0]
              for _, rows in itertools.groupby(pairs, lambda t: t[1])]
    y = np.array(labels).reshape((-1, 1))

    # precompute same/different groundtruth
    y = pdist(y, metric='equal')

    return {'X': X, 'y': y, 'z': z}
def __init__(self, protocol=None, subset: Subset = "train", snr_min=5,
             snr_max=20):
    """Collect the files of ``subset`` with a "gaps" preprocessor attached.

    Parameters
    ----------
    protocol : str or protocol instance
        Protocol name (loaded through `get_protocol`) or an already
        instantiated protocol. An instance gets a "gaps" entry added to its
        `preprocessors` in place.
    subset : Subset, optional
        Name of the protocol method that yields the files.
        Defaults to "train".
    snr_min, snr_max : optional
        Stored as-is on the instance. Default to 5 and 20.

    Raises
    ------
    ValueError
        If `protocol` is None.
    """
    super().__init__()

    self.protocol = protocol
    self.subset = subset
    self.snr_min = snr_min
    self.snr_max = snr_max

    # the None default cannot be used: fail with an explicit message rather
    # than an AttributeError on `None.preprocessors`
    if protocol is None:
        raise ValueError(
            "`protocol` must be a protocol name or a protocol instance, "
            "got None.")

    # returns gaps in annotation as pyannote.core.Timeline instance
    get_gaps = (lambda f: f["annotation"].get_timeline().gaps(
        support=get_annotated(f)))

    if isinstance(protocol, str):
        preprocessors = {
            "audio": FileFinder(),
            "duration": get_audio_duration,
            "gaps": get_gaps,
        }
        protocol = get_protocol(self.protocol, preprocessors=preprocessors)
    else:
        protocol.preprocessors["gaps"] = get_gaps

    self.files_ = list(getattr(protocol, self.subset)())
def _validate_init_turn(self, protocol_name, subset='development'):
    """Build the fixed validation batch of speech-turn sub-segments.

    Seeds numpy's global random generator, pulls the first batch out of a
    `SpeechTurnSubSegmentGenerator` and returns its stacked 'X' and 'z'
    along with the `pdist(..., metric='equal')` result computed from one
    label per group of `z`.
    """
    np.random.seed(1337)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    sampler = SpeechTurnSubSegmentGenerator(
        self.feature_extraction_, self.duration, per_label=10, per_turn=5)
    batch = next(sampler(protocol, subset=subset))

    X = np.stack(batch['X'])
    repeated_labels = np.stack(batch['y'])
    z = np.stack(batch['z'])

    # labels are repeated once per sub-segment; reduce them to one per run
    # of identical z:
    #   z   0 0 0 1 1 1 2 2 2 2 3 3 3 3
    #   y   A A A A A A B B B B B B B B
    # becomes
    #   z   0 0 0 1 1 1 2 2 2 2 3 3 3 3
    #   y   A     B
    label_and_turn = np.vstack([repeated_labels, z]).T
    one_per_turn = []
    for _, group in itertools.groupby(label_and_turn, lambda t: t[1]):
        stacked = np.stack(group)
        one_per_turn.append(stacked[0, 0])
    column = np.array(one_per_turn).reshape((-1, 1))

    # precompute same/different groundtruth
    groundtruth = pdist(column, metric='equal')

    return {'X': X, 'y': groundtruth, 'z': z}
def main(args):
    """Print and plot speech-duration statistics of one protocol subset.

    Parameters
    ----------
    args : dict
        Command-line arguments. Keys read here: ``<database.task.protocol>``,
        ``--set`` (defaults to "train" when falsy), ``--filter_unk``,
        ``--crop``, ``--hist`` and ``--save``.
    """
    protocol_name = args['<database.task.protocol>']
    # named `subset` so that the builtin `set` is not shadowed
    subset = args['--set'] or "train"
    filter_unk = args['--filter_unk']
    crop = float(args['--crop']) if args['--crop'] else None
    hist = args['--hist']
    save = args['--save']

    protocol = get_protocol(protocol_name)
    print(f"getting stats from {protocol_name}.{subset}...")
    stats = protocol.stats(subset)
    print_stats(stats)

    if filter_unk:
        # drop every label that contains the '#unknown#' marker
        values = [
            value for label, value in stats['labels'].items()
            if '#unknown#' not in label
        ]
    else:
        values = list(stats['labels'].values())

    print(f"n_speaking_speakers: {np.array(values).nonzero()[0].shape[0]}")
    print("quartiles:")
    print(quartiles(values))
    print("deciles:")
    print(deciles(values))

    plot_speech_duration(values, protocol_name, subset, hist, crop, save)
def info(protocol: str):
    """Print protocol detailed information

    For each subset that yields at least one file, prints the number of
    files and, depending on the protocol type, the annotated duration, the
    amount of speech and the number of speakers.

    Raises
    ------
    typer.Exit
        With code 1 when the protocol is neither a speaker diarization
        protocol nor a collection.
    """
    p = get_protocol(protocol)

    if isinstance(p, SpeakerDiarizationProtocol):
        subsets = ["train", "development", "test"]
        skip_annotation = False
        skip_annotated = False
    elif isinstance(p, CollectionProtocol):
        subsets = ["files"]
        skip_annotation = True
        skip_annotated = True
    else:
        typer.echo(
            "Only collections and speaker diarization protocols are supported."
        )
        # the exception has to be raised to stop here; merely instantiating
        # it let execution fall through to an unbound `subsets`
        raise typer.Exit(code=1)

    for subset in subsets:
        num_files = 0
        speakers = set()
        duration = 0.0
        speech = 0.0

        def iterate():
            # a subset that is missing or not implemented yields nothing
            try:
                for file in getattr(p, subset)():
                    yield file
            except (AttributeError, NotImplementedError):
                return

        for file in iterate():
            num_files += 1
            if not skip_annotation:
                annotation = file["annotation"]
                speakers.update(annotation.labels())
                speech += annotation.get_timeline().support().duration()
            if not skip_annotated:
                annotated = file["annotated"]
                duration += annotated.duration()

        if num_files > 0:
            typer.secho(f"{subset}", fg=typer.colors.BRIGHT_GREEN,
                        underline=True, bold=True)
            typer.echo(f" {num_files} files")
            if not skip_annotated:
                typer.echo(f" {duration_to_str(duration)} annotated")
            if not skip_annotation:
                if duration > 0:
                    typer.echo(
                        f" {duration_to_str(speech)} of speech ({100 * speech / duration:.0f}%)"
                    )
                else:
                    # no annotated duration: a percentage is undefined
                    typer.echo(f" {duration_to_str(speech)} of speech")
                typer.echo(f" {len(speakers)} speakers")
def extract(protocol_name, file_finder, experiment_dir, robust=False,
            parallel=False):
    """Run feature extraction on every file of a protocol.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    file_finder
        Forwarded to `helper_extract`.
    experiment_dir : str
        Directory that contains 'config.yml' and is used as `Precomputed`
        root directory.
    robust : bool, optional
        Forwarded to `helper_extract`. Defaults to False.
    parallel : bool, optional
        When True, files are processed by a `multiprocessing.Pool` with one
        worker per CPU. Defaults to False.
    """
    protocol = get_protocol(protocol_name)

    # load configuration file
    config_yml = experiment_dir + "/config.yml"
    with open(config_yml, "r") as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    FeatureExtraction = get_class_by_name(
        config["feature_extraction"]["name"],
        default_module_name="pyannote.audio.features",
    )
    feature_extraction = FeatureExtraction(
        **config["feature_extraction"].get("params", {})
    )

    sliding_window = feature_extraction.sliding_window
    dimension = feature_extraction.dimension

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(
        root_dir=experiment_dir, sliding_window=sliding_window,
        dimension=dimension
    )

    pool = None
    if parallel:
        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            config_yml=config_yml,
            robust=robust,
        )
        pool = Pool(cpu_count())
        imap = pool.imap
    else:
        feature_extraction = init_feature_extraction(experiment_dir)
        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            feature_extraction=feature_extraction,
            robust=robust,
        )
        imap = map

    try:
        for result in imap(extract_one, protocol.files()):
            if result is None:
                continue
            print(result)
    finally:
        # release worker processes: either every result has been consumed,
        # or an error occurred and the remaining work is moot
        if pool is not None:
            pool.terminate()
            pool.join()
def main():
    """Binarize precomputed overlap scores and write the regions to a file.

    Expects two positional command-line arguments (database protocol name
    and path to the raw scores). Scores of the development subset are used
    with ``--dev``, those of the test subset otherwise.
    """
    usage = "%prog [options] database, raw_score_path"
    desc = "Write the output of the binary overlap detector into test based on a threshold"
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    parser.add_option("-t", "--onset", action="store", type="float",
                      help="Onset Threshold", default=0.70)
    parser.add_option("-f", "--offset", action="store", type="float",
                      help="Offset Threshold", default=0.70)
    parser.add_option("-d", "--dev", action="store_true",
                      help="Print output based on development set",
                      default=False)
    parser.add_option("-o", "--outputfile", action="store", type="string",
                      help="Output file", default="./overlap.txt")
    (opt, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Incorrect number of arguments")
    database, raw_score_path = args

    # get test file of protocol
    protocol = get_protocol(database)

    # load precomputed overlap scores as pyannote.core.SlidingWindowFeature
    precomputed = Precomputed(raw_score_path)

    # initialize binarizer
    # onset / offset are tunable parameters (and should be tuned for better
    # performance). we use log_scale=True because of the final log-softmax
    # in the model
    binarize = Binarize(onset=opt.onset, offset=opt.offset, log_scale=True)

    # the context manager closes the output file even if a file fails
    with open(opt.outputfile, 'wt') as fw:
        files = protocol.development() if opt.dev else protocol.test()
        for test_file in files:
            ovl_scores = precomputed(test_file)

            # binarize overlap scores to obtain overlap regions as
            # pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']

            # write the output into text
            write_txt(fw, ovl_regions)
def validate_init(self, protocol_name, subset='development'):
    """Check that the protocol can be used in "validation" mode.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    subset : str, optional
        Not used by this check. Defaults to 'development'.

    Raises
    ------
    ValueError
        If the protocol is neither a speaker verification nor a speaker
        diarization protocol.
    """
    protocol = get_protocol(protocol_name)

    if isinstance(
            protocol,
            (SpeakerVerificationProtocol, SpeakerDiarizationProtocol)):
        return

    # trailing space added: the two literals used to join as "aresupported"
    msg = ('Only SpeakerVerification or SpeakerDiarization tasks are '
           'supported in "validation" mode.')
    raise ValueError(msg)
def __init__(self, batch_size: int, segment_size_millis: int,
             segments_per_speaker: int = 1):
    """Store batch/segment settings, build the config and load the protocol.

    Parameters
    ----------
    batch_size : int
        Stored as `self.batch_size`.
    segment_size_millis : int
        Segment size in milliseconds; also kept in seconds as
        `self.segment_size_s` and in samples (at 16000 Hz) as `self.nfeat`.
    segments_per_speaker : int, optional
        Stored as `self.segments_per_speaker`. Defaults to 1.
    """
    sample_rate = 16000
    segment_seconds = segment_size_millis / 1000

    self.sample_rate = sample_rate
    self.batch_size = batch_size
    self.segments_per_speaker = segments_per_speaker
    self.segment_size_s = segment_seconds
    # number of samples in one segment
    self.nfeat = sample_rate * segment_size_millis // 1000

    self.config = self._create_config(segment_seconds)
    self.protocol = get_protocol(
        self.config.protocol_name,
        preprocessors=self.config.preprocessors)

    # generators start out unset
    self.train_gen = None
    self.dev_gen = None
    self.test_gen = None

    print(f"[Segment Size: {self.segment_size_s}s]")
    print(f"[Network Input Size: {self.nfeat}]")
def __init__(self, collection: Optional[NoiseCollection] = None):
    """Load the files of one or several noise collections.

    Parameters
    ----------
    collection : optional
        Collection name, or list/tuple of names, handed to `get_protocol`.
        Defaults to "MUSAN.Collection.BackgroundNoise".
    """
    if collection is None:
        collection = "MUSAN.Collection.BackgroundNoise"

    # always store a sequence of collection names
    is_sequence = isinstance(collection, (list, tuple))
    self.collection = collection if is_sequence else [collection]

    self.files_ = []
    preprocessors = {'audio': FileFinder(), 'duration': get_audio_duration}
    for name in self.collection:
        noise_protocol = get_protocol(name, preprocessors=preprocessors)
        self.files_.extend(noise_protocol.files())
def speakers(args):
    """Print the number of speakers and the label chart of one file.

    The hypothesis is read from an RTTM file when
    ``args['<hypotheses_path>']`` exists on disk; otherwise that argument is
    interpreted as a protocol name and the file's annotation is used.

    Parameters
    ----------
    args : dict
        Command-line arguments. Keys read here: ``<hypotheses_path>`` and
        ``<uri>``.
    """
    hypotheses_path = args['<hypotheses_path>']
    uri = args['<uri>']

    if Path(hypotheses_path).exists():
        hypothesis = load_rttm(hypotheses_path)[uri]
    else:
        # not a file: interpret <hypotheses_path> as a protocol name
        protocol = get_protocol(hypotheses_path)
        reference = get_file(protocol, uri)
        hypothesis = reference['annotation']

    print(uri)
    print(f"Number of speakers: {len(hypothesis.labels())}")
    print(f"Chart:\n{hypothesis.chart()}")
def _validation_set(self, protocol_name, subset='development'):
    """Return the 'y' entries preprocessed for every file of the subset.

    A `SpeechActivityDetectionBatchGenerator` is used only for its
    preprocessing step: with caching enabled, what it computes for each
    file is kept in its `preprocessed_` attribute.
    """
    # this generator is hacked to generate y_true
    # (which is stored in its internal preprocessed_ attribute)
    generator = SpeechActivityDetectionBatchGenerator(
        self.feature_extraction_)
    generator.cache_preprocessed_ = True

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # preprocess each file of the subset to fill the cache
    for item in getattr(protocol, subset)():
        generator.preprocess(item, identifier=get_unique_identifier(item))

    return generator.preprocessed_['y']
def tune_binarizer(app, epoch, protocol_name, subset='development'):
    """Tune the onset/offset thresholds of the binarizer

    Parameters
    ----------
    app : SpeechActivityDetection
        Provides the preprocessors, training directory, configuration and
        feature extraction.
    epoch : int
        Epoch whose model is loaded from disk.
    protocol_name : str
        E.g. 'Etape.SpeakerDiarization.TV'
    subset : {'train', 'development', 'test'}, optional
        Defaults to 'development'.

    Returns
    -------
    params : dict
        See Binarize.tune
    metric : float
        Best achieved detection error rate
    """
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=app.preprocessors_)

    # model obtained at the requested epoch
    model = SequenceLabeling.from_disk(app.train_dir_, epoch)

    sequences = app.config_['sequences']
    aggregation = SequenceLabelingAggregation(
        model, app.feature_extraction_,
        duration=sequences['duration'], step=sequences['step'])
    aggregation.cache_preprocessed_ = False

    # tune Binarize thresholds (onset & offset)
    # with respect to detection error rate
    subset_files = getattr(protocol, subset)()
    binarize_params, metric = Binarize.tune(
        subset_files, aggregation.apply,
        get_metric=DetectionErrorRate, dimension=1)

    return binarize_params, metric
def _validate_init_segment(self, protocol_name, subset='development'):
    """Draw one fixed batch of speech segments for validation.

    The numpy random seed is fixed so that the batch is reproducible.

    Returns
    -------
    dict
        'X' is the stacked batch input; 'y' is the output of
        `pdist(..., metric='equal')` over the batch labels.
    """
    np.random.seed(1337)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    generator = SpeechSegmentGenerator(
        self.feature_extraction_, per_label=10, duration=self.duration)
    first_batch = next(generator(protocol, subset=subset))

    X = np.stack(first_batch['X'])
    labels = np.stack(first_batch['y']).reshape((-1, 1))

    # precompute same/different groundtruth
    return {'X': X, 'y': pdist(labels, metric='equal')}
def train(self, protocol_name, subset='train', restart=None, epochs=1000):
    """Fit the task on a protocol subset.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    subset : str, optional
        Defaults to 'train'.
    restart : optional
        Forwarded to `self.task_.fit`. Defaults to None.
    epochs : int, optional
        Forwarded to `self.task_.fit`. Defaults to 1000.
    """
    # logs go to the training directory derived from the TRAIN_DIR template
    log_dir = self.TRAIN_DIR.format(
        experiment_dir=self.experiment_dir,
        protocol=protocol_name,
        subset=subset)

    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    fit_options = dict(
        subset=subset,
        restart=restart,
        epochs=epochs,
        get_optimizer=self.get_optimizer_,
        get_scheduler=self.get_scheduler_,
        learning_rate=self.learning_rate_,
        log_dir=log_dir,
        device=self.device,
    )
    self.task_.fit(self.model_, self.feature_extraction_, protocol,
                   **fit_options)
def __init__(self, collection=None, snr_min=5, snr_max=20):
    """Load the noise files and store the SNR bounds.

    Parameters
    ----------
    collection : str or list or tuple, optional
        Collection name(s) handed to `get_protocol`.
        Defaults to 'MUSAN.Collection.BackgroundNoise'.
    snr_min, snr_max : optional
        Stored as-is on the instance. Default to 5 and 20.
    """
    super().__init__()

    if collection is None:
        collection = 'MUSAN.Collection.BackgroundNoise'
    if isinstance(collection, (list, tuple)):
        names = collection
    else:
        names = [collection]

    self.collection = names
    self.snr_min = snr_min
    self.snr_max = snr_max

    # load noise database
    self.files_ = []
    preprocessors = {'audio': FileFinder(), 'duration': get_audio_duration}
    for name in self.collection:
        noise_files = get_protocol(name, preprocessors=preprocessors).files()
        self.files_.extend(noise_files)
def _validate_init_segment(self, protocol_name, subset='development'):
    """Build the fixed validation batch of speech segments.

    Seeds numpy's global random generator, takes the first batch produced
    by a `SpeechSegmentGenerator` and returns its stacked 'X' together with
    the `pdist(..., metric='equal')` result of its labels as 'y'.
    """
    np.random.seed(1337)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    sampler = SpeechSegmentGenerator(self.feature_extraction_,
                                     per_label=10,
                                     duration=self.duration)
    batch = next(sampler(protocol, subset=subset))

    stacked_input = np.stack(batch['X'])
    # labels as a column vector
    label_column = np.stack(batch['y']).reshape((-1, 1))

    # precompute same/different groundtruth
    groundtruth = pdist(label_column, metric='equal')

    return {'X': stacked_input, 'y': groundtruth}
def train(self, protocol_name, subset='train'):
    """Fit a `SequenceLabeling` model on a protocol subset.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    subset : str, optional
        Defaults to 'train'.

    Returns
    -------
    labeling : SequenceLabeling
        The fitted instance.
    """
    log_dir = self.TRAIN_DIR.format(experiment_dir=self.experiment_dir,
                                    protocol=protocol_name,
                                    subset=subset)

    # sequence batch generator, configured from the 'sequences' section
    sequences = self.config_['sequences']
    batch_size = sequences.get('batch_size', 8192)
    duration = sequences['duration']
    step = sequences['step']

    batches = SpeechActivityDetectionBatchGenerator(
        self.feature_extraction_,
        duration=duration, step=step, batch_size=batch_size)
    batches.cache_preprocessed_ = self.cache_preprocessed_

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # total train duration
    annotated_total = protocol.stats(subset)['annotated']

    # number of batches per epoch
    steps_per_epoch = int(np.ceil((annotated_total / step) / batch_size))

    # input shape (n_frames, n_features)
    input_shape = batches.shape

    # generator that loops infinitely over all training files
    generator = batches(getattr(protocol, subset)(), infinite=True)

    labeling = SequenceLabeling()
    labeling.fit(input_shape, self.architecture_, generator,
                 steps_per_epoch, 1000,
                 optimizer=SSMORMS3(), log_dir=log_dir)

    return labeling
def validate_epoch(self, epoch, validation_data, protocol=None, **kwargs):
    """Dispatch epoch validation according to the protocol type.

    Parameters
    ----------
    epoch
        Forwarded to the task-specific validation method.
    validation_data
        Forwarded to the task-specific validation method.
    protocol : optional
        Protocol name handed to `get_protocol`; also forwarded unchanged.
    **kwargs
        Forwarded to the task-specific validation method.

    Returns
    -------
    Whatever `_validate_epoch_verification` or
    `_validate_epoch_diarization` returns.

    Raises
    ------
    ValueError
        If the protocol is neither a speaker verification nor a speaker
        diarization protocol.
    """
    _protocol = get_protocol(protocol)

    if isinstance(_protocol, SpeakerVerificationProtocol):
        return self._validate_epoch_verification(
            epoch, validation_data, protocol=protocol, **kwargs)

    if isinstance(_protocol, SpeakerDiarizationProtocol):
        return self._validate_epoch_diarization(
            epoch, validation_data, protocol=protocol, **kwargs)

    # trailing space added: the two literals used to join as "aresupported"
    msg = ("Only SpeakerVerification or SpeakerDiarization tasks are "
           'supported in "validation" mode.')
    raise ValueError(msg)
def train(self, protocol_name, subset='train', restart=None, epochs=1000):
    """Train the model of this application on one protocol subset.

    `protocol_name` is loaded through `get_protocol` and handed to
    `self.task_.fit` along with `subset`, `restart`, `epochs` and the
    optimizer, scheduler, learning rate and device stored on the instance.
    """
    train_dir = self.TRAIN_DIR.format(
        experiment_dir=self.experiment_dir,
        protocol=protocol_name,
        subset=subset,
    )

    loaded_protocol = get_protocol(
        protocol_name,
        progress=True,
        preprocessors=self.preprocessors_,
    )

    task = self.task_
    task.fit(
        self.model_,
        self.feature_extraction_,
        loaded_protocol,
        subset=subset,
        restart=restart,
        epochs=epochs,
        get_optimizer=self.get_optimizer_,
        get_scheduler=self.get_scheduler_,
        learning_rate=self.learning_rate_,
        log_dir=train_dir,
        device=self.device,
    )
def apply(self, protocol_name, output_dir, step=None):
    """Extract embeddings for every file of a protocol and dump them.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    output_dir
        Root directory of the `Precomputed` instance the results are
        dumped with.
    step : optional
        Step of the sliding extraction. Defaults to a quarter of
        `self.duration`.
    """
    model = self.model_.to(self.device)
    model.eval()

    duration = self.duration
    step = 0.25 * duration if step is None else step

    # do not use memmap as this would lead to too many open files
    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    extractor = SequenceEmbedding(
        model, self.feature_extraction_,
        duration=duration, step=step,
        batch_size=self.batch_size, device=self.device)

    # create metadata file at root that contains
    # sliding window and dimension information
    writer = Precomputed(root_dir=output_dir,
                         sliding_window=extractor.sliding_window,
                         dimension=extractor.dimension)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for item in FileFinder.protocol_file_iter(protocol,
                                              extra_keys=['audio']):
        writer.dump(item, extractor.apply(item))
def check(protocol_name, file_finder, experiment_dir):
    """Report files whose precomputed features look inconsistent.

    For every file of the 'development', 'test' and 'train' subsets, prints
    a message when the audio file cannot be found, when precomputed
    features cannot be loaded, when the audio and features durations differ
    by more than one second, or when the features contain NaN.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    file_finder : callable
        Called with each file to obtain its 'audio' entry.
    experiment_dir
        Root directory handed to `Precomputed`.
    """
    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        # probe the subset: skip it when the protocol does not implement it.
        # The default passed to next() keeps an empty subset from raising
        # StopIteration.
        try:
            next(getattr(protocol, subset)(), None)
        except NotImplementedError:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration,
                              features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
def check(protocol_name, file_finder, experiment_dir):
    """Sanity-check the precomputed features of every file of a protocol.

    Iterates over the 'development', 'test' and 'train' subsets and prints
    one line per problem found: missing audio file, features that cannot
    be loaded, audio/features duration mismatch (tolerance of one second)
    and NaN values in the features.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    file_finder : callable
        Called with each file to obtain its 'audio' entry.
    experiment_dir
        Root directory handed to `Precomputed`.
    """
    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        # fetch at most one item to find out whether the subset is
        # implemented; the None default avoids an uncaught StopIteration
        # when the subset exists but is empty
        try:
            next(getattr(protocol, subset)(), None)
        except NotImplementedError:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                current_file['audio'] = file_finder(current_file)
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            features_duration = features.getExtent().duration
            if not np.isclose(duration, features_duration, atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
def apply(self, protocol_name, output_dir, step=None):
    """Apply sequence labeling to every file of a protocol and dump scores.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    output_dir
        Root directory of the `Precomputed` instance the results are
        dumped with.
    step : optional
        Step of the sliding window. Defaults to a quarter of the task
        duration.
    """
    model = self.model_.to(self.device)
    model.eval()

    duration = self.task_.duration
    if step is None:
        step = 0.25 * duration

    # do not use memmap as this would lead to too many open files
    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize sequence labeling; `step` is now honored (a hard-coded
    # quarter of the duration used to be passed whatever the argument)
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    sliding_window = sequence_labeling.sliding_window
    n_classes = self.task_.n_classes

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(
        root_dir=output_dir,
        sliding_window=sliding_window,
        dimension=n_classes)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for current_file in FileFinder.protocol_file_iter(
            protocol, extra_keys=['audio']):
        fX = sequence_labeling.apply(current_file)
        precomputed.dump(current_file, fX)
def fun(threshold):
    """Return `abs()` of the detection error rate accumulated at `threshold`.

    `threshold` is used both as onset and offset of the binarizer. The
    names `protocol_name`, `subset`, `predictions` and `self` are taken
    from the enclosing scope.
    """
    binarizer = Binarize(onset=threshold, offset=threshold,
                         log_scale=False)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    error_rate = DetectionErrorRate()

    # NOTE -- embarrasingly parallel
    # TODO -- parallelize this
    for item in getattr(protocol, subset)():
        uri = get_unique_identifier(item)
        scores = predictions[uri]
        hypothesis = binarizer.apply(scores, dimension=0).to_annotation()

        reference = item['annotation']
        uem = get_annotated(item)
        # accumulate; the per-file value itself is not needed
        error_rate(reference, hypothesis, uem=uem)

    return abs(error_rate)
def eval(self, model, partition: str = 'development'):
    """Score the trials of a partition and return one minus the EER.

    Parameters
    ----------
    model
        Model put in eval mode and used for embedding extraction.
    partition : str, optional
        Prefix of the protocol's ``<partition>_trial`` method.
        Defaults to 'development'.

    Returns
    -------
    metric : float
        ``1 - eer``.
    y_pred : list
        One distance per trial.
    y_true : list
        The 'reference' entry of each trial.
    """
    model.eval()

    embedder = SequenceEmbedding(
        model=model,
        feature_extraction=self.config.feature_extraction,
        duration=self.config.duration,
        step=.5 * self.config.duration,
        batch_size=self.batch_size,
        device=common.DEVICE)

    protocol = get_protocol(self.config.protocol_name, progress=False,
                            preprocessors=self.config.preprocessors)

    y_true = []
    y_pred = []
    cache = {}
    trials = getattr(protocol, f"{partition}_trial")()
    for trial in trials:
        # Compute embeddings
        first = self._file_embedding(trial['file1'], embedder, cache)
        second = self._file_embedding(trial['file2'], embedder, cache)

        # Compare embeddings
        distances = cdist(first, second,
                          metric=self.distance.to_sklearn_metric())
        y_pred.append(distances[0, 0])
        y_true.append(trial['reference'])

    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)

    # Returning 1-eer because the evaluator keeps track of the highest
    # metric value
    return 1 - eer, y_pred, y_true
def validate_init(self, protocol_name: Text, subset: Subset = "development"):
    """Prepare the data used for validation.

    Parameters
    ----------
    protocol_name : `str`
        Name handed to `get_protocol`.
    subset : {'train', 'development', 'test'}
        Defaults to 'development'.

    Returns
    -------
    validation_data : list of dict
        One plain dict per file of the subset. Unless the feature extraction
        is `Precomputed` or `RawAudio`, each dict also carries its extracted
        "features".
    """
    # note: missing entries are added to `self.preprocessors_` itself
    preprocessors = self.preprocessors_
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration

    protocol = get_protocol(protocol_name, preprocessors=preprocessors)

    # convert lazy ProtocolFile to regular dict for multiprocessing
    validation_data = list(map(dict, getattr(protocol, subset)()))

    features_on_the_fly = not isinstance(self.feature_extraction_,
                                         (Precomputed, RawAudio))
    if features_on_the_fly:
        for entry in tqdm(validation_data, desc="Feature extraction"):
            entry["features"] = self.feature_extraction_(entry)

    return validation_data
def apply(self, protocol_name, output_dir):
    """Apply the pipeline to every file of a protocol and write a text file.

    Results are written to ``<output_dir>/<protocol_name>.txt``, one line
    per segment: 'uri start end' when the pipeline returns a `Timeline`,
    'uri start end track label' otherwise.
    """
    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    mkdir_p(output_dir)
    output_path = Path(output_dir) / f'{protocol_name}.txt'

    with open(output_path, mode='w') as fp:
        files = FileFinder.protocol_file_iter(protocol,
                                              extra_keys=['audio'])
        for item in files:
            uri = get_unique_identifier(item)
            hypothesis = self.pipeline_.apply(item)

            if isinstance(hypothesis, Timeline):
                # timelines carry neither tracks nor labels
                for segment in hypothesis:
                    fp.write(
                        f'{uri} {segment.start:.3f} {segment.end:.3f}\n')
            else:
                tracks = hypothesis.itertracks(yield_label=True)
                for segment, track, label in tracks:
                    fp.write(
                        f'{uri} {segment.start:.3f} {segment.end:.3f} {track} {label}\n')
def xp_objective(args, **kwargs):
    """Objective: greedy diarization error rate on the Etape TV train files.

    Parameters
    ----------
    args : dict
        Pipeline parameters. 'cls__damping' and 'cls__preference' are
        converted to float in place; the whole dict is then passed to
        `SpeakerDiarizationPre`.
    **kwargs
        Ignored.

    Returns
    -------
    100 when the pipeline yields no hypothesis for a file or runs out of
    memory, `abs(metric)` otherwise.
    """
    import sys
    sys.path.append("/people/yin/projects/")

    from pyannote.database import get_protocol, get_annotated, FileFinder
    protocol = get_protocol('Etape.SpeakerDiarization.TV',
                            preprocessors={'audio': FileFinder()})

    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    metric = GreedyDiarizationErrorRate()

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed
    feature_extraction = Precomputed(
        '/vol/work1/bredin/feature_extraction/mfcc')

    # locations of the precomputed inputs handed to the pipeline
    sad_pre = '/vol/work1/yin/speech_activity_detection/shallow/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.TV.development/apply'
    scd_pre = '/vol/work1/yin/speaker_change_detection/paper/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.Debug.development/apply'
    emb_pre = '/vol/work1/yin/embedding/20180124'

    for key in ('cls__damping', 'cls__preference'):
        args[key] = float(args[key])

    pipeline = speaker_diarization.SpeakerDiarizationPre(
        feature_extraction, sad_pre, scd_pre, emb_pre, **args)

    try:
        for item in protocol.train():
            hypothesis = pipeline(item, annotated=True)
            if hypothesis is None:
                return 100
            reference = item['annotation']
            uem = get_annotated(item)
            metric(reference, hypothesis, uem=uem)
    except MemoryError:
        return 100

    return abs(metric)
# coding: utf-8 import sys sys.path.append("../") import clustering import numpy as np from pyannote.audio.features import Precomputed precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings') from pyannote.database import get_protocol, FileFinder protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True) from pyannote.core import Annotation,Segment, Timeline # enrolment consists in summing all relevant embeddings def speaker_spotting_enrol(current_enrolment): enrol_with = current_enrolment['enrol_with'] embeddings = precomputed(current_enrolment) return np.sum(embeddings.crop(enrol_with), axis=0, keepdims=True) models = {} for current_enrolment in protocol.test_enrolment(): model_id = current_enrolment.pop('model_id') models[model_id] = speaker_spotting_enrol(current_enrolment) REFERENCE = {} for current_file in protocol.test(): uri = current_file['uri'] if uri not in REFERENCE: REFERENCE[uri] = Annotation(uri=uri)
def extract(protocol_name, file_finder, experiment_dir, robust=False,
            parallel=False):
    """Run feature extraction on every file of a protocol.

    Parameters
    ----------
    protocol_name : str
        Name handed to `get_protocol`.
    file_finder
        Forwarded to `helper_extract`.
    experiment_dir : str
        Directory that contains 'config.yml' and is used as `Precomputed`
        root directory.
    robust : bool, optional
        Forwarded to `helper_extract`. Defaults to False.
    parallel : bool, optional
        When True, files are processed by a `multiprocessing.Pool` with one
        worker per CPU. Defaults to False.
    """
    protocol = get_protocol(protocol_name, progress=False)

    # load configuration file
    # safe_load: `yaml.load` without an explicit Loader can build arbitrary
    # objects and is rejected by recent PyYAML versions
    config_yml = experiment_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    sliding_window = feature_extraction.sliding_window()
    dimension = feature_extraction.dimension()

    # optional normalization, also described in the configuration file
    if 'normalization' in config:
        normalization_name = config['normalization']['name']
        normalization_module = __import__(
            'pyannote.audio.features.normalization',
            fromlist=[normalization_name])
        Normalization = getattr(normalization_module, normalization_name)
        normalization = Normalization(
            **config['normalization'].get('params', {}))
    else:
        normalization = None

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=experiment_dir,
                              sliding_window=sliding_window,
                              dimension=dimension)

    pool = None
    if parallel:
        extract_one = functools.partial(helper_extract,
                                        file_finder=file_finder,
                                        experiment_dir=experiment_dir,
                                        config_yml=config_yml,
                                        normalization=normalization,
                                        robust=robust)
        pool = Pool(cpu_count())
        imap = pool.imap
    else:
        feature_extraction = init_feature_extraction(experiment_dir)
        extract_one = functools.partial(helper_extract,
                                        file_finder=file_finder,
                                        experiment_dir=experiment_dir,
                                        feature_extraction=feature_extraction,
                                        normalization=normalization,
                                        robust=robust)
        imap = map

    try:
        files = FileFinder.protocol_file_iter(protocol,
                                              extra_keys=['audio'])
        for result in imap(extract_one, files):
            if result is None:
                continue
            print(result)
    finally:
        # release worker processes: either every result has been consumed,
        # or an error occurred and the remaining work is moot
        if pool is not None:
            pool.terminate()
            pool.join()
return res def process_trial(trial, scores): res = {} pscores = process_score(scores) res['uri'] = trial['uri'] res['model_id'] = trial['model_id'] res['scores'] = pscores return res if __name__ == '__main__': arguments = docopt(__doc__, version='Speaker-spotting') # protocol protocol_name = arguments['<database.task.protocol>'] protocol = get_protocol(protocol_name, progress=True) # subset (train, development, or test) subset = arguments['--subset'] output_file = arguments['<output_file>'] from pyannote.audio.features import Precomputed precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings') models = {} enrolments = getattr(protocol, '{subset}_enrolment'.format(subset=subset))() for current_enrolment in enrolments: model_id = current_enrolment.pop('model_id') models[model_id] = speaker_spotting_enrol(current_enrolment) if arguments['oracle']: REFERENCE = {} for current_file in getattr(protocol,subset)():
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):
    """Search the peak-detection threshold giving the best coverage
    among those reaching at least `self.purity`.

    Parameters
    ----------
    epoch : int
        Epoch whose model is loaded with `self.load_model`.
    protocol_name : str
        Name passed to `get_protocol`.
    subset : str, optional
        Name of the protocol method to iterate over.
        Defaults to 'development'.
    validation_data : optional
        Not used by this method.

    Returns
    -------
    metrics : dict
        Best coverage found (to be maximized) and the corresponding
        threshold.
    """
    min_purity = self.purity

    # load model for current epoch
    network = self.load_model(epoch).to(self.device)
    network.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    labeler = SequenceLabeling(
        network, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # raw scores are computed once per file and reused by every
    # iteration of the search below
    predictions = {}
    for item in getattr(protocol, subset)():
        key = get_unique_identifier(item)
        predictions[key] = labeler.apply(item)

    # dichotomic search to find alpha that maximizes coverage
    # while having at least `min_purity`
    low, high = 0., 1.
    best_alpha = .5 * (low + high)
    best_coverage = 0.

    for _ in range(10):

        alpha = .5 * (low + high)
        peak = Peak(alpha=alpha, min_duration=0.0,
                    log_scale=network.logsoftmax)
        metric = DiarizationPurityCoverageFMeasure()

        # NOTE -- embarrassingly parallel
        # TODO -- parallelize this
        for item in getattr(protocol, subset)():
            reference = item['annotation']
            key = get_unique_identifier(item)
            hypothesis = peak.apply(
                predictions[key], dimension=1).to_annotation()
            metric(reference, hypothesis, uem=get_annotated(item))

        purity, coverage, _ = metric.compute_metrics()

        if purity < min_purity:
            # purity target missed: continue in the lower half
            high = alpha
            continue

        low = alpha
        if coverage > best_coverage:
            best_coverage, best_alpha = coverage, alpha

    task = 'speaker_change_detection'
    metric_name = f'{task}/coverage@{min_purity:.2f}purity'
    return {
        metric_name: {'minimize': False, 'value': best_coverage},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
def train(self, protocol_name, subset='development', n_calls=1):
    """Tune the pipeline on a protocol subset and log progress.

    Statuses yielded by `self.pipeline_.tune_iter` are written to a
    `SummaryWriter` and summarized in a progress bar. Every new best set of
    parameters is dumped with `self.dump`. Once the loop ends, the best
    parameters stored in the tuning database are dumped again and printed.

    Parameters
    ----------
    protocol_name : str
        Name passed to `get_protocol`.
    subset : str, optional
        Subset forwarded to `tune_iter`. Defaults to 'development'.
    n_calls : int, optional
        The loop stops as soon as the `n_calls`-th status is yielded; that
        last status is not logged. Defaults to 1.
    """
    train_dir = self.TRAIN_DIR.format(
        experiment_dir=self.experiment_dir,
        protocol=protocol_name,
        subset=subset)

    mkdir_p(train_dir)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    tune_db = f'{train_dir}/tune.db'
    params_yml = f'{train_dir}/params.yml'
    params_yml_lock = f'{train_dir}/params.yml.lock'

    # one log directory per process
    pid = os.getpid()
    writer = SummaryWriter(log_dir=f"{train_dir}/{pid}")

    progress_bar = tqdm(unit='trial')
    progress_bar.set_description('Trial #1 : ...')
    progress_bar.update(0)

    iterations = self.pipeline_.tune_iter(
        tune_db, protocol, subset=subset,
        sampler=self.sampler_)

    # Stay None until a status carrying 'new_best' has been seen, so that
    # the progress bar never reads an unbound name.
    best_loss = None
    n_trials = None

    for s, status in enumerate(iterations):

        if s + 1 == n_calls:
            break

        loss = status['latest']['loss']
        writer.add_scalar(f'train/{protocol_name}.{subset}/loss/latest',
                          loss, global_step=s + 1)
        writer.add_scalars(
            f'train/{protocol_name}.{subset}/params/latest',
            status['latest']['params'], global_step=s + 1)

        if 'new_best' in status:
            _ = self.dump(status['new_best'], params_yml, params_yml_lock)
            n_trials = status['new_best']['n_trials']
            best_loss = status['new_best']['loss']
            writer.add_scalar(f'train/{protocol_name}.{subset}/loss/best',
                              best_loss, global_step=n_trials)
            writer.add_scalars(
                f'train/{protocol_name}.{subset}/params/best',
                status['new_best']['params'], global_step=n_trials)

        # progress bar: losses below 1 (in absolute value) are shown
        # as percentages
        as_percentage = abs(loss) < 1
        desc = f"Trial #{s+1}"
        if as_percentage:
            desc += f" = {100 * loss:.3f}%"
        else:
            desc += f" = {loss:.3f}"

        if best_loss is not None:
            if as_percentage:
                desc += (f" : Best = {100 * best_loss:.3f}% "
                         f"after {n_trials} trials")
            else:
                desc += f" : Best = {best_loss:.3f} after {n_trials} trials"

        progress_bar.set_description(desc=desc)
        progress_bar.update(1)

    best = self.pipeline_.best(tune_db)
    content = self.dump(best, params_yml, params_yml_lock)

    # separator as wide as the widest printed line
    sep = "=" * max(len(params_yml),
                    max(len(line) for line in content.split('\n')))
    print(f"\n{sep}\n{params_yml}\n{sep}\n{content}{sep}")
    print(f"Loss = {best['loss']:g} | {best['n_trials']} trials")
    print(f"{sep}")
# coding: utf-8

# Requirements:
#
# ```bash
# $ pip install pyannote.metrics==1.4.1
# $ pip install pyannote.db.odessa.ami==0.5.1
# ```

import clustering
import numpy as np

# embeddings are read from this root directory
from pyannote.audio.features import Precomputed
precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings')

from pyannote.database import get_protocol, FileFinder
protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True)


def speaker_spotting_enrol(current_enrolment):
    """Build a model by summing all embeddings of the enrolment data.

    `current_enrolment['enrol_with']` selects which part of the precomputed
    embeddings is summed (along the first axis, kept as a length-1 axis).
    """
    selection = current_enrolment['enrol_with']
    cropped = precomputed(current_enrolment).crop(selection)
    return np.sum(cropped, axis=0, keepdims=True)


# one model per enrolment, indexed by its 'model_id'
models = {}
for current_enrolment in protocol.development_enrolment():
    model_id = current_enrolment.pop('model_id')
    models[model_id] = speaker_spotting_enrol(current_enrolment)

# reference annotation of each development file, indexed by its 'uri'
REFERENCE = dict(
    (current_file['uri'], current_file['annotation'])
    for current_file in protocol.development())
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):
    """Search the binarization threshold giving the best recall
    among those reaching at least `self.precision`.

    Parameters
    ----------
    epoch : int
        Epoch whose model is loaded with `self.load_model`.
    protocol_name : str
        Name passed to `get_protocol`.
    subset : str, optional
        Name of the protocol method to iterate over.
        Defaults to 'development'.
    validation_data : optional
        Not used by this method.

    Returns
    -------
    metrics : dict
        Best recall found (to be maximized) and the corresponding
        threshold.
    """
    min_precision = self.precision

    # load model for current epoch
    network = self.load_model(epoch).to(self.device)
    network.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    labeler = SequenceLabeling(
        network, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    predictions, references = {}, {}

    for item in getattr(protocol, subset)():
        key = get_unique_identifier(item)

        # build overlap reference: intersection of every pair of
        # distinct co-occurring tracks
        overlap = Timeline(uri=key)
        annotation = item['annotation']
        for left, right in annotation.co_iter(annotation):
            if left != right:
                overlap.add(left[0] & right[0])
        references[key] = overlap.to_annotation()

        # extract overlap scores (column 2 of the model output),
        # leaving log-domain when needed
        raw = labeler.apply(item)
        column = raw.data[:, 2]
        if network.logsoftmax:
            column = np.exp(column)
        predictions[key] = SlidingWindowFeature(column, raw.sliding_window)

    # dichotomic search to find threshold that maximizes recall
    # while having at least `min_precision`
    low, high = 0., 1.
    best_alpha = .5 * (low + high)
    best_recall = 0.

    for _ in range(10):

        alpha = .5 * (low + high)
        binarizer = Binarize(onset=alpha, offset=alpha, log_scale=False)

        precision = DetectionPrecision()
        recall = DetectionRecall()

        for item in getattr(protocol, subset)():
            key = get_unique_identifier(item)
            reference = references[key]
            hypothesis = binarizer.apply(
                predictions[key], dimension=0).to_annotation()
            uem = get_annotated(item)
            _ = precision(reference, hypothesis, uem=uem)
            _ = recall(reference, hypothesis, uem=uem)

        if abs(precision) < min_precision:
            # precision is not high enough: try higher thresholds
            low = alpha
            continue

        high = alpha
        current_recall = abs(recall)
        if current_recall > best_recall:
            best_recall, best_alpha = current_recall, alpha

    task = 'overlap_speech_detection'
    metric_name = f'{task}/recall@{min_precision:.2f}precision'
    return {
        metric_name: {'minimize': False, 'value': best_recall},
        f'{task}/threshold': {'minimize': 'NA', 'value': best_alpha}}
def _validate_epoch_verification(self, epoch, protocol_name,
                                 subset='development',
                                 validation_data=None):
    """Perform a speaker verification experiment using model at `epoch`

    Parameters
    ----------
    epoch : int
        Epoch to validate.
    protocol_name : str
        Name of speaker verification protocol
    subset : {'train', 'development', 'test'}, optional
        Name of subset.
    validation_data : provided by `validate_init`

    Returns
    -------
    metrics : dict
        Contains a single 'EER' entry (to be minimized).
    """

    # load current model
    model = self.load_model(epoch).to(self.device)
    model.eval()

    # use user-provided --duration when available
    # otherwise use 'duration' used for training
    if self.duration is None:
        duration = self.task_.duration
    else:
        duration = self.duration
    min_duration = None

    # if 'duration' is still None, it means that
    # network was trained with variable lengths
    if duration is None:
        duration = self.task_.max_duration
        min_duration = self.task_.min_duration

    step = .5 * duration

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_, duration=duration,
        step=step, min_duration=min_duration,
        batch_size=self.batch_size, device=self.device)

    metrics = {}
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # `enrolment_keys` maps (file identifier, segments) to a model_id.
    # The key itself is stored, not `hash(key)`, so that two different
    # keys sharing a hash value can never be confused.
    enrolment_models, enrolment_keys = {}, {}
    enrolments = getattr(protocol, '{0}_enrolment'.format(subset))()
    for enrolment in enrolments:
        data = sequence_embedding.apply(enrolment,
                                        crop=enrolment['enrol_with'])
        model_id = enrolment['model_id']
        enrolment_models[model_id] = np.mean(
            np.stack(data), axis=0, keepdims=True)

        # in some specific speaker verification protocols,
        # enrolment data may be used later as trial data.
        # therefore, we cache information about enrolment data
        # to speed things up by reusing the enrolment as trial
        key = (get_unique_identifier(enrolment),
               tuple(enrolment['enrol_with']))
        enrolment_keys[key] = model_id

    trial_models = {}
    trials = getattr(protocol, '{0}_trial'.format(subset))()
    y_true, y_pred = [], []
    for trial in trials:
        model_id = trial['model_id']

        key = (get_unique_identifier(trial),
               tuple(trial['try_with']))

        # re-use enrolment model whenever possible
        if key in enrolment_keys:
            trial_model = enrolment_models[enrolment_keys[key]]

        # re-use trial model whenever possible
        elif key in trial_models:
            trial_model = trial_models[key]

        else:
            data = sequence_embedding.apply(trial, crop=trial['try_with'])
            trial_model = np.mean(data, axis=0, keepdims=True)
            # cache trial model for later re-use
            trial_models[key] = trial_model

        distance = cdist(enrolment_models[model_id], trial_model,
                         metric=self.metric)[0, 0]
        y_pred.append(distance)
        y_true.append(trial['reference'])

    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)
    metrics['EER'] = {'minimize': True, 'value': eer}

    return metrics
def main():
    """Command-line entry point of the evaluation script.

    Parses the command line with `docopt`, loads the requested protocol and
    dispatches to the evaluation routine matching the selected command
    (`spotting`, `detection`, `overlap`, `segmentation`, `diarization` or
    `identification`).

    The process exits with an error message when `--skip-overlap` is
    combined with `overlap`, or when the RTTM hypothesis file is missing or
    cannot be loaded. The `spotting` command exits with status 0 once done.
    """
    arguments = docopt(__doc__, version='Evaluation')

    collar = float(arguments['--collar'])
    skip_overlap = arguments['--skip-overlap']
    tolerance = float(arguments['--tolerance'])

    # protocol
    protocol_name = arguments['<database.task.protocol>']

    preprocessors = dict()
    if arguments['overlap']:
        if skip_overlap:
            msg = ('Option --skip-overlap is not supported '
                   'when evaluating overlapped speech detection.')
            sys.exit(msg)
        preprocessors = {'annotation': to_overlap}

    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=preprocessors)

    # subset (train, development, or test)
    subset = arguments['--subset']

    if arguments['spotting']:

        hypothesis_json = arguments['<hypothesis.json>']
        with open(hypothesis_json, mode='r') as fp:
            hypotheses = json.load(fp)

        # strip the 5-character extension ('.json' is expected)
        output_prefix = hypothesis_json[:-5]

        latencies = [float(latency) for latency in arguments['--latency']]

        filters = arguments['--filter']
        if filters:
            from sympy import sympify, lambdify, symbols
            speech = symbols('speech')
            filter_funcs = [
                lambdify([speech], sympify(expression))
                for expression in filters
            ]

            def filter_func(speech):
                # True as soon as one expression is not satisfied.
                # Logical `not` is required here: bitwise `~` applied to a
                # Python bool yields -1 or -2, which are both truthy.
                return any(not func(speech) for func in filter_funcs)
        else:
            filter_func = None

        spotting(protocol, subset, latencies, hypotheses, output_prefix,
                 filter_func=filter_func)

        sys.exit(0)

    hypothesis_rttm = arguments['<hypothesis.rttm>']

    try:
        hypotheses = load_rttm(hypothesis_rttm)

    except FileNotFoundError:
        msg = f'Could not find file {hypothesis_rttm}.'
        sys.exit(msg)

    except Exception:
        # `Exception` rather than a bare `except`, so that Ctrl-C and
        # `sys.exit` are not reported as a file format problem
        msg = (f'Failed to load {hypothesis_rttm}, please check its format '
               f'(only RTTM files are supported).')
        sys.exit(msg)

    if arguments['detection']:
        detection(protocol, subset, hypotheses,
                  collar=collar, skip_overlap=skip_overlap)

    if arguments['overlap']:
        # same routine as 'detection'; the 'annotation' preprocessor set
        # above is what differs
        detection(protocol, subset, hypotheses,
                  collar=collar, skip_overlap=skip_overlap)

    if arguments['segmentation']:
        segmentation(protocol, subset, hypotheses, tolerance=tolerance)

    if arguments['diarization']:
        greedy = arguments['--greedy']
        diarization(protocol, subset, hypotheses, greedy=greedy,
                    collar=collar, skip_overlap=skip_overlap)

    if arguments['identification']:
        identification(protocol, subset, hypotheses,
                       collar=collar, skip_overlap=skip_overlap)
def validate_epoch(self, epoch, protocol_name, subset='development',
                   validation_data=None):
    """Search the binarization threshold minimizing detection error rate.

    Parameters
    ----------
    epoch : int
        Epoch whose model is loaded with `self.load_model`.
    protocol_name : str
        Name passed to `get_protocol`.
    subset : str, optional
        Name of the protocol method to iterate over.
        Defaults to 'development'.
    validation_data : optional
        Not used by this method.

    Returns
    -------
    metrics : dict
        Lowest error found (to be minimized) and the corresponding
        threshold.
    """
    # load model for current epoch
    network = self.load_model(epoch).to(self.device)
    network.eval()

    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    duration = self.task_.duration
    step = .25 * duration
    labeler = SequenceLabeling(
        network, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # not used by the search below, which builds a fresh metric
    # for every evaluated threshold
    metric = DetectionErrorRate()

    # score of each frame is one minus column 0 of the model output
    # (taken out of log-domain first when needed)
    predictions = {}
    for item in getattr(protocol, subset)():
        key = get_unique_identifier(item)
        raw = labeler.apply(item)
        column = raw.data[:, 0]
        if network.logsoftmax:
            column = np.exp(column)
        predictions[key] = SlidingWindowFeature(
            1. - column, raw.sliding_window)

    def detection_error(threshold):
        """Detection error rate over the whole subset at `threshold`."""
        binarizer = Binarize(onset=threshold, offset=threshold,
                             log_scale=False)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        error_rate = DetectionErrorRate()

        # NOTE -- embarrassingly parallel
        # TODO -- parallelize this
        for item in getattr(protocol, subset)():
            key = get_unique_identifier(item)
            hypothesis = binarizer.apply(
                predictions[key], dimension=0).to_annotation()
            reference = item['annotation']
            _ = error_rate(reference, hypothesis, uem=get_annotated(item))

        return abs(error_rate)

    result = scipy.optimize.minimize_scalar(
        detection_error, bounds=(0., 1.), method='bounded',
        options={'maxiter': 10})

    return {
        'speech_activity_detection/error': {'minimize': True,
                                            'value': result.fun},
        'speech_activity_detection/threshold': {'minimize': 'NA',
                                                'value': result.x}}