def __init__(self, protocol=None, subset='train', db_yml=None,
             snr_min=5, snr_max=20):
    super().__init__()
    self.protocol = protocol
    self.subset = subset
    self.db_yml = db_yml
    self.snr_min = snr_min
    self.snr_max = snr_max

    # returns gaps in annotation as pyannote.core.Timeline instance
    get_gaps = lambda f: f['annotation'].get_timeline().gaps(
        support=get_annotated(f))

    if isinstance(protocol, str):
        preprocessors = {
            'audio': FileFinder(config_yml=db_yml),
            'duration': get_audio_duration,
            'gaps': get_gaps
        }
        protocol = get_protocol(self.protocol, preprocessors=preprocessors)
    else:
        protocol.preprocessors['gaps'] = get_gaps

    self.files_ = list(getattr(protocol, self.subset)())

def __init__(self, protocol=None, subset: Subset = "train",
             snr_min=5, snr_max=20):
    super().__init__()
    self.protocol = protocol
    self.subset = subset
    self.snr_min = snr_min
    self.snr_max = snr_max

    # returns gaps in annotation as pyannote.core.Timeline instance
    get_gaps = (lambda f: f["annotation"].get_timeline().gaps(
        support=get_annotated(f)))

    if isinstance(protocol, str):
        preprocessors = {
            "audio": FileFinder(),
            "duration": get_audio_duration,
            "gaps": get_gaps,
        }
        protocol = get_protocol(self.protocol, preprocessors=preprocessors)
    else:
        protocol.preprocessors["gaps"] = get_gaps

    self.files_ = list(getattr(protocol, self.subset)())

def __init__(self, experiment_dir: Path, training: bool = False):
    super().__init__()
    self.experiment_dir = experiment_dir

    # load configuration file
    config_yml = self.CONFIG_YML.format(experiment_dir=self.experiment_dir)
    with open(config_yml, 'r') as fp:
        self.config_ = yaml.load(fp, Loader=yaml.SafeLoader)

    # initialize preprocessors
    preprocessors = {}
    for key, db_yml in self.config_.get('preprocessors', {}).items():
        try:
            preprocessors[key] = FileFinder(db_yml)
        except FileNotFoundError as e:
            template = db_yml
            preprocessors[key] = template
    self.preprocessors_ = preprocessors

    # initialize pipeline
    pipeline_name = self.config_['pipeline']['name']
    Klass = get_class_by_name(
        pipeline_name, default_module_name='pyannote.pipeline.blocks')
    self.pipeline_ = Klass(**self.config_['pipeline'].get('params', {}))

    # freeze parameters
    if 'freeze' in self.config_:
        params = self.config_['freeze']
        self.pipeline_.freeze(params)

def __init__(self, experiment_dir, db_yml=None):
    super(Application, self).__init__()

    self.db_yml = db_yml
    self.preprocessors_ = {'audio': FileFinder(self.db_yml)}

    self.experiment_dir = experiment_dir

    # load configuration
    config_yml = self.CONFIG_YML.format(experiment_dir=self.experiment_dir)
    with open(config_yml, 'r') as fp:
        self.config_ = yaml.load(fp, Loader=yaml.SafeLoader)

    # scheduler
    SCHEDULER_DEFAULT = {
        'name': 'DavisKingScheduler',
        'params': {'learning_rate': 'auto'}
    }
    scheduler_cfg = self.config_.get('scheduler', SCHEDULER_DEFAULT)
    scheduler_name = scheduler_cfg['name']
    schedulers = __import__('pyannote.audio.train.schedulers',
                            fromlist=[scheduler_name])
    Scheduler = getattr(schedulers, scheduler_name)
    scheduler_params = scheduler_cfg.get('params', {})
    self.get_scheduler_ = partial(Scheduler, **scheduler_params)
    self.learning_rate_ = scheduler_params.get('learning_rate', 'auto')

    # optimizer
    OPTIMIZER_DEFAULT = {
        'name': 'SGD',
        'params': {
            'momentum': 0.9,
            'dampening': 0,
            'weight_decay': 0,
            'nesterov': True
        }
    }
    optimizer_cfg = self.config_.get('optimizer', OPTIMIZER_DEFAULT)
    optimizer_name = optimizer_cfg['name']
    optimizers = __import__('torch.optim', fromlist=[optimizer_name])
    Optimizer = getattr(optimizers, optimizer_name)
    optimizer_params = optimizer_cfg.get('params', {})
    self.get_optimizer_ = partial(Optimizer, **optimizer_params)

    # feature extraction
    if 'feature_extraction' in self.config_:
        extraction_name = self.config_['feature_extraction']['name']
        features = __import__('pyannote.audio.features',
                              fromlist=[extraction_name])
        FeatureExtraction = getattr(features, extraction_name)
        self.feature_extraction_ = FeatureExtraction(
            **self.config_['feature_extraction'].get('params', {}))

def __init__(self, experiment_dir: Path, training: bool = False):
    super().__init__()
    self.experiment_dir = experiment_dir

    # load configuration file
    config_yml = self.CONFIG_YML.format(experiment_dir=self.experiment_dir)
    with open(config_yml, 'r') as fp:
        self.config_ = yaml.load(fp, Loader=yaml.SafeLoader)

    # initialize preprocessors
    preprocessors = {}
    for key, preprocessor in self.config_.get('preprocessors', {}).items():
        # preprocessors:
        #    key:
        #       name: package.module.ClassName
        #       params:
        #          param1: value1
        #          param2: value2
        if isinstance(preprocessor, dict):
            Klass = get_class_by_name(
                preprocessor['name'], default_module_name='pyannote.pipeline')
            preprocessors[key] = Klass(**preprocessor.get('params', {}))
            continue

        try:
            # preprocessors:
            #    key: /path/to/database.yml
            database_yml = preprocessor
            preprocessors[key] = FileFinder(preprocessor)
        except FileNotFoundError as e:
            # preprocessors:
            #    key: /path/to/{uri}.wav
            template = preprocessor
            preprocessors[key] = template
    self.preprocessors_ = preprocessors

    # initialize pipeline
    pipeline_name = self.config_['pipeline']['name']
    Klass = get_class_by_name(
        pipeline_name, default_module_name='pyannote.pipeline.blocks')
    self.pipeline_ = Klass(**self.config_['pipeline'].get('params', {}))

    # freeze parameters
    if 'freeze' in self.config_:
        params = self.config_['freeze']
        self.pipeline_.freeze(params)

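# The three preprocessor forms accepted by the loop above, written as the
# dict that yaml.load would produce from config.yml (all values below are
# placeholders, not defaults from any released recipe):
preprocessors_cfg = {
    'metadata': {'name': 'package.module.ClassName',  # instantiated via get_class_by_name
                 'params': {'param1': 'value1'}},
    'audio': '/path/to/database.yml',                 # resolved with FileFinder
    'waveform': '/path/to/{uri}.wav',                 # kept as a filename template
}
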
def __init__(self, collection: Optional[NoiseCollection] = None):
    if collection is None:
        collection = "MUSAN.Collection.BackgroundNoise"
    if not isinstance(collection, (list, tuple)):
        collection = [collection]
    self.collection = collection

    self.files_ = []
    preprocessors = {'audio': FileFinder(), 'duration': get_audio_duration}
    for collection in self.collection:
        protocol = get_protocol(collection, preprocessors=preprocessors)
        self.files_.extend(protocol.files())

def main():
    arguments = docopt(__doc__, version='Feature extraction')

    file_finder = FileFinder()

    protocol_name = arguments['<database.task.protocol>']
    experiment_dir = arguments['<experiment_dir>']

    if arguments['check']:
        check(protocol_name, file_finder, experiment_dir)
    else:
        robust = arguments['--robust']
        parallel = arguments['--parallel']
        extract(protocol_name, file_finder, experiment_dir,
                robust=robust, parallel=parallel)

def __init__(self, collection=None, snr_min=5, snr_max=20):
    super().__init__()

    if collection is None:
        collection = 'MUSAN.Collection.BackgroundNoise'
    if not isinstance(collection, (list, tuple)):
        collection = [collection]
    self.collection = collection
    self.snr_min = snr_min
    self.snr_max = snr_max

    # load noise database
    self.files_ = []
    preprocessors = {'audio': FileFinder(), 'duration': get_audio_duration}
    for collection in self.collection:
        protocol = get_protocol(collection, preprocessors=preprocessors)
        self.files_.extend(protocol.files())

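# Minimal standalone sketch of the protocol/preprocessor pattern shared by
# the constructors above. Assumes 'MUSAN.Collection.BackgroundNoise' is
# registered with pyannote.database; the get_audio_duration import path is
# the pyannote.audio 1.x location and may differ in other versions.
from pyannote.database import FileFinder, get_protocol
from pyannote.audio.features.utils import get_audio_duration

preprocessors = {'audio': FileFinder(), 'duration': get_audio_duration}
protocol = get_protocol('MUSAN.Collection.BackgroundNoise',
                        preprocessors=preprocessors)
for current_file in protocol.files():
    # 'audio' and 'duration' are filled lazily by the preprocessors
    print(current_file['uri'], current_file['duration'])
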
def apply(self, protocol_name, output_dir, step=None):
    model = self.model_.to(self.device)
    model.eval()

    duration = self.duration
    if step is None:
        step = 0.25 * duration

    # do not use memmap as this would lead to too many open files
    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize embedding extraction
    sequence_embedding = SequenceEmbedding(
        model, self.feature_extraction_, duration=duration, step=step,
        batch_size=self.batch_size, device=self.device)

    sliding_window = sequence_embedding.sliding_window
    dimension = sequence_embedding.dimension

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=sliding_window,
                              dimension=dimension)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for current_file in FileFinder.protocol_file_iter(
            protocol, extra_keys=['audio']):
        fX = sequence_embedding.apply(current_file)
        precomputed.dump(current_file, fX)

def apply(self, protocol_name, output_dir, step=None):
    model = self.model_.to(self.device)
    model.eval()

    duration = self.task_.duration
    if step is None:
        step = 0.25 * duration

    # do not use memmap as this would lead to too many open files
    if isinstance(self.feature_extraction_, Precomputed):
        self.feature_extraction_.use_memmap = False

    # initialize sequence labeling
    # (pass the resolved `step` so a user-provided value is not ignored)
    sequence_labeling = SequenceLabeling(
        model, self.feature_extraction_, duration=duration,
        step=step, batch_size=self.batch_size,
        source='audio', device=self.device)

    sliding_window = sequence_labeling.sliding_window
    n_classes = self.task_.n_classes

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=sliding_window,
                              dimension=n_classes)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for current_file in FileFinder.protocol_file_iter(
            protocol, extra_keys=['audio']):
        fX = sequence_labeling.apply(current_file)
        precomputed.dump(current_file, fX)

def xp_objective(args, **kwargs):
    import sys
    sys.path.append("/people/yin/projects/")

    from pyannote.database import get_protocol, get_annotated, FileFinder
    protocol = get_protocol('Etape.SpeakerDiarization.TV',
                            preprocessors={'audio': FileFinder()})

    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    metric = GreedyDiarizationErrorRate()

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed

    feature_extraction = Precomputed(
        '/vol/work1/bredin/feature_extraction/mfcc')
    sad_pre = '/vol/work1/yin/speech_activity_detection/shallow/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.TV.development/apply'
    scd_pre = '/vol/work1/yin/speaker_change_detection/paper/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.Debug.development/apply'
    emb_pre = '/vol/work1/yin/embedding/20180124'

    args['cls__damping'] = float(args['cls__damping'])
    args['cls__preference'] = float(args['cls__preference'])

    pipeline = speaker_diarization.SpeakerDiarizationPre(
        feature_extraction, sad_pre, scd_pre, emb_pre, **args)

    try:
        for current_file in protocol.train():
            hypothesis = pipeline(current_file, annotated=True)
            if hypothesis is None:
                return 100
            reference = current_file['annotation']
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)
    except MemoryError as error:
        return 100

    return abs(metric)

def validate_init(self, protocol_name: Text,
                  subset: Subset = "development"):
    """Initialize validation data

    Parameters
    ----------
    protocol_name : `str`
    subset : {'train', 'development', 'test'}
        Defaults to 'development'.

    Returns
    -------
    validation_data : object
        Validation data.
    """

    preprocessors = self.preprocessors_
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    protocol = get_protocol(protocol_name, preprocessors=preprocessors)

    files = getattr(protocol, subset)()

    # convert lazy ProtocolFile to regular dict for multiprocessing
    files = [dict(file) for file in files]

    if isinstance(self.feature_extraction_, (Precomputed, RawAudio)):
        return files

    validation_data = []
    for current_file in tqdm(files, desc="Feature extraction"):
        current_file["features"] = self.feature_extraction_(current_file)
        validation_data.append(current_file)

    return validation_data

def apply(self, protocol_name, output_dir):

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    mkdir_p(output_dir)
    path = Path(output_dir) / f'{protocol_name}.txt'

    with open(path, mode='w') as fp:

        for current_file in FileFinder.protocol_file_iter(
                protocol, extra_keys=['audio']):

            uri = get_unique_identifier(current_file)
            hypothesis = self.pipeline_.apply(current_file)

            if isinstance(hypothesis, Timeline):
                for s in hypothesis:
                    fp.write(f'{uri} {s.start:.3f} {s.end:.3f}\n')
                continue

            for s, t, l in hypothesis.itertracks(yield_label=True):
                fp.write(f'{uri} {s.start:.3f} {s.end:.3f} {t} {l}\n')

import time

import torch
from pyannote.database import FileFinder, get_protocol
from pyannote.metrics.diarization import DiarizationErrorRate, JaccardErrorRate

preprocessors = {'audio': FileFinder()}
protocol = get_protocol('VOXCON.SpeakerDiarization.Challenge',
                        preprocessors=preprocessors)

# 'gpu' is not a valid torch device string; use 'cuda'
diarization_pipeline = torch.hub.load('pyannote/pyannote-audio',
                                      'dia_dihard', device='cuda')

ders = []
jers = []
hypotheses = []
derMetric = DiarizationErrorRate(collar=0.25)
jerMetric = JaccardErrorRate(collar=0.25)

for file in protocol.test():
    hypothesis = diarization_pipeline(file)
    hypotheses.append(hypothesis)

    reference = file["annotation"]
    # uem = file['annotated']

    der = derMetric(reference, hypothesis)
    jer = jerMetric(reference, hypothesis)
    ders.append(der)
    jers.append(jer)

    uri = file['uri']
    print(f'{uri} DER = {100 * der:.1f}% '
          f'JER = {100 * jer:.1f}% {time.strftime("%H:%M:%S")}')

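# pyannote.metrics objects accumulate components across calls, so the
# corpus-level values are available via abs(...) once the loop above has
# finished (the xp_objective snippet above relies on the same behaviour
# when it returns abs(metric)):
print(f'Aggregated DER = {100 * abs(derMetric):.1f}%')
print(f'Aggregated JER = {100 * abs(jerMetric):.1f}%')
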
def load_config(
    config_yml: Path,
    training: bool = False,
    config_default_module: Text = None,
    pretrained_config_yml: Path = None,
) -> Dict:
    """
    Returns
    -------
    config : Dict
        ['preprocessors']
        ['learning_rate']
        ['scheduler']
        ['get_optimizer']
        ['callbacks']
        ['feature_extraction']
        ['task']
        ['get_model_from_specs']
        ['model_resolution']
        ['model_alignment']
    """

    # load pretrained model configuration
    pretrained_cfg = dict()
    if pretrained_config_yml is not None:
        with open(pretrained_config_yml, "r") as fp:
            pretrained_cfg = yaml.load(fp, Loader=yaml.SafeLoader)

    # load configuration or complain it's missing
    cfg = dict()
    if config_yml.exists():
        with open(config_yml, "r") as fp:
            cfg = yaml.load(fp, Loader=yaml.SafeLoader)
        # backup user-provided config because it will be updated
        if pretrained_config_yml is not None:
            shutil.copy(config_yml, config_yml.parent / "backup+config.yml")
    elif pretrained_config_yml is None:
        msg = f"{config_yml} configuration file is missing."
        raise FileNotFoundError(msg)

    # override pretrained model config with user-provided config
    cfg = merge_cfg(pretrained_cfg, cfg)

    # save (updated) config to disk
    if pretrained_config_yml is not None:
        with open(config_yml, "w") as fp:
            yaml.dump(cfg, fp, default_flow_style=False)

    # preprocessors
    preprocessors = dict()
    for key, preprocessor in cfg.get("preprocessors", {}).items():
        # preprocessors:
        #    key:
        #       name: package.module.ClassName
        #       params:
        #          param1: value1
        #          param2: value2
        if isinstance(preprocessor, dict):
            Klass = get_class_by_name(preprocessor["name"])
            preprocessors[key] = Klass(**preprocessor.get("params", {}))
            continue

        try:
            # preprocessors:
            #    key: /path/to/database.yml
            preprocessors[key] = FileFinder(database_yml=preprocessor)
        except FileNotFoundError as e:
            # preprocessors:
            #    key: /path/to/{uri}.wav
            preprocessors[key] = preprocessor
    cfg["preprocessors"] = preprocessors

    # scheduler
    SCHEDULER_DEFAULT = {
        "name": "DavisKingScheduler",
        "params": {"learning_rate": "auto"},
    }
    scheduler_cfg = cfg.get("scheduler", SCHEDULER_DEFAULT)
    Scheduler = get_class_by_name(
        scheduler_cfg["name"],
        default_module_name="pyannote.audio.train.schedulers")
    scheduler_params = scheduler_cfg.get("params", {})
    cfg["learning_rate"] = scheduler_params.pop("learning_rate", "auto")
    cfg["scheduler"] = Scheduler(**scheduler_params)

    # optimizer
    OPTIMIZER_DEFAULT = {
        "name": "SGD",
        "params": {
            "momentum": 0.9,
            "dampening": 0,
            "weight_decay": 0,
            "nesterov": True,
        },
    }
    optimizer_cfg = cfg.get("optimizer", OPTIMIZER_DEFAULT)
    try:
        Optimizer = get_class_by_name(optimizer_cfg["name"],
                                      default_module_name="torch.optim")
        optimizer_params = optimizer_cfg.get("params", {})
        cfg["get_optimizer"] = partial(Optimizer, **optimizer_params)
    # do not raise an error here as it is possible that the optimizer is
    # not really needed (e.g. in pipeline training)
    except ModuleNotFoundError as e:
        warnings.warn(e.args[0])

    # data augmentation should only be active when training a model
    if training and "data_augmentation" in cfg:
        DataAugmentation = get_class_by_name(
            cfg["data_augmentation"]["name"],
            default_module_name="pyannote.audio.augmentation",
        )
        augmentation = DataAugmentation(
            **cfg["data_augmentation"].get("params", {}))
    else:
        augmentation = None

    # custom callbacks
    callbacks = []
    for callback_config in cfg.get("callbacks", {}):
        Callback = get_class_by_name(callback_config["name"])
        callback = Callback(**callback_config.get("params", {}))
        callbacks.append(callback)
    cfg["callbacks"] = callbacks

    # feature extraction
    FEATURE_DEFAULT = {"name": "RawAudio", "params": {"sample_rate": 16000}}
    feature_cfg = cfg.get("feature_extraction", FEATURE_DEFAULT)
    FeatureExtraction = get_class_by_name(
        feature_cfg["name"],
        default_module_name="pyannote.audio.features")
    feature_params = feature_cfg.get("params", {})
    cfg["feature_extraction"] = FeatureExtraction(**feature_params,
                                                  augmentation=augmentation)

    # task
    if config_default_module is None:
        config_default_module = "pyannote.audio.labeling.tasks"
    try:
        TaskClass = get_class_by_name(
            cfg["task"]["name"], default_module_name=config_default_module)
    except AttributeError:
        TaskClass = get_class_by_name(
            cfg["task"]["name"],
            default_module_name="pyannote.audio.embedding.approaches",
        )
    cfg["task"] = TaskClass(**cfg["task"].get("params", {}))

    # architecture
    Architecture = get_class_by_name(
        cfg["architecture"]["name"],
        default_module_name="pyannote.audio.models")
    params = cfg["architecture"].get("params", {})
    cfg["get_model_from_specs"] = partial(Architecture, **params)
    task = cfg["task"].task
    cfg["model_resolution"] = Architecture.get_resolution(task, **params)
    cfg["model_alignment"] = Architecture.get_alignment(task, **params)

    return cfg

def _create_config(self, segment_size_sec: float):
    return metrics.SpeakerValidationConfig(
        protocol_name='VoxCeleb.SpeakerVerification.VoxCeleb2',
        feature_extraction=RawAudio(sample_rate=self.sample_rate),
        preprocessors={'audio': FileFinder()},
        duration=segment_size_sec)

def extract(protocol_name, file_finder, experiment_dir,
            robust=False, parallel=False):

    protocol = get_protocol(protocol_name, progress=False)

    # load configuration file
    config_yml = experiment_dir + '/config.yml'
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    sliding_window = feature_extraction.sliding_window()
    dimension = feature_extraction.dimension()

    if 'normalization' in config:
        normalization_name = config['normalization']['name']
        normalization_module = __import__(
            'pyannote.audio.features.normalization',
            fromlist=[normalization_name])
        Normalization = getattr(normalization_module, normalization_name)
        normalization = Normalization(
            **config['normalization'].get('params', {}))
    else:
        normalization = None

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=experiment_dir,
                              sliding_window=sliding_window,
                              dimension=dimension)

    if parallel:
        extract_one = functools.partial(helper_extract,
                                        file_finder=file_finder,
                                        experiment_dir=experiment_dir,
                                        config_yml=config_yml,
                                        normalization=normalization,
                                        robust=robust)
        n_jobs = cpu_count()
        pool = Pool(n_jobs)
        imap = pool.imap
    else:
        feature_extraction = init_feature_extraction(experiment_dir)
        extract_one = functools.partial(helper_extract,
                                        file_finder=file_finder,
                                        experiment_dir=experiment_dir,
                                        feature_extraction=feature_extraction,
                                        normalization=normalization,
                                        robust=robust)
        imap = map

    for result in imap(extract_one, FileFinder.protocol_file_iter(
            protocol, extra_keys=['audio'])):
        if result is None:
            continue
        print(result)

def _validate_epoch_diarization(
    self,
    epoch,
    validation_data,
    protocol=None,
    subset: Subset = "development",
    device: Optional[torch.device] = None,
    batch_size: int = 32,
    n_jobs: int = 1,
    duration: float = None,
    step: float = 0.25,
    metric: str = None,
    **kwargs,
):

    # initialize embedding extraction
    pretrained = Pretrained(
        validate_dir=self.validate_dir_,
        epoch=epoch,
        duration=duration,
        step=step,
        batch_size=batch_size,
        device=device,
    )

    preprocessors = self.preprocessors_
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    _protocol = get_protocol(protocol, preprocessors=preprocessors)

    Z, t = dict(), dict()
    min_d, max_d = np.inf, -np.inf

    for current_file in getattr(_protocol, subset)():

        uri = get_unique_identifier(current_file)
        uem = get_annotated(current_file)
        reference = current_file["annotation"]

        X_, t_ = [], []
        embedding = pretrained(current_file)
        for i, (turn, _) in enumerate(reference.itertracks()):

            # extract embedding for current speech turn
            x_ = embedding.crop(turn, mode="center")
            if len(x_) < 1:
                x_ = embedding.crop(turn, mode="loose")
            if len(x_) < 1:
                msg = f"No embedding for {turn} in {uri:s}."
                raise ValueError(msg)

            # each speech turn is represented by its average embedding
            X_.append(np.mean(x_, axis=0))
            t_.append(turn)

        X_ = np.array(X_)

        # apply hierarchical agglomerative clustering
        # all the way up to just one cluster (ie complete dendrogram)
        D = pdist(X_, metric=metric)
        min_d = min(np.min(D), min_d)
        max_d = max(np.max(D), max_d)

        Z[uri] = linkage(X_, method="pool", metric=metric)
        t[uri] = np.array(t_)

    def fun(threshold):

        _metric = DiarizationPurityCoverageFMeasure(weighted=False)

        for current_file in getattr(_protocol, subset)():

            uri = get_unique_identifier(current_file)
            uem = get_annotated(current_file)
            reference = current_file["annotation"]

            clusters = fcluster(Z[uri], threshold, criterion="distance")

            hypothesis = Annotation(uri=uri)
            for (start_time, end_time), cluster in zip(t[uri], clusters):
                hypothesis[Segment(start_time, end_time)] = cluster

            _ = _metric(reference, hypothesis, uem=uem)

        return 1.0 - abs(_metric)

    res = scipy.optimize.minimize_scalar(fun,
                                         bounds=(0.0, 1.0),
                                         method="bounded",
                                         options={"maxiter": 10})

    threshold = res.x.item()

    return {
        "metric": "diarization_fscore",
        "minimize": False,
        "value": float(1.0 - res.fun),
    }

def _validate_epoch_verification(
    self,
    epoch,
    validation_data,
    protocol=None,
    subset: Subset = "development",
    device: Optional[torch.device] = None,
    batch_size: int = 32,
    n_jobs: int = 1,
    duration: float = None,
    step: float = 0.25,
    metric: str = None,
    **kwargs,
):

    # initialize embedding extraction
    pretrained = Pretrained(
        validate_dir=self.validate_dir_,
        epoch=epoch,
        duration=duration,
        step=step,
        batch_size=batch_size,
        device=device,
    )

    preprocessors = self.preprocessors_
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    _protocol = get_protocol(protocol, preprocessors=preprocessors)

    y_true, y_pred, cache = [], [], {}

    for trial in getattr(_protocol, f"{subset}_trial")():

        # compute embedding for file1
        file1 = trial["file1"]
        hash1 = self.get_hash(file1)
        if hash1 in cache:
            emb1 = cache[hash1]
        else:
            emb1 = self.get_embedding(file1, pretrained)
            cache[hash1] = emb1

        # compute embedding for file2
        file2 = trial["file2"]
        hash2 = self.get_hash(file2)
        if hash2 in cache:
            emb2 = cache[hash2]
        else:
            emb2 = self.get_embedding(file2, pretrained)
            cache[hash2] = emb2

        # compare embeddings
        distance = cdist(emb1, emb2, metric=metric)[0, 0]
        y_pred.append(distance)

        y_true.append(trial["reference"])

    _, _, _, eer = det_curve(np.array(y_true), np.array(y_pred),
                             distances=True)

    return {
        "metric": "equal_error_rate",
        "minimize": True,
        "value": float(eer),
    }

def train(
    self,
    protocol_name: Text,
    subset: Subset = "train",
    warm_start: Union[int, Literal["last"], Path] = 0,
    epochs: int = 1000,
    device: Optional[torch.device] = None,
    n_jobs: int = 1,
):
    """Train model

    Parameters
    ----------
    protocol_name : `str`
    subset : {'train', 'development', 'test'}, optional
        Defaults to 'train'.
    warm_start : `int`, "last", or `Path`, optional
        When `int`, restart training at this epoch.
        When "last", restart from last epoch.
        When `Path`, restart from this model checkpoint.
        Defaults to training from scratch (warm_start = 0).
    epochs : `int`, optional
        Train for that many epochs. Defaults to 1000.
    device : `torch.device`, optional
        Device on which the model will be allocated. Defaults to using CPU.
    n_jobs : `int`, optional
    """

    # initialize batch generator
    preprocessors = self.preprocessors_
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    protocol = get_protocol(protocol_name, preprocessors=preprocessors)
    batch_generator = self.task_.get_batch_generator(
        self.feature_extraction_,
        protocol,
        subset=subset,
        resolution=self.model_resolution_,
        alignment=self.model_alignment_,
    )

    # initialize model architecture based on specifications
    model = self.get_model_from_specs_(batch_generator.specifications)

    # freeze (when requested)
    model.freeze(getattr(self, "freeze_", []))

    train_dir = Path(
        self.TRAIN_DIR.format(
            experiment_dir=self.experiment_dir,
            protocol=protocol_name,
            subset=subset,
        ))

    # use last available epoch as starting point
    if warm_start == "last":
        warm_start = self.get_number_of_epochs(train_dir=train_dir) - 1

    iterations = self.task_.fit_iter(
        model,
        batch_generator,
        warm_start=warm_start,
        epochs=epochs,
        get_optimizer=self.get_optimizer_,
        scheduler=self.scheduler_,
        learning_rate=self.learning_rate_,
        train_dir=train_dir,
        device=device,
        callbacks=self.callbacks_,
        n_jobs=n_jobs,
    )

    for _ in iterations:
        pass

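# Hypothetical call (sketch): `app` stands in for an already-configured
# instance of the application class that defines train() above; the
# protocol name and epoch count are illustrative.
import torch

app.train('AMI.SpeakerDiarization.MixHeadset',
          subset='train',
          warm_start='last',   # resume from the most recent checkpoint
          epochs=200,
          device=torch.device('cuda'))
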
def apply(self, protocol_name: str,
          output_dir: Path,
          subset: Optional[str] = None):
    """Apply current best pipeline

    Parameters
    ----------
    protocol_name : `str`
        Name of pyannote.database protocol to process.
    subset : `str`, optional
        Subset to process. Defaults to processing all subsets.
    """

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    output_dir.mkdir(parents=True, exist_ok=True)
    extension = self.pipeline_.write_format
    if subset is None:
        path = output_dir / f'{protocol_name}.all.{extension}'
    else:
        path = output_dir / f'{protocol_name}.{subset}.{extension}'

    # initialize evaluation metric
    try:
        metric = self.pipeline_.get_metric()
    except NotImplementedError as e:
        metric = None
        losses = []

    skip_metric = False
    with open(path, mode='w') as fp:

        if subset is None:
            files = FileFinder.protocol_file_iter(protocol)
        else:
            files = getattr(protocol, subset)()

        for current_file in files:

            # apply pipeline and dump output to file
            output = self.pipeline_(current_file)
            self.pipeline_.write(fp, output)

            if skip_metric:
                continue

            try:
                if metric is None:
                    loss = self.pipeline_.loss(current_file, output)
                    losses.append(loss)
                else:
                    from pyannote.database import get_annotated
                    _ = metric(current_file['annotation'], output,
                               uem=get_annotated(current_file))
            except Exception as e:
                # this may happen for files with no available groundtruth.
                # in this case, we simply do not perform evaluation
                skip_metric = True

    if skip_metric:
        msg = (f'For some (possibly good) reason, the output of this '
               f'pipeline could not be evaluated on {protocol_name}.')
        print(msg)
        return

    # report evaluation metric
    if metric is None:
        loss = np.mean(losses)
        print(f'Loss = {loss:g}')
    else:
        _ = metric.report(display=True)

def apply_pretrained(
    validate_dir: Path,
    protocol_name: Text,
    subset: Subset = "test",
    duration: Optional[float] = None,
    step: float = 0.25,
    device: Optional[torch.device] = None,
    batch_size: int = 32,
    pretrained: Optional[str] = None,
    Pipeline: type = None,
    **kwargs,
):
    """Apply pre-trained model

    Parameters
    ----------
    validate_dir : Path
    protocol_name : `str`
    subset : 'train' | 'development' | 'test', optional
        Defaults to 'test'.
    duration : `float`, optional
    step : `float`, optional
    device : `torch.device`, optional
    batch_size : `int`, optional
    pretrained : `str`, optional
    Pipeline : `type`
    """

    if pretrained is None:
        pretrained = Pretrained(
            validate_dir=validate_dir,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )
        output_dir = validate_dir / "apply" / f"{pretrained.epoch_:04d}"
    else:
        if pretrained in torch.hub.list("pyannote/pyannote-audio"):
            output_dir = validate_dir / pretrained
        else:
            output_dir = validate_dir
        pretrained = Wrapper(
            pretrained,
            duration=duration,
            step=step,
            batch_size=batch_size,
            device=device,
        )

    params = {}
    try:
        params["classes"] = pretrained.classes
    except AttributeError as e:
        pass
    try:
        params["dimension"] = pretrained.dimension
    except AttributeError as e:
        pass

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=pretrained.sliding_window,
                              **params)

    # file generator
    preprocessors = getattr(pretrained, "preprocessors_", dict())
    if "audio" not in preprocessors:
        preprocessors["audio"] = FileFinder()
    if "duration" not in preprocessors:
        preprocessors["duration"] = get_audio_duration
    protocol = get_protocol(protocol_name, preprocessors=preprocessors)

    files = getattr(protocol, subset)()
    for current_file in tqdm(iterable=files,
                             desc=f"{subset.title()}",
                             unit="file"):
        fX = pretrained(current_file)
        precomputed.dump(current_file, fX)

    # do not proceed with the full pipeline
    # when there is no such thing for current task
    if Pipeline is None:
        return

    # do not proceed with the full pipeline when its parameters cannot be
    # loaded. this might happen when applying a model that has not been
    # validated yet
    try:
        pipeline_params = pretrained.pipeline_params_
    except AttributeError as e:
        return

    # instantiate pipeline
    pipeline = Pipeline(scores=output_dir)
    pipeline.instantiate(pipeline_params)

    # load pipeline metric (when available)
    try:
        metric = pipeline.get_metric()
    except NotImplementedError as e:
        metric = None

    # apply pipeline and dump output to RTTM files
    output_rttm = output_dir / f"{protocol_name}.{subset}.rttm"
    with open(output_rttm, "w") as fp:
        files = getattr(protocol, subset)()
        for current_file in tqdm(iterable=files,
                                 desc=f"{subset.title()}",
                                 unit="file"):
            hypothesis = pipeline(current_file)
            pipeline.write_rttm(fp, hypothesis)

            # compute evaluation metric (when possible)
            reference = current_file.get("annotation", None)
            if reference is None:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

    # print pipeline metric (when available)
    if metric is None:
        return

    output_eval = output_dir / f"{protocol_name}.{subset}.eval"
    with open(output_eval, "w") as fp:
        fp.write(str(metric))

def load_config(config_yml: Path,
                training: bool = False,
                config_default_module: Text = None,
                pretrained_config_yml: Path = None) -> Dict:
    """
    Returns
    -------
    config : Dict
        ['preprocessors']
        ['learning_rate']
        ['scheduler']
        ['get_optimizer']
        ['callbacks']
        ['feature_extraction']
        ['task']
        ['get_model_from_specs']
        ['model_resolution']
        ['model_alignment']
    """

    # load pretrained model configuration
    pretrained_cfg = dict()
    if pretrained_config_yml is not None:
        with open(pretrained_config_yml, 'r') as fp:
            pretrained_cfg = yaml.load(fp, Loader=yaml.SafeLoader)

    # load configuration or complain it's missing
    cfg = dict()
    if config_yml.exists():
        with open(config_yml, 'r') as fp:
            cfg = yaml.load(fp, Loader=yaml.SafeLoader)
        # backup user-provided config because it will be updated
        if pretrained_config_yml is not None:
            shutil.copy(config_yml, config_yml.parent / 'backup+config.yml')
    elif pretrained_config_yml is None:
        msg = f'{config_yml} configuration file is missing.'
        raise FileNotFoundError(msg)

    # override pretrained model config with user-provided config
    cfg = merge_cfg(pretrained_cfg, cfg)

    # save (updated) config to disk
    if pretrained_config_yml is not None:
        with open(config_yml, 'w') as fp:
            yaml.dump(cfg, fp, default_flow_style=False)

    # preprocessors
    preprocessors = dict()
    for key, preprocessor in cfg.get('preprocessors', {}).items():
        # preprocessors:
        #    key:
        #       name: package.module.ClassName
        #       params:
        #          param1: value1
        #          param2: value2
        if isinstance(preprocessor, dict):
            Klass = get_class_by_name(preprocessor['name'])
            preprocessors[key] = Klass(**preprocessor.get('params', {}))
            continue

        try:
            # preprocessors:
            #    key: /path/to/database.yml
            preprocessors[key] = FileFinder(preprocessor)
        except FileNotFoundError as e:
            # preprocessors:
            #    key: /path/to/{uri}.wav
            preprocessors[key] = preprocessor

    if 'audio' not in preprocessors:
        preprocessors['audio'] = FileFinder()
    if 'duration' not in preprocessors:
        preprocessors['duration'] = get_audio_duration
    cfg['preprocessors'] = preprocessors

    # scheduler
    SCHEDULER_DEFAULT = {
        'name': 'DavisKingScheduler',
        'params': {'learning_rate': 'auto'}
    }
    scheduler_cfg = cfg.get('scheduler', SCHEDULER_DEFAULT)
    Scheduler = get_class_by_name(
        scheduler_cfg['name'],
        default_module_name='pyannote.audio.train.schedulers')
    scheduler_params = scheduler_cfg.get('params', {})
    cfg['learning_rate'] = scheduler_params.pop('learning_rate', 'auto')
    cfg['scheduler'] = Scheduler(**scheduler_params)

    # optimizer
    OPTIMIZER_DEFAULT = {
        'name': 'SGD',
        'params': {
            'momentum': 0.9,
            'dampening': 0,
            'weight_decay': 0,
            'nesterov': True
        }
    }
    optimizer_cfg = cfg.get('optimizer', OPTIMIZER_DEFAULT)
    try:
        Optimizer = get_class_by_name(optimizer_cfg['name'],
                                      default_module_name='torch.optim')
        optimizer_params = optimizer_cfg.get('params', {})
        cfg['get_optimizer'] = partial(Optimizer, **optimizer_params)
    # do not raise an error here as it is possible that the optimizer is
    # not really needed (e.g. in pipeline training)
    except ModuleNotFoundError as e:
        warnings.warn(e.args[0])

    # data augmentation should only be active when training a model
    if training and 'data_augmentation' in cfg:
        DataAugmentation = get_class_by_name(
            cfg['data_augmentation']['name'],
            default_module_name='pyannote.audio.augmentation')
        augmentation = DataAugmentation(
            **cfg['data_augmentation'].get('params', {}))
    else:
        augmentation = None

    # custom callbacks
    callbacks = []
    for callback_config in cfg.get('callbacks', {}):
        Callback = get_class_by_name(callback_config['name'])
        callback = Callback(**callback_config.get('params', {}))
        callbacks.append(callback)
    cfg['callbacks'] = callbacks

    # feature extraction
    FEATURE_DEFAULT = {'name': 'RawAudio', 'params': {'sample_rate': 16000}}
    feature_cfg = cfg.get('feature_extraction', FEATURE_DEFAULT)
    FeatureExtraction = get_class_by_name(
        feature_cfg['name'],
        default_module_name='pyannote.audio.features')
    feature_params = feature_cfg.get('params', {})
    cfg['feature_extraction'] = FeatureExtraction(**feature_params,
                                                  augmentation=augmentation)

    # task
    if config_default_module is None:
        config_default_module = 'pyannote.audio.labeling.tasks'
    try:
        TaskClass = get_class_by_name(
            cfg['task']['name'], default_module_name=config_default_module)
    except AttributeError:
        TaskClass = get_class_by_name(
            cfg['task']['name'],
            default_module_name='pyannote.audio.embedding.approaches')
    cfg['task'] = TaskClass(**cfg['task'].get('params', {}))

    # architecture
    Architecture = get_class_by_name(
        cfg['architecture']['name'],
        default_module_name='pyannote.audio.models')
    params = cfg['architecture'].get('params', {})
    cfg['get_model_from_specs'] = partial(Architecture, **params)
    cfg['model_resolution'] = Architecture.get_resolution(**params)
    cfg['model_alignment'] = Architecture.get_alignment(**params)

    return cfg

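# Sketch of a config.yml for load_config above, expressed as the dict that
# yaml.load would return. The scheduler/optimizer/feature values mirror the
# SCHEDULER_DEFAULT / OPTIMIZER_DEFAULT / FEATURE_DEFAULT constants above;
# the task and architecture class names are illustrative, not prescribed.
example_cfg = {
    'task': {'name': 'SpeechActivityDetection',
             'params': {'duration': 2.0}},
    'feature_extraction': {'name': 'RawAudio',
                           'params': {'sample_rate': 16000}},
    'architecture': {'name': 'StackedRNN', 'params': {}},
    'scheduler': {'name': 'DavisKingScheduler',
                  'params': {'learning_rate': 'auto'}},
    'optimizer': {'name': 'SGD',
                  'params': {'momentum': 0.9, 'dampening': 0,
                             'weight_decay': 0, 'nesterov': True}},
    'preprocessors': {'audio': '/path/to/database.yml'},
}
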
def apply(self, protocol_name: str,
          output_dir: Path,
          subset: Optional[str] = None):
    """Apply current best pipeline

    Parameters
    ----------
    protocol_name : `str`
        Name of pyannote.database protocol to process.
    subset : `str`, optional
        Subset to process. Defaults to processing all subsets.
    """

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    output_dir.mkdir(parents=True, exist_ok=False)
    if subset is None:
        path = output_dir / f'{protocol_name}.all.txt'
    else:
        path = output_dir / f'{protocol_name}.{subset}.txt'

    # initialize evaluation metric
    try:
        metric = self.pipeline_.get_metric()
    except NotImplementedError as e:
        metric = None
        losses = []

    with open(path, mode='w') as fp:

        if subset is None:
            files = FileFinder.protocol_file_iter(protocol)
        else:
            files = getattr(protocol, subset)()

        for current_file in files:

            output = self.pipeline_(current_file)

            # evaluate output
            if metric is None:
                loss = self.pipeline_.loss(current_file, output)
                losses.append(loss)
            else:
                from pyannote.database import get_annotated
                _ = metric(current_file['annotation'], output,
                           uem=get_annotated(current_file))

            self.pipeline_.write(fp, output)

    # report evaluation metric
    if metric is None:
        loss = np.mean(losses)
        print(f'Loss = {loss:g}')
    else:
        _ = metric.report(display=True)