def __init__(self, feature_extraction,
             sad__pre, scd__pre, emb__pre,
             sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
             scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
             emb__internal=False,
             cls__damping=0.8, cls__preference=-20, cls__metric='cosine'):

    super(SpeakerDiarizationPreStages, self).__init__()

    self.feature_extraction = feature_extraction

    # speech activity detection hyper-parameters
    self.sad__onset = sad__onset
    self.sad__offset = sad__offset
    self.sad__dimension = sad__dimension

    # speaker change detection hyper-parameters
    self.scd__alpha = scd__alpha
    self.scd__min_duration = scd__min_duration
    self.scd__dimension = scd__dimension

    # embedding hyper-parameters
    self.emb__internal = emb__internal

    # clustering hyper-parameters
    self.cls__damping = cls__damping
    self.cls__preference = cls__preference
    self.cls__metric = cls__metric

    step = self.feature_extraction.sliding_window().step

    # initialize speech activity detection module
    self.sad_ = Precomputed(sad__pre)
    self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                  offset=self.sad__offset)

    # initialize speaker change detection module
    self.scd_ = Precomputed(scd__pre)
    self.scd_peak_ = Peak(alpha=self.scd__alpha,
                          min_duration=self.scd__min_duration,
                          percentile=False)

    # initialize speech turn embedding module
    self.emb_ = Precomputed(emb__pre)

    # initialize clustering module
    self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                        damping=self.cls__damping,
                                        preference=self.cls__preference)
def __init__(self, feature_extraction,
             sad__pre, scd__pre, emb__pre,
             sad__onset=0.7, sad__offset=0.7, sad__dimension=1,
             scd__alpha=0.5, scd__min_duration=1., scd__dimension=1,
             emb__internal=False,
             cls__method='average', cls__threshold=5, cls__metric='cosine'):

    super(SpeakerDiarizationHACPre, self).__init__()

    self.feature_extraction = feature_extraction

    # speech activity detection hyper-parameters
    self.sad__onset = sad__onset
    self.sad__offset = sad__offset
    self.sad__dimension = sad__dimension

    # speaker change detection hyper-parameters
    self.scd__alpha = scd__alpha
    self.scd__min_duration = scd__min_duration
    self.scd__dimension = scd__dimension

    # embedding hyper-parameters
    self.emb__internal = emb__internal

    # clustering hyper-parameters
    self.cls__method = cls__method
    self.cls__threshold = cls__threshold
    self.cls__metric = cls__metric

    step = self.feature_extraction.sliding_window().step

    # initialize speech activity detection module
    self.sad_ = Precomputed(sad__pre)
    self.sad_binarize_ = Binarize(onset=self.sad__onset,
                                  offset=self.sad__offset)

    # initialize speaker change detection module
    self.scd_ = Precomputed(scd__pre)
    self.scd_peak_ = Peak(alpha=self.scd__alpha,
                          min_duration=self.scd__min_duration,
                          percentile=False)

    # initialize speech turn embedding module
    self.emb_ = Precomputed(emb__pre)

    # initialize clustering module
    self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                         method=self.cls__method,
                                         threshold=self.cls__threshold)
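# A minimal sketch of how the four stages initialized above typically chain
# at apply time. This is an assumption based on the attribute names: the
# actual `apply` method of this class is not part of the snippet, and the
# interface of `my_cluster.ClusteringHAC` is hypothetical.
import numpy as np

def _apply_sketch(pipeline, current_file):
    # speech activity detection: binarize raw SAD scores into speech regions
    sad_scores = pipeline.sad_(current_file)
    speech = pipeline.sad_binarize_.apply(sad_scores,
                                          dimension=pipeline.sad__dimension)
    # speaker change detection: peak-pick SCD scores into candidate turns
    scd_scores = pipeline.scd_(current_file)
    turns = pipeline.scd_peak_.apply(scd_scores,
                                     dimension=pipeline.scd__dimension)
    turns = turns.crop(speech)
    # one embedding per speech turn (average of frame-level embeddings)
    emb = pipeline.emb_(current_file)
    fX = np.vstack([np.mean(emb.crop(turn), axis=0) for turn in turns])
    # cluster speech turn embeddings (hypothetical clustering interface)
    labels = pipeline.cls_.apply(fX)
    return labels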
def update_distances(args):
    """Load user annotation from a JSON path and convert it to a pyannote
    `Annotation` using region timings.

    From the annotation uri and precomputed embeddings, compute the
    in-cluster distances between all speech turns.

    Dump the updated JSON file (with correct distances) to a timestamped
    file.
    """
    json_path = Path(args['<json_path>'])
    uri = args['<uri>']
    with open(json_path, 'r') as file:
        gecko_json = json.load(file)
    hypothesis, _, _, _ = gecko_JSON_to_Annotation(gecko_json, uri, 'speaker')
    colors = get_colors(uri)

    precomputed = Precomputed(embeddings)
    protocol = args['<database.task.protocol>']
    protocol = get_protocol(protocol)
    for reference in getattr(protocol, 'test')():
        if reference['uri'] == uri:
            features = precomputed(reference)
            break

    distances_per_speaker = get_distances_per_speaker(features, hypothesis)
    gecko_json = annotation_to_GeckoJSON(hypothesis, distances_per_speaker,
                                         colors)

    name = f"{json_path.stem}.{TIMESTAMP}.json"
    updated_path = Path(json_path.parent, name)
    with open(updated_path, 'w') as file:
        json.dump(gecko_json, file)
    print(f"successfully dumped {updated_path}")
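# Hypothetical invocation sketch for `update_distances` above: a docopt-style
# argument dict as it might be parsed from the command line. All values are
# placeholders, not from the original source.
if __name__ == '__main__':
    args = {
        '<json_path>': 'gecko/episode01.json',
        '<uri>': 'episode01',
        '<database.task.protocol>': 'MyDatabase.SpeakerDiarization.MyProtocol',
    }
    update_distances(args)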
def __init__(self, feature_extraction, emb__pre, emb__internal=False,
             cls__damping=0.8, cls__preference=-20, cls__metric='cosine'):

    super(SpeakerDiarizationOracleSegAP, self).__init__()

    self.feature_extraction = feature_extraction

    # embedding hyper-parameters
    self.emb__internal = emb__internal

    # clustering hyper-parameters
    self.cls__damping = cls__damping
    self.cls__preference = cls__preference
    self.cls__metric = cls__metric

    step = self.feature_extraction.sliding_window().step

    # initialize speech turn embedding module
    self.emb_ = Precomputed(emb__pre)

    # initialize clustering module
    self.cls_ = my_cluster.ClusteringAP(metric=self.cls__metric,
                                        damping=self.cls__damping,
                                        preference=self.cls__preference)
def extract(protocol_name, file_finder, experiment_dir,
            robust=False, parallel=False):

    protocol = get_protocol(protocol_name)

    # load configuration file
    config_yml = experiment_dir + "/config.yml"
    with open(config_yml, "r") as fp:
        config = yaml.load(fp, Loader=yaml.SafeLoader)

    FeatureExtraction = get_class_by_name(
        config["feature_extraction"]["name"],
        default_module_name="pyannote.audio.features",
    )
    feature_extraction = FeatureExtraction(
        **config["feature_extraction"].get("params", {})
    )

    sliding_window = feature_extraction.sliding_window
    dimension = feature_extraction.dimension

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(
        root_dir=experiment_dir, sliding_window=sliding_window,
        dimension=dimension
    )

    if parallel:
        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            config_yml=config_yml,
            robust=robust,
        )
        n_jobs = cpu_count()
        pool = Pool(n_jobs)
        imap = pool.imap
    else:
        feature_extraction = init_feature_extraction(experiment_dir)
        extract_one = functools.partial(
            helper_extract,
            file_finder=file_finder,
            experiment_dir=experiment_dir,
            feature_extraction=feature_extraction,
            robust=robust,
        )
        imap = map

    for result in imap(extract_one, protocol.files()):
        if result is None:
            continue
        print(result)
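# Usage sketch for `extract` (protocol name and directory are illustrative):
# the experiment directory is expected to contain a config.yml describing
# the feature extraction to run.
from pyannote.database import FileFinder

extract('AMI.SpeakerDiarization.MixHeadset', FileFinder(),
        '/path/to/experiment_dir', robust=True, parallel=True)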
def get_file(protocol, uri, embeddings=None):
    for reference in protocol.files():
        if reference['uri'] == uri:
            if embeddings:
                precomputed = Precomputed(embeddings)
                features = precomputed(reference)
                return reference, features
            return reference
    raise ValueError(f'{uri} is not in {protocol}')
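# Usage sketch for `get_file` (protocol, uri and path are illustrative):
from pyannote.database import get_protocol

protocol = get_protocol('AMI.SpeakerDiarization.MixHeadset')
# reference file only
reference = get_file(protocol, 'ES2004a.Mix-Headset')
# reference file plus its precomputed embeddings
reference, features = get_file(protocol, 'ES2004a.Mix-Headset',
                               embeddings='/path/to/embeddings')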
def with_params(self, sad_onset=0.7, sad_offset=0.7,
                scd_alpha=0.5, scd_min_duration=1.):

    # initialize speech activity detection
    self.sad_ = Precomputed(self.sad)
    self.sad_onset = sad_onset
    self.sad_offset = sad_offset
    self.sad_binarize_ = Binarize(onset=sad_onset, offset=sad_offset)

    # initialize speaker change detection
    self.scd_ = Precomputed(self.scd)
    self.scd_alpha = scd_alpha
    self.scd_min_duration = scd_min_duration
    self.scd_peak_ = Peak(alpha=scd_alpha, min_duration=scd_min_duration)

    return self
def main():
    usage = "%prog [options] database, raw_score_path"
    desc = ("Write the output of the binary overlap detector into text, "
            "based on a threshold")
    version = "%prog 0.1"
    parser = OptionParser(usage=usage, description=desc, version=version)
    parser.add_option("-t", "--onset", action="store", type="float",
                      help="Onset Threshold", default=0.70)
    parser.add_option("-f", "--offset", action="store", type="float",
                      help="Offset Threshold", default=0.70)
    parser.add_option("-d", "--dev", action="store_true",
                      help="Print output based on development set",
                      default=False)
    parser.add_option("-o", "--outputfile", action="store", type="string",
                      help="Output file", default="./overlap.txt")
    (opt, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Incorrect number of arguments")
    database, raw_score_path = args

    # get test file of protocol
    protocol = get_protocol(database)

    # load precomputed overlap scores as pyannote.core.SlidingWindowFeature
    precomputed = Precomputed(raw_score_path)

    # initialize binarizer
    # onset / offset are tunable parameters (and should be tuned for better
    # performance). we use log_scale=True because of the final log-softmax
    # in the StackedRNN model
    binarize = Binarize(onset=opt.onset, offset=opt.offset, log_scale=True)

    fw = open(opt.outputfile, 'wt')

    if opt.dev:
        for test_file in protocol.development():
            ovl_scores = precomputed(test_file)
            # binarize overlap scores to obtain overlap regions as
            # pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']
            # write the output into text
            write_txt(fw, ovl_regions)
    else:
        for test_file in protocol.test():
            ovl_scores = precomputed(test_file)
            # binarize overlap scores to obtain overlap regions as
            # pyannote.core.Timeline
            ovl_regions = binarize.apply(ovl_scores, dimension=1)
            ovl_regions.uri = test_file['uri']
            # write the output into text
            write_txt(fw, ovl_regions)

    fw.close()
def helper_extract(current_file, file_finder=None, experiment_dir=None,
                   config_yml=None, feature_extraction=None, robust=False):
    if feature_extraction is None:
        feature_extraction = init_feature_extraction(experiment_dir)
    precomputed = Precomputed(root_dir=experiment_dir)
    return process_current_file(current_file, file_finder=file_finder,
                                precomputed=precomputed,
                                feature_extraction=feature_extraction,
                                robust=robust)
def with_params(self, sad_onset=0.7, sad_offset=0.7,
                scd_alpha=0.5, scd_min_duration=1.,
                cls_preference=-7.0, cls_damping=0.8):

    # initialize speech activity detection and speaker change detection
    super().with_params(sad_onset=sad_onset, sad_offset=sad_offset,
                        scd_alpha=scd_alpha,
                        scd_min_duration=scd_min_duration)

    # initialize speech turn embedding
    self.emb_ = Precomputed(self.emb)

    # initialize clustering module
    self.cls_damping = cls_damping
    self.cls_preference = cls_preference

    # NOTE cls_preference could be a multiplicative factor of a default
    # affinity value (e.g. median affinity value)
    self.cls_ = sklearn.cluster.AffinityPropagation(
        damping=cls_damping, preference=cls_preference,
        affinity='precomputed', max_iter=200, convergence_iter=15)

    # sklearn documentation: Preferences for each point - points with
    # larger values of preferences are more likely to be chosen as
    # exemplars. The number of exemplars, i.e. of clusters, is influenced
    # by the input preferences value. If the preferences are not passed as
    # arguments, they will be set to the median of the input similarities.

    # NOTE one could set the preference value of each speech turn
    # according to its duration. longer speech turns are expected to
    # have more accurate embeddings, and therefore should be preferred
    # as exemplars

    return self
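# Minimal, self-contained illustration (not from the original source) of
# affinity propagation on a precomputed similarity matrix, mirroring the
# configuration above: embeddings are compared with cosine similarity and
# `affinity='precomputed'` lets us pass that matrix directly.
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity

np.random.seed(0)
embeddings = np.random.randn(10, 128)     # 10 speech turns, 128-d embeddings
affinity = cosine_similarity(embeddings)  # (10, 10) similarity matrix
clustering = AffinityPropagation(damping=0.8, preference=-7.0,
                                 affinity='precomputed', max_iter=200,
                                 convergence_iter=15)
labels = clustering.fit_predict(affinity)  # one cluster label per speech turn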
def __init__(self, emb__pre, cls__method='average', cls__threshold=5,
             cls__metric='cosine'):

    super(SpeakerDiarizationOnSceneHAC, self).__init__()

    # clustering hyper-parameters
    self.cls__method = cls__method
    self.cls__threshold = cls__threshold
    self.cls__metric = cls__metric

    # initialize speech turn embedding module
    self.emb_ = Precomputed(emb__pre)

    # initialize clustering module
    self.cls_ = my_cluster.ClusteringHAC(metric=self.cls__metric,
                                         method=self.cls__method,
                                         threshold=self.cls__threshold)
def with_params(self, sad_onset=0.7, sad_offset=0.7,
                scd_alpha=0.5, scd_min_duration=1.,
                cls_threshold=0.8):

    # initialize speech activity detection and speaker change detection
    super().with_params(sad_onset=sad_onset, sad_offset=sad_offset,
                        scd_alpha=scd_alpha,
                        scd_min_duration=scd_min_duration)

    # initialize speech turn embedding
    self.emb_ = Precomputed(self.emb)

    # initialize clustering module
    self.cls_threshold = cls_threshold
    self.cls_ = HierarchicalPoolingClustering(metric=self.metric)

    return self
def check(protocol_name, file_finder, experiment_dir):

    protocol = get_protocol(protocol_name)
    precomputed = Precomputed(experiment_dir)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        for current_file in getattr(protocol, subset)():

            try:
                audio = file_finder(current_file)
                current_file['audio'] = audio
            except ValueError as e:
                print(e)
                continue

            duration = get_audio_duration(current_file)

            try:
                features = precomputed(current_file)
            except PyannoteFeatureExtractionError as e:
                print(e)
                continue

            if not np.isclose(duration, features.getExtent().duration,
                              atol=1.):
                uri = get_unique_identifier(current_file)
                print('Duration mismatch for "{uri}"'.format(uri=uri))

            if np.any(np.isnan(features.data)):
                uri = get_unique_identifier(current_file)
                print('NaN for "{uri}"'.format(uri=uri))
def xp_objective(args, **kwargs):
    import sys
    sys.path.append("/people/yin/projects/")

    from pyannote.database import get_protocol, get_annotated, FileFinder
    protocol = get_protocol('Etape.SpeakerDiarization.TV',
                            preprocessors={'audio': FileFinder()})

    from pyannote.metrics.diarization import GreedyDiarizationErrorRate
    metric = GreedyDiarizationErrorRate()

    from optimize_cluster import speaker_diarization
    from pyannote.audio.features import Precomputed
    feature_extraction = Precomputed(
        '/vol/work1/bredin/feature_extraction/mfcc')

    sad_pre = '/vol/work1/yin/speech_activity_detection/shallow/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.TV.development/apply'
    scd_pre = '/vol/work1/yin/speaker_change_detection/paper/train/REPERE.SpeakerDiarization.All.train/tune/Etape.SpeakerDiarization.Debug.development/apply'
    emb_pre = '/vol/work1/yin/embedding/20180124'

    args['cls__damping'] = float(args['cls__damping'])
    args['cls__preference'] = float(args['cls__preference'])

    pipeline = speaker_diarization.SpeakerDiarizationPre(
        feature_extraction, sad_pre, scd_pre, emb_pre, **args)

    try:
        for current_file in protocol.train():
            hypothesis = pipeline(current_file, annotated=True)
            if hypothesis is None:
                return 100
            reference = current_file['annotation']
            uem = get_annotated(current_file)
            metric(reference, hypothesis, uem=uem)
    except MemoryError as error:
        return 100

    return abs(metric)
from pyannote.core import Segment
from pyannote.audio.features import Precomputed, utils
import pandas as pd
import numpy as np
from glob import glob

# REPERE evaluation protocol
# cf. https://github.com/pyannote/pyannote-database#pyannote-database
from pyannote.database import get_protocol

# different from the files I use
protocol = get_protocol('REPERE.SpeakerDiarization.Plumcot')
precomputed = Precomputed('/vol/work1/dyab/training_set/mfcc')

train_dir = "/vol/work1/dyab/training_set/residual_local/"
output_dir_train = "/vol/work1/dyab/training_set/numpy_arrays_local_audio/"
y_labels_dir_train = "/vol/work1/dyab/training_set/numpy_arrays_local_landmarks/"

dev_dir = "/vol/work1/dyab/development_set/residual_cluster_old/"
output_dir_dev = "/vol/work1/dyab/development_set/numpy_arrays_cluster_old_audio/"
y_labels_dir_dev = "/vol/work1/dyab/development_set/numpy_arrays_cluster_old_landmarks/"

test_dir = "/vol/work1/dyab/test_set/residual/"
output_dir_test = "/vol/work1/dyab/test_set/numpy_arrays_audio/"
y_labels_dir_test = "/vol/work1/dyab/test_set/numpy_arrays_landmarks/"


def generate_audio_features(dir, output_dir, y_labels_dir):
    # iterate on all files of Phase2 training set
    for current_file in protocol.test():
        print(current_file['uri'])
import numpy as np
import matplotlib.pyplot as plt

# AMI protocol
from pyannote.database import get_protocol
protocol = get_protocol('Test.SpeakerDiarization.MixHeadset')
from pyannote.database import get_annotated

# precomputed scores
from pyannote.audio.features import Precomputed
precomputed = Precomputed('./precomputed/scd')

from pyannote.metrics.diarization import DiarizationPurityCoverageFMeasure
metric = DiarizationPurityCoverageFMeasure()
from pyannote.metrics.segmentation import SegmentationPurityCoverageFMeasure
metric = SegmentationPurityCoverageFMeasure()  # note: overrides the diarization metric above

# peak detection
min_duration = 1.0
from pyannote.audio.signal import Peak
# alpha / min_duration are tunable parameters (and should be tuned for
# better performance). we use log_scale=True because of the final
# log-softmax in the StackedRNN model
alphas = np.linspace(0, 1, 20)

purity_list = []
coverage_list = []

for alpha in alphas:
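    # The body of this sweep is missing from the snippet; a plausible
    # completion (an assumption, not the original code) segments every test
    # file at the current alpha and records the purity/coverage trade-off.
    peak = Peak(alpha=alpha, min_duration=min_duration, log_scale=True)
    for test_file in protocol.test():
        scd_scores = precomputed(test_file)
        # peak detection turns raw SCD scores into a pyannote.core.Timeline
        partition = peak.apply(scd_scores, dimension=1)
        reference = test_file['annotation']
        uem = get_annotated(test_file)
        metric(reference, partition.to_annotation(), uem=uem)
    purity, coverage, _ = metric.compute_metrics()
    purity_list.append(purity)
    coverage_list.append(coverage)
    metric.reset()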
def apply_pretrained(validate_dir: Path,
                     protocol_name: str,
                     subset: Optional[str] = "test",
                     duration: Optional[float] = None,
                     step: float = 0.25,
                     device: Optional[torch.device] = None,
                     batch_size: int = 32,
                     pretrained: Optional[str] = None,
                     Pipeline: type = None,
                     **kwargs):
    """Apply pre-trained model

    Parameters
    ----------
    validate_dir : Path
    protocol_name : `str`
    subset : 'train' | 'development' | 'test', optional
        Defaults to 'test'.
    duration : `float`, optional
    step : `float`, optional
    device : `torch.device`, optional
    batch_size : `int`, optional
    pretrained : `str`, optional
    Pipeline : `type`
    """

    if pretrained is None:
        pretrained = Pretrained(validate_dir=validate_dir,
                                duration=duration,
                                step=step,
                                batch_size=batch_size,
                                device=device)
        output_dir = validate_dir / 'apply' / f'{pretrained.epoch_:04d}'
    else:
        if pretrained in torch.hub.list('pyannote/pyannote-audio'):
            output_dir = validate_dir / pretrained
        else:
            output_dir = validate_dir
        pretrained = Wrapper(pretrained,
                             duration=duration,
                             step=step,
                             batch_size=batch_size,
                             device=device)

    params = {}
    try:
        params['classes'] = pretrained.classes
    except AttributeError as e:
        pass
    try:
        params['dimension'] = pretrained.dimension
    except AttributeError as e:
        pass

    # create metadata file at root that contains
    # sliding window and dimension information
    precomputed = Precomputed(root_dir=output_dir,
                              sliding_window=pretrained.sliding_window,
                              **params)

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=pretrained.preprocessors_)

    for current_file in getattr(protocol, subset)():
        fX = pretrained(current_file)
        precomputed.dump(current_file, fX)

    # do not proceed with the full pipeline
    # when there is no such thing for current task
    if Pipeline is None:
        return

    # do not proceed with the full pipeline when its parameters cannot be
    # loaded. this might happen when applying a model that has not been
    # validated yet
    try:
        pipeline_params = pretrained.pipeline_params_
    except AttributeError as e:
        return

    # instantiate pipeline
    pipeline = Pipeline(scores=output_dir)
    pipeline.instantiate(pipeline_params)

    # load pipeline metric (when available)
    try:
        metric = pipeline.get_metric()
    except NotImplementedError as e:
        metric = None

    # apply pipeline and dump output to RTTM files
    output_rttm = output_dir / f'{protocol_name}.{subset}.rttm'
    with open(output_rttm, 'w') as fp:
        for current_file in getattr(protocol, subset)():
            hypothesis = pipeline(current_file)
            pipeline.write_rttm(fp, hypothesis)

            # compute evaluation metric (when possible)
            if 'annotation' not in current_file:
                metric = None

            # compute evaluation metric (when available)
            if metric is None:
                continue

            reference = current_file['annotation']
            uem = get_annotated(current_file)
            _ = metric(reference, hypothesis, uem=uem)

    # print pipeline metric (when available)
    if metric is None:
        return

    output_eval = output_dir / f'{protocol_name}.{subset}.eval'
    with open(output_eval, 'w') as fp:
        fp.write(str(metric))
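# Usage sketch (path and protocol are illustrative): apply a validated model
# to the test subset; the pipeline step is skipped when `Pipeline` is left
# as None.
from pathlib import Path

apply_pretrained(Path('/path/to/train/validate_detection_fscore'),
                 'AMI.SpeakerDiarization.MixHeadset',
                 subset='test', step=0.25, batch_size=32)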
import os
import pickle
import random
from itertools import cycle, islice

import numpy as np
import torch

from pyannote.audio.features import Precomputed

dirname = os.path.dirname(os.path.realpath(__file__))

# VCTK_DATA_DIR = os.path.join(dirname, '../../data/vctk')
VCTK_DATA_DIR = '/w/148/spoclab/data3/jixuan/SpeakerEmbedding/few_shot_learning/data/vctk'
VCTK_AUDIO_DIR = '/p/spoclab/data3/jixuan/VCTK-Corpus/wav48'
VCTK_FEATURE_DIR = '/p/spoclab/data3/jixuan/VCTK-Corpus/playground/feature-extraction'
# VCTK_AUDIO_DIR = '/h/jixuan/Documents/data/VCTK-Corpus/wav48'
# VCTK_FEATURE_DIR = '/h/jixuan/Documents/data/VCTK-Corpus/playground/feature-extraction'

OMNIGLOT_CACHE = {}
DATASET_CACHE = {}

precomputed = Precomputed(VCTK_FEATURE_DIR)


def get_feature(cfile, seg):
    return precomputed.crop(cfile, seg, mode='center', fixed=2.0)


def convert_tensor(key, d):
    d[key] = torch.from_numpy(np.array(d[key], np.float32, copy=False))
    return d


def convert_cuda(key, d):
    if hasattr(d[key], 'cuda'):
        d[key] = d[key].cuda()
    return d
class SpeechActivityDetection(Pipeline):
    """Speech activity detection pipeline

    Parameters
    ----------
    precomputed : str
        Path to precomputed SAD scores.
    """

    def __init__(self, precomputed=None, **kwargs):
        super(SpeechActivityDetection, self).__init__()
        self.precomputed = precomputed
        self.precomputed_ = Precomputed(self.precomputed)
        self.has_overlap_ = self.precomputed_.dimension() == 3
        self.with_params(**kwargs)

    def get_tune_space(self):
        space = {
            'speech_onset': chocolate.uniform(0., 1.),
            'speech_offset': chocolate.uniform(0., 1.),
            'speech_min_duration_on': chocolate.uniform(0., 2.),
            'speech_min_duration_off': chocolate.uniform(0., 2.),
            'speech_pad_onset': chocolate.uniform(-1., 1.),
            'speech_pad_offset': chocolate.uniform(-1., 1.)
        }

        if self.has_overlap_:
            space.update({
                'overlap_onset': chocolate.uniform(0., 1.),
                'overlap_offset': chocolate.uniform(0., 1.),
                'overlap_min_duration_on': chocolate.uniform(0., 2.),
                'overlap_min_duration_off': chocolate.uniform(0., 2.),
                'overlap_pad_onset': chocolate.uniform(-1., 1.),
                'overlap_pad_offset': chocolate.uniform(-1., 1.)
            })

        return space

    def get_tune_metric(self):
        return DetectionErrorRate()

    def with_params(self, **params):

        # initialize speech/non-speech binarizer
        speech_params = {
            '_'.join(param.split('_')[1:]): value
            for param, value in params.items()
            if param.startswith('speech_')
        }
        self.speech_binarize_ = Binarize(**speech_params)

        # initialize overlap binarizer
        if self.has_overlap_:
            overlap_params = {
                '_'.join(param.split('_')[1:]): value
                for param, value in params.items()
                if param.startswith('overlap_')
            }
            self.overlap_binarize_ = Binarize(**overlap_params)

        return self

    def apply(self, current_file):

        # extract precomputed scores
        precomputed = self.precomputed_(current_file)

        # if this check has not been done yet, do it once and for all
        if not hasattr(self, "log_scale_"):
            # heuristic to determine whether scores are log-scaled
            if np.nanmean(precomputed.data) < 0:
                self.log_scale_ = True
            else:
                self.log_scale_ = False

        data = np.exp(precomputed.data) if self.log_scale_ \
            else precomputed.data

        # speech vs. non-speech
        speech_prob = SlidingWindowFeature(1. - data[:, 0],
                                           precomputed.sliding_window)
        speech = self.speech_binarize_.apply(speech_prob)

        if self.has_overlap_:
            # overlap vs. non-overlap
            overlap_prob = SlidingWindowFeature(data[:, 2],
                                                precomputed.sliding_window)
            overlap = self.overlap_binarize_.apply(overlap_prob)

            # overlapping speech can only happen within speech regions
            overlap = overlap.crop(speech)
        else:
            # empty timeline
            overlap = Timeline()

        speech = speech.to_annotation(generator='string')
        overlap = overlap.to_annotation(generator='int')
        hypothesis = speech.update(overlap)

        return hypothesis
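# Usage sketch for the pipeline above (path, protocol name and parameter
# values are illustrative):
from pyannote.database import get_protocol

pipeline = SpeechActivityDetection(precomputed='/path/to/sad/scores',
                                   speech_onset=0.7, speech_offset=0.7)
protocol = get_protocol('AMI.SpeakerDiarization.MixHeadset')
for current_file in protocol.test():
    hypothesis = pipeline.apply(current_file)  # pyannote.core.Annotation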
def apply(self, protocol_name, output_dir, step=None, internal=False):

    # load best performing model
    with open(self.validate_txt_, 'r') as fp:
        eers = SortedDict(np.loadtxt(fp))
    best_epoch = int(eers.iloc[np.argmin(eers.values())])
    embedding = SequenceEmbeddingAutograd.load(self.train_dir_, best_epoch)

    # guess sequence duration from path (.../3.2+0.8/...)
    directory = basename(dirname(self.experiment_dir))
    duration, _, _, _ = self._directory_to_params(directory)
    if step is None:
        step = 0.5 * duration

    # initialize embedding extraction
    batch_size = self.approach_.batch_size
    extraction = Extraction(embedding, self.feature_extraction_, duration,
                            step=step, batch_size=batch_size,
                            internal=internal)
    sliding_window = extraction.sliding_window
    dimension = extraction.dimension

    # create metadata file at root that contains
    # sliding window and dimension information
    path = Precomputed.get_config_path(output_dir)
    mkdir_p(dirname(path))
    f = h5py.File(path)
    f.attrs['start'] = sliding_window.start
    f.attrs['duration'] = sliding_window.duration
    f.attrs['step'] = sliding_window.step
    f.attrs['dimension'] = dimension
    f.close()

    # file generator
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for subset in ['development', 'test', 'train']:

        try:
            file_generator = getattr(protocol, subset)()
            first_item = next(file_generator)
        except NotImplementedError as e:
            continue

        file_generator = getattr(protocol, subset)()

        for current_file in file_generator:
            fX = extraction.apply(current_file)

            path = Precomputed.get_path(output_dir, current_file)
            mkdir_p(dirname(path))

            f = h5py.File(path)
            f.attrs['start'] = sliding_window.start
            f.attrs['duration'] = sliding_window.duration
            f.attrs['step'] = sliding_window.step
            f.attrs['dimension'] = dimension
            f.create_dataset('features', data=fX.data)
            f.close()
def __init__(self, wrappable: Wrappable, **params):
    super().__init__()

    from pyannote.audio.features import Pretrained
    from pyannote.audio.features import Precomputed
    from pyannote.audio.features import FeatureExtraction
    from pyannote.audio.features import RawAudio

    scorer = None
    msg = ""

    # corner case: a dict maps the wrappable to custom parameters
    if isinstance(wrappable, dict):
        wrappable, custom_params = dict(wrappable).popitem()
        params.update(**custom_params)

    # If `wrappable` already complies with the `FeatureExtraction` API, it
    # is kept unchanged. This includes instances of any `FeatureExtraction`
    # subclass, `RawAudio` instances, `Precomputed` instances, and
    # `Pretrained` instances.
    if isinstance(wrappable, (FeatureExtraction, RawAudio, Pretrained,
                              Precomputed)):
        scorer = wrappable

    elif Path(wrappable).is_dir():
        directory = Path(wrappable)

        # If `wrappable` is a `Path` to a directory containing precomputed
        # features or scores, wrap the corresponding `Precomputed` instance
        try:
            scorer = Precomputed(root_dir=directory)
        except Exception as e:
            scorer = None

        # If `wrappable` is a `Path` to a validation directory,
        # wrap the corresponding `Pretrained` instance
        if scorer is None:
            try:
                scorer = Pretrained(validate_dir=directory, **params)
            except Exception as e:
                scorer = None

        if scorer is None:
            msg = (f'"{wrappable}" directory does not seem to be the path '
                   f"to precomputed features nor the path to a model "
                   f"validation step.")

    # If `wrappable` is a `Path` to a pretrained model checkpoint,
    # wrap the corresponding `Pretrained` instance
    elif Path(wrappable).is_file():
        checkpoint = Path(wrappable)
        try:
            validate_dir = checkpoint.parents[1] / "validate" / "fake"
            epoch = int(checkpoint.stem)
            scorer = Pretrained(validate_dir=validate_dir, epoch=epoch,
                                **params)
        except Exception as e:
            msg = (f'"{wrappable}" directory does not seem to be the path '
                   f"to a pretrained model checkpoint.")
            scorer = None

    elif isinstance(wrappable, Text):

        # If `wrappable` is a `Text` starting with '@' such as '@key',
        # it means that one should read the "key" key of protocol files
        if wrappable.startswith("@"):
            key = wrappable[1:]
            scorer = partial(_use_existing_key, key)
            # scorer = lambda current_file: current_file[key]

        # If `wrappable` is a `Text` containing the name of an existing
        # `torch.hub` model, wrap the corresponding `Pretrained`.
        else:
            try:
                import torch
                scorer = torch.hub.load("pyannote/pyannote-audio",
                                        wrappable, **params)
                if not isinstance(scorer, Pretrained):
                    msg = (f'"{wrappable}" exists on torch.hub but does '
                           f"not return a `Pretrained` model instance.")
                    scorer = None
            except Exception as e:
                msg = (f"Could not load {wrappable} model from torch.hub. "
                       f"The following exception was raised:\n{e}")
                scorer = None

    # warn the user that something went wrong
    if scorer is None:
        raise ValueError(msg)

    self.scorer_ = scorer
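# Usage sketch for `Wrapper` (paths and keys are illustrative): each call
# resolves a different kind of "wrappable" to a scorer.
precomputed_scorer = Wrapper('/path/to/precomputed')        # features on disk
validated_scorer = Wrapper('/path/to/validate', step=0.25)  # validation dir
key_scorer = Wrapper('@sad_scores')  # read scores from a protocol file key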
    res['scores'] = pscores
    return res


if __name__ == '__main__':
    arguments = docopt(__doc__, version='Speaker-spotting')

    # protocol
    protocol_name = arguments['<database.task.protocol>']
    embedding_path = arguments['<embedding_path>']
    protocol = get_protocol(protocol_name, progress=True)

    # subset (train, development, or test)
    subset = arguments['--subset']
    output_file = arguments['<output_file>']

    from pyannote.audio.features import Precomputed
    precomputed = Precomputed(embedding_path)

    models = {}
    enrolments = getattr(protocol,
                         '{subset}_enrolment'.format(subset=subset))()
    for current_enrolment in enrolments:
        model_id = current_enrolment.pop('model_id')
        models[model_id] = speaker_spotting_enrol(current_enrolment)

    if arguments['oracle']:
        REFERENCE = {}
        for current_file in getattr(protocol, subset)():
            uri = current_file['uri']
            if uri not in REFERENCE:
                REFERENCE[uri] = Annotation(uri=uri)
            REFERENCE[uri].update(current_file['annotation'])
# coding: utf-8
import sys
sys.path.append("../")
import clustering
import numpy as np

from pyannote.audio.features import Precomputed
precomputed = Precomputed('/vol/work1/bredin/speaker_spotting/embeddings')

from pyannote.database import get_protocol, FileFinder
protocol = get_protocol('AMI.SpeakerSpotting.MixHeadset', progress=True)

from pyannote.core import Annotation, Segment, Timeline


# enrolment consists in summing all relevant embeddings
def speaker_spotting_enrol(current_enrolment):
    enrol_with = current_enrolment['enrol_with']
    embeddings = precomputed(current_enrolment)
    return np.sum(embeddings.crop(enrol_with), axis=0, keepdims=True)


models = {}
for current_enrolment in protocol.test_enrolment():
    model_id = current_enrolment.pop('model_id')
    models[model_id] = speaker_spotting_enrol(current_enrolment)

REFERENCE = {}
for current_file in protocol.test():
    uri = current_file['uri']
    if uri not in REFERENCE:
        REFERENCE[uri] = Annotation(uri=uri)
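# A minimal scoring sketch (an assumption, not part of the original snippet):
# compare an enrolled model to the embeddings of a test file with cosine
# distance; smaller distances suggest the target speaker is present.
from scipy.spatial.distance import cdist

def speaker_spotting_score(current_file, model):
    embeddings = precomputed(current_file)
    distances = cdist(embeddings.data, model, metric='cosine')
    return -distances.min()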
class VCTKLoader:

    def __init__(self, split='', n_support=0, n_query=0, n_way=0,
                 if_cuda=False):
        self.precomputed = Precomputed(VCTK_FEATURE_DIR)
        self.dataset = self.load_dataset(from_disk=True)
        self.split = split
        self.n_support = n_support
        self.n_query = n_query
        self.n_way = n_way
        # dataset is (re)loaded lazily, per split, in __iter__
        self.dataset = None
        self.transforms = None
        self.if_cuda = if_cuda

    def shuffle_dataset(self):
        num_label = len(self.dataset['class'])
        num_data = len(self.dataset['data'][0])
        # x: num_labels * num_samples_per_label
        # num_label * num_data
        data_index = np.tile(np.arange(num_data), (num_label, 1))
        label_index = np.tile(np.arange(num_label).reshape(-1, 1),
                              (1, num_data))
        label_index = np.expand_dims(label_index, axis=2)
        data_index = np.expand_dims(data_index, axis=2)
        data_label_idx = np.concatenate((data_index, label_index), axis=2)
        # shuffle rows (labels)
        np.random.shuffle(data_label_idx)
        # shuffle data index for each row (for each label)
        for dl in data_label_idx:
            np.random.shuffle(dl)
        index_batches = sub_matrix(data_label_idx,
                                   self.n_support + self.n_query,
                                   self.n_way)
        # optional: shuffle batches
        np.random.shuffle(index_batches)
        return index_batches

    def __iter__(self):
        if self.dataset is None:
            self.dataset = self.load_dataset(from_disk=True)[self.split]
        transforms = [partial(batch_from_index, self.dataset['data']),
                      partial(convert_tensor, 'data')]
        if self.if_cuda:
            transforms.append(CudaTransform())
        self.transforms = compose(transforms)
        index_batches = self.shuffle_dataset()
        batches = TransformDataset(ListDataset(index_batches),
                                   self.transforms)
        print(f"\nSize of batches: {len(batches)}")
        for batch in batches:
            batch['n_way'] = self.n_way
            batch['n_support'] = self.n_support
            batch['n_query'] = self.n_query
            yield batch

    def get_feature(self, cfile, seg):
        return self.precomputed.crop(cfile, seg, mode='center', fixed=2.0)
        # return precomputed(cfile).crop(seg, mode='center', fixed=2.0)

    def load_speaker_file(self, protocol_name='SpeakerEmbedding.All',
                          from_disk=False):
        database = VCTK()
        protocol = database.get_protocol(protocol_name.split('.')[0],
                                         protocol_name.split('.')[1])
        speaker_file = {'train': {}, 'val': {}, 'test': {}, 'unseen': {}}
        if from_disk:
            print('Loading speaker_file from disk...')
            vctk_file_name = os.path.join(VCTK_DATA_DIR, 'vctk_speaker_file')
            if not os.path.isfile(vctk_file_name):
                raise ValueError(f'{vctk_file_name} not found')
            else:
                with open(vctk_file_name, 'rb') as vctk_file:
                    speaker_file = pickle.load(vctk_file)
                return speaker_file

        print('Loading unseen set...')
        for current_file in protocol.unseen_iter():
            speaker = current_file['uri'].split('_')[0]
            if speaker not in speaker_file['unseen']:
                speaker_file['unseen'][speaker] = []
            speaker_file['unseen'][speaker].append(current_file)

        print('Loading training set...')
        for current_file in protocol.train():
            speaker = current_file['uri'].split('_')[0]
            if speaker not in speaker_file['train']:
                speaker_file['train'][speaker] = []
            speaker_file['train'][speaker].append(current_file)

        print('Loading test set...')
        for current_file in protocol.test():
            speaker = current_file['uri'].split('_')[0]
            if speaker not in speaker_file['test']:
                speaker_file['test'][speaker] = []
            speaker_file['test'][speaker].append(current_file)

        print('Loading development set...')
        for current_file in protocol.development():
            speaker = current_file['uri'].split('_')[0]
            if speaker not in speaker_file['val']:
                speaker_file['val'][speaker] = []
            speaker_file['val'][speaker].append(current_file)

        with open(os.path.join(VCTK_DATA_DIR, 'vctk_speaker_file'),
                  'wb') as vctk_file:
            pickle.dump(speaker_file, vctk_file, -1)
        return speaker_file

    def load_speaker_segments(self, seg_dur=2.0, overlap_ratio=0.25,
                              from_disk=False):
        '''2-second segments, with overlap ratio = 0.25

        |----||----|
           |----|
        '''
        spk_seg = {'train': {}, 'val': {}, 'test': {}, 'unseen': {}}
        if from_disk:
            print('Loading speaker_segments from disk...')
            vctk_file_name = os.path.join(VCTK_DATA_DIR,
                                          'vctk_speaker_segments')
            if os.path.isfile(vctk_file_name):
                with open(vctk_file_name, 'rb') as vctk_file:
                    spk_seg = pickle.load(vctk_file)
                return spk_seg
            else:
                raise ValueError(f'{vctk_file_name} not found')

        def fetch_spk_seg(speaker_file):
            speaker_seg = {}
            for spk, sfiles in speaker_file.items():
                speaker_seg[spk] = []
                for sfile in sfiles:
                    duration = sfile['annotated'].duration()
                    if duration < seg_dur:
                        continue
                    half_seg = seg_dur / 2
                    for mid in np.arange(half_seg, duration - half_seg,
                                         seg_dur * (1 - overlap_ratio)):
                        speaker_seg[spk].append(
                            (Segment(mid - half_seg, mid + half_seg), sfile))
            return speaker_seg

        spk_file = self.load_speaker_file(from_disk=from_disk)
        for sub in ['train', 'val', 'test', 'unseen']:
            spk_file[sub] = fetch_spk_seg(spk_file[sub])

        with open(os.path.join(VCTK_DATA_DIR, 'vctk_speaker_segments'),
                  'wb') as vctk_file:
            pickle.dump(spk_file, vctk_file, -1)
        return spk_file

    def load_dataset(self, from_disk=False):
        print('Loading dataset...')
        dataset = {'train': {}, 'val': {}, 'test': {}, 'unseen': {}}
        if from_disk:
            vctk_file_name = os.path.join(VCTK_DATA_DIR, 'vctk_datasets')
            if os.path.isfile(vctk_file_name):
                with open(vctk_file_name, 'rb') as vctk_file:
                    dataset = pickle.load(vctk_file)
                return dataset
            else:
                raise ValueError(f'{vctk_file_name} not found')

        spk_seg = self.load_speaker_segments(from_disk=from_disk)
        with open(os.path.join(VCTK_DATA_DIR, 'vctk_datasets'),
                  'wb') as vctk_file:
            for sub in dataset.keys():
                speaker_segments = spk_seg[sub]
                seg_count = []
                for spk, seg in speaker_segments.items():
                    seg_count.append(len(seg))
                max_count = max(seg_count)
                # oversample each speaker to the same number of segments
                for spk in speaker_segments.keys():
                    speaker_segments[spk] = list(
                        islice(cycle(speaker_segments[spk]), max_count))
                y_labels = speaker_segments.keys()
                dataset[sub] = {
                    'class': list(y_labels),
                    'data': [speaker_segments[label] for label in y_labels]
                }
            pickle.dump(dataset, vctk_file, -1)
        return dataset

    # load MFCC features into memory
    # takes too much memory, not recommended
    def load_features(self, from_disk=False):
        feature_dataset = {'train': {}, 'val': {}, 'test': {}, 'unseen': {}}
        if from_disk:
            vctk_file_name = os.path.join(VCTK_DATA_DIR,
                                          'vctk_feature_datasets')
            if os.path.isfile(vctk_file_name):
                with open(vctk_file_name, 'rb') as vctk_file:
                    feature_dataset = pickle.load(vctk_file)
                return feature_dataset
            else:
                raise ValueError(f'{vctk_file_name} not found')

        dataset = self.load_dataset(from_disk=True)
        with open(os.path.join(VCTK_DATA_DIR, 'vctk_feature_datasets'),
                  'wb') as vctk_file:
            # iterate over subset names (the original iterated over
            # .items(), which would fail when indexing `dataset[sub]`)
            for sub in feature_dataset:
                print(f'Loading feature: {sub}')
                subset = dataset[sub]
                for spk, seg_list in subset.items():
                    print(f'Speaker: {spk}')
                    feature_list = []
                    for seg, cfile in seg_list:
                        feature_list.append(self.get_feature(cfile, seg))
                    feature_dataset[sub][spk] = np.array(feature_list)
            pickle.dump(feature_dataset, vctk_file, -1)
        return feature_dataset

    # load data for same/different experiments
    def load_same_diff_data(self, from_disk=False):
        exp_dataset = {'train': {}, 'val': {}, 'test': {}, 'unseen': {}}
        n_pair = 40
        n_pair_unseen = 100
        print(f'Loading same/diff data, #pair: {n_pair}, '
              f'#pair_unseen: {n_pair_unseen}')
        file_name = os.path.join(
            VCTK_DATA_DIR,
            f'same_diff_exp_norepeat_{n_pair}_{n_pair_unseen}')
        if from_disk:
            if os.path.isfile(file_name):
                with open(file_name, 'rb') as dfile:
                    exp_dataset = pickle.load(dfile)
                return exp_dataset
            else:
                raise ValueError(f'{file_name} not found, generate first?')

        def gen_same_diff(data, labels, first_n=-1, n_same_pair=20):
            same_pairs = []
            diff_pairs = []
            for spk, spk_data in zip(labels, data):
                first_n = len(spk_data)  # if first_n == -1 or first_n > len(spk_data) else first_n
                # get same pairs
                ind = list(range(first_n))
                n = 0
                same_pair_ind = []
                ind_his = set()
                while n < n_same_pair:
                    i1 = random.choice(ind)
                    ind.remove(i1)
                    i2 = random.choice(ind)
                    if ((i1, i2) not in ind_his) and ((i2, i1) not in ind_his):
                        ind_his.add((i1, i2))
                        same_pair_ind.append((i1, i2))
                        n += 1
                    else:
                        print('skip repeated pair')
                    ind = list(range(first_n))
                spk_same_pairs = [(spk_data[ind[0]], spk_data[ind[1]])
                                  for ind in same_pair_ind]
                same_pairs.extend(spk_same_pairs)

            # get different pairs
            labels_ind = list(range(len(labels)))
            n = 0
            pair_his = set()
            while n < n_same_pair * len(labels):
                s1 = random.choice(labels_ind)
                ind1 = random.choice(list(range(len(data[s1]))))
                labels_ind.remove(s1)
                s2 = random.choice(labels_ind)
                ind2 = random.choice(list(range(len(data[s2]))))
                pair1 = (s1, ind1)
                pair2 = (s2, ind2)
                if ((pair1, pair2) not in pair_his) and \
                        ((pair2, pair1) not in pair_his):
                    pair_his.add((pair1, pair2))
                    diff_pairs.append((data[s1][ind1], data[s2][ind2]))
                    n += 1
                else:
                    print('skip repeated pair')
                labels_ind = list(range(len(labels)))

            print(len(same_pairs), len(diff_pairs))
            return same_pairs, diff_pairs

        print("Loading dataset from disk, instead of generating from scratch")
        dataset = self.load_dataset(from_disk=True)
        for subset in ['train', 'val', 'test', 'unseen']:
            first_n = -1  # if subset == 'unseen' else 200
            npair = n_pair_unseen if subset == 'unseen' else n_pair
            data = dataset[subset]['data']
            labels = dataset[subset]['class']
            same_pairs, diff_pairs = gen_same_diff(data, labels,
                                                   first_n=first_n,
                                                   n_same_pair=npair)
            assert len(same_pairs) == len(diff_pairs)
            assert len(same_pairs) == len(labels) * npair
            exp_dataset[subset] = {'same': same_pairs, 'diff': diff_pairs}

        with open(os.path.join(VCTK_DATA_DIR, file_name), 'wb') as dfile:
            pickle.dump(exp_dataset, dfile, -1)
        return exp_dataset
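# Usage sketch for `VCTKLoader` (episode sizes are hypothetical): iterate
# over few-shot episodes of 10 speakers with 5 support and 5 query segments
# each; requires the pickled VCTK dataset on disk.
loader = VCTKLoader(split='train', n_support=5, n_query=5, n_way=10)
for episode in loader:
    features = episode['data']  # torch tensor of support + query features
    break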