def train(protocol, experiment_dir, train_dir, subset='train'):
    """Fit a sequence labeling model as described by an experiment config.

    The configuration is read from ``<experiment_dir>/config.yml``. The
    feature extraction class and the architecture class it names are looked
    up in ``pyannote.audio.features`` and ``pyannote.audio.labeling.models``
    respectively, and instantiated with their optional ``params``.

    Parameters
    ----------
    protocol : object
        Must provide ``stats(subset)`` (whose result has an ``'annotated'``
        entry) and a method named after `subset` returning the files that
        are handed to the batch generator.
    experiment_dir : str
        Directory that contains ``config.yml``.
    train_dir : str
        Forwarded to ``SequenceLabeling.fit`` as ``log_dir``.
    subset : str, optional
        Name of the protocol method to call. Defaults to 'train'.
    """
    # -- TRAINING --
    n_epochs = 100
    optimizer = SSMORMS3()

    # NOTE(review): yaml.load is called without an explicit Loader;
    # confirm this is acceptable for the PyYAML version in use.
    config_path = experiment_dir + '/config.yml'
    with open(config_path, 'r') as config_file:
        config = yaml.load(config_file)

    # -- FEATURE EXTRACTION --
    extraction_config = config['feature_extraction']
    extraction_name = extraction_config['name']
    features_module = __import__('pyannote.audio.features',
                                 fromlist=[extraction_name])
    extraction_class = getattr(features_module, extraction_name)
    feature_extraction = extraction_class(
        **extraction_config.get('params', {}))

    # -- ARCHITECTURE --
    architecture_config = config['architecture']
    architecture_name = architecture_config['name']
    models_module = __import__('pyannote.audio.labeling.models',
                               fromlist=[architecture_name])
    architecture_class = getattr(models_module, architecture_name)
    architecture = architecture_class(
        **architecture_config.get('params', {}))

    # -- SEQUENCE GENERATOR --
    sequences = config['sequences']
    batch_size = sequences.get('batch_size', 1024)
    duration = sequences['duration']
    step = sequences['step']
    balance = sequences['balance']
    batch_generator = ChangeDetectionBatchGenerator(
        feature_extraction, batch_size=batch_size, duration=duration,
        step=step, balance=balance)

    # one epoch covers the annotated duration once: one sequence every
    # `step`, grouped into batches of `batch_size`
    annotated_duration = protocol.stats(subset)['annotated']
    sequences_per_epoch = annotated_duration / step
    steps_per_epoch = int(np.ceil(sequences_per_epoch / batch_size))

    # input shape (n_frames, n_features)
    input_shape = batch_generator.shape

    labeling = SequenceLabeling()

    # generator that loops infinitely over the files of the subset
    training_files = getattr(protocol, subset)()
    batches = batch_generator(training_files, infinite=True)

    labeling.fit(input_shape, architecture, batches,
                 steps_per_epoch, n_epochs,
                 loss='binary_crossentropy',
                 optimizer=optimizer,
                 log_dir=train_dir)
def train(self, protocol_name, subset='train'):
    """Train the speech activity detection model on one protocol subset.

    Parameters
    ----------
    protocol_name : str
        Name handed to ``get_protocol``; also used to build the training
        directory from ``self.TRAIN_DIR``.
    subset : str, optional
        Name of the protocol method providing training files.
        Defaults to 'train'.

    Returns
    -------
    labeling : SequenceLabeling
        The object on which ``fit`` was called.
    """
    log_dir = self.TRAIN_DIR.format(experiment_dir=self.experiment_dir,
                                    protocol=protocol_name,
                                    subset=subset)

    # sequence batch generator
    sequences = self.config_['sequences']
    batch_size = sequences.get('batch_size', 8192)
    duration = sequences['duration']
    step = sequences['step']
    batch_generator = SpeechActivityDetectionBatchGenerator(
        self.feature_extraction_,
        duration=duration, step=step, batch_size=batch_size)
    batch_generator.cache_preprocessed_ = self.cache_preprocessed_

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    # number of batches needed to cover the annotated duration once
    annotated_duration = protocol.stats(subset)['annotated']
    sequences_per_epoch = annotated_duration / step
    steps_per_epoch = int(np.ceil(sequences_per_epoch / batch_size))

    # input shape (n_frames, n_features)
    input_shape = batch_generator.shape

    # generator that loops infinitely over all training files
    files = getattr(protocol, subset)()
    batches = batch_generator(files, infinite=True)

    labeling = SequenceLabeling()
    labeling.fit(input_shape, self.architecture_, batches,
                 steps_per_epoch, 1000,
                 optimizer=SSMORMS3(),
                 log_dir=log_dir)

    return labeling
def tune_binarizer(app, epoch, protocol_name, subset='development'):
    """Tune the binarizer thresholds for the model saved at a given epoch.

    Parameters
    ----------
    app : SpeechActivityDetection
        Provides `preprocessors_`, `train_dir_`, `config_` and
        `feature_extraction_`.
    epoch : int
        Epoch number of the model to load.
    protocol_name : str
        E.g. 'Etape.SpeakerDiarization.TV'
    subset : {'train', 'development', 'test'}, optional
        Defaults to 'development'.

    Returns
    -------
    params : dict
        See Binarize.tune
    metric : float
        Best achieved detection error rate
    """
    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=app.preprocessors_)

    # model as it was saved at the requested epoch
    model = SequenceLabeling.from_disk(app.train_dir_, epoch)

    # sliding-window aggregation of the model output
    sequences = app.config_['sequences']
    aggregation = SequenceLabelingAggregation(
        model, app.feature_extraction_,
        duration=sequences['duration'], step=sequences['step'])
    aggregation.cache_preprocessed_ = False

    # tune onset & offset thresholds with respect to detection error rate
    files = getattr(protocol, subset)()
    params, best_metric = Binarize.tune(files, aggregation.apply,
                                        get_metric=DetectionErrorRate,
                                        dimension=1)

    return params, best_metric
def train(dataset, medium_template, config_yml):
    """Train a stacked-LSTM sequence labeling model from a YAML config.

    Model weights and logs go to ``<directory of config_yml>/<dataset>``.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier that is split into exactly four parts:
        ``database.task.protocol.subset``.
    medium_template :
        Forwarded to ``get_database`` as ``medium_template``.
    config_yml : str
        Path to the YAML configuration file. The keys read are
        ``feature_extraction`` (``duration``, ``mfcc``, ``normalize``),
        ``network`` (``lstm``, ``dense``, ``bidirectional``) and
        ``training`` (``hours_per_epoch``, ``overlap``, ``batch_size``,
        ``nb_epoch``, ``optimizer``).

    Raises
    ------
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    """
    # NOTE(review): yaml.load is called without an explicit Loader; this
    # can build arbitrary objects and is rejected by recent PyYAML.
    # Consider yaml.safe_load once config files are known to be plain YAML.
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch.
    # os.path.join keeps the path relative when `config_yml` has no
    # directory component (plain concatenation would yield '/<dataset>').
    log_dir = os.path.join(workdir, dataset)

    # -- DATASET --
    db, task, protocol_name, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'subset "{subset}" is not available for dataset '
            '"{dataset}"'.format(subset=subset, dataset=dataset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- NETWORK STRUCTURE --
    network_config = config['network']
    # internal model structure
    lstm = network_config['lstm']
    dense = network_config['dense']
    # bi-directional
    bidirectional = network_config['bidirectional']

    # -- TRAINING --
    training_config = config['training']
    # number of training set hours (speech + non speech) used in each epoch
    # FIXME -- update ETAPE so that we can query this information directly
    hours_per_epoch = training_config['hours_per_epoch']
    # overlap ratio between each window
    overlap = training_config['overlap']
    batch_size = training_config['batch_size']
    nb_epoch = training_config['nb_epoch']
    optimizer = training_config['optimizer']

    # labeling
    n_classes = 2
    design_model = StackedLSTM(n_classes=n_classes, lstm=lstm,
                               bidirectional=bidirectional, dense=dense)
    labeling = SequenceLabeling(design_model, optimizer=optimizer,
                                log_dir=log_dir)

    # segment generator for training: consecutive windows are shifted
    # by the non-overlapping part of the window duration
    step = duration * (1. - overlap)
    batch_generator = SpeechActivityDetectionBatchGenerator(
        feature_extractor, duration=duration, normalize=normalize,
        step=step, batch_size=batch_size)

    # log loss and accuracy during training and
    # keep track of best models for both metrics
    log = [('train', 'loss'), ('train', 'accuracy')]
    callback = LoggingCallback(log_dir=log_dir, log=log)

    # number of samples per epoch, rounded up to a whole number of batches
    samples_per_epoch = batch_size * int(
        np.ceil((3600 * hours_per_epoch / step) / batch_size))

    # input shape (n_frames, n_features)
    input_shape = batch_generator.get_shape()

    generator = batch_generator(file_generator, infinite=True)

    labeling.fit(input_shape, generator, samples_per_epoch, nb_epoch,
                 callbacks=[callback])
def test(dataset, medium_template, config_yml, weights_h5, output_dir):
    """Evaluate a trained model on a dataset and write per-file results.

    For every file of the subset, predictions are aggregated and binarized,
    then detection error rate, accuracy, precision, recall and f-measure
    are appended to ``<output_dir>/eval.<dataset>.<subset>.txt``. The
    binarized hypothesis is dumped to ``<output_dir>/<uri>.json``. A final
    line reports the metrics accumulated over all files.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier that is split into exactly four parts:
        ``database.task.protocol.subset``.
    medium_template :
        Forwarded to ``get_database`` as ``medium_template``.
    config_yml : str
        Path to the YAML configuration file. The keys read are
        ``feature_extraction`` (``duration``, ``mfcc``, ``normalize``) and
        ``testing`` (``overlap``, ``binarize.onset``, ``binarize.offset``).
    weights_h5 : str
        Path to the model weights. ``architecture.yml`` is looked up two
        directory levels above this file.
    output_dir : str
        Directory receiving the evaluation file and the JSON dumps.

    Raises
    ------
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    """
    # NOTE(review): yaml.load is called without an explicit Loader; this
    # can build arbitrary objects and is rejected by recent PyYAML.
    # Consider yaml.safe_load once config files are known to be plain YAML.
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # this is where model architecture was saved.
    # os.path.join keeps the path relative when `weights_h5` has fewer
    # than two directory components (concatenation would yield
    # '/architecture.yml').
    architecture_yml = os.path.join(
        os.path.dirname(os.path.dirname(weights_h5)), 'architecture.yml')

    # -- DATASET --
    db, task, protocol_name, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'subset "{subset}" is not available for dataset '
            '"{dataset}"'.format(subset=subset, dataset=dataset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- TESTING --
    testing_config = config['testing']
    # overlap ratio between each window
    overlap = testing_config['overlap']
    step = duration * (1. - overlap)

    # prediction smoothing: honor the configured thresholds (they used to
    # be read from the configuration and then ignored in favor of 0.5)
    onset = testing_config['binarize']['onset']
    offset = testing_config['binarize']['offset']
    binarizer = Binarize(onset=onset, offset=offset)

    sequence_labeling = SequenceLabeling.from_disk(architecture_yml,
                                                   weights_h5)
    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extractor, normalize=normalize,
        duration=duration, step=step)

    collar = 0.500
    error_rate = DetectionErrorRate(collar=collar)
    accuracy = DetectionAccuracy(collar=collar)
    precision = DetectionPrecision(collar=collar)
    recall = DetectionRecall(collar=collar)

    LINE = '{uri} {e:.3f} {a:.3f} {p:.3f} {r:.3f} {f:.3f}\n'
    EVAL_PATH = '{output_dir}/eval.{dataset}.{subset}.txt'
    JSON_PATH = '{output_dir}/{uri}.json'

    eval_path = EVAL_PATH.format(output_dir=output_dir, dataset=dataset,
                                 subset=subset)

    with open(eval_path, 'w') as fp:

        header = '# uri error accuracy precision recall f_measure\n'
        fp.write(header)
        fp.flush()

        for current_file in file_generator:

            uri = current_file['uri']
            wav = current_file['medium']['wav']
            annotated = current_file['annotated']
            annotation = current_file['annotation']

            predictions = aggregation.apply(wav)
            hypothesis = binarizer.apply(predictions, dimension=1)

            # each call returns the value for this file; the metric
            # objects are queried again with abs() after the loop
            e = error_rate(annotation, hypothesis, uem=annotated)
            a = accuracy(annotation, hypothesis, uem=annotated)
            p = precision(annotation, hypothesis, uem=annotated)
            r = recall(annotation, hypothesis, uem=annotated)
            f = f_measure(p, r)

            # flush after every file so partial results survive a crash
            fp.write(LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f))
            fp.flush()

            dump_to(hypothesis,
                    JSON_PATH.format(output_dir=output_dir, uri=uri))

        # average on whole corpus
        uri = '{dataset}.{subset}'.format(dataset=dataset, subset=subset)
        e = abs(error_rate)
        a = abs(accuracy)
        p = abs(precision)
        r = abs(recall)
        f = f_measure(p, r)
        fp.write(LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f))
        fp.flush()
def apply(protocol, train_dir, store_dir, threshold, subset='development',
          epoch=None, min_duration=1.0):
    """Apply a trained model and save peak-detected segmentations.

    One ``<uri>.0.seg`` file per protocol file is written to
    ``<store_dir>/<threshold>/``. Each line has the form
    ``<uri> <index> 1 <start> <duration>``, where start and duration are
    the segment boundaries multiplied by 100 and truncated to integers.

    Parameters
    ----------
    protocol : object
        Must provide a method named after `subset` returning the files to
        process (each with a ``'uri'`` entry).
    train_dir : str
        Directory containing the per-epoch weights. ``config.yml`` is
        looked up two directory levels above it.
    store_dir : str
        Root output directory.
    threshold : float
        Passed to ``Peak`` as ``alpha``; also names the output
        sub-directory.
    subset : str, optional
        Defaults to 'development'.
    epoch : int, optional
        Epoch of the model to load. Defaults to the last epoch for which
        a weights file exists.
    min_duration : float, optional
        Passed to ``Peak``. Defaults to 1.0.

    Raises
    ------
    ValueError
        If `epoch` is not smaller than the number of available epochs.
    """
    # -- LOAD MODEL --
    # count consecutive epochs (starting at 0) that have a weights file
    nb_epoch = 0
    while True:
        weights_h5 = LoggingCallback.WEIGHTS_H5.format(log_dir=train_dir,
                                                       epoch=nb_epoch)
        if not os.path.isfile(weights_h5):
            break
        nb_epoch += 1

    # os.path.join keeps the path relative when `train_dir` has fewer than
    # two directory components (concatenation would yield '/config.yml')
    config_dir = os.path.dirname(os.path.dirname(train_dir))
    config_yml = os.path.join(config_dir, 'config.yml')

    # NOTE(review): yaml.load is called without an explicit Loader; this
    # can build arbitrary objects and is rejected by recent PyYAML.
    # Consider yaml.safe_load once config files are known to be plain YAML.
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- SEQUENCE GENERATOR --
    duration = config['sequences']['duration']
    step = config['sequences']['step']

    def save_seg(filepath, filename, segmentation):
        # the context manager closes the file even if a write fails
        with open(filepath, 'w') as f:
            for idx, val in enumerate(segmentation):
                start = int(val[0] * 100)
                length = int(val[1] * 100 - val[0] * 100)
                f.write('{0} {1} 1 {2} {3}\n'.format(
                    filename, idx, start, length))

    output_dir = store_dir + '/' + str(threshold) + '/'
    mkdir_p(output_dir)

    # -- CHOOSE MODEL --
    # resolve the default first: comparing None with an int raises
    # TypeError on Python 3
    if epoch is None:
        epoch = nb_epoch - 1
    elif epoch >= nb_epoch:
        # epochs are numbered from 0, so `nb_epoch` itself has no weights
        raise ValueError('Epoch should be less than ' + str(nb_epoch))

    sequence_labeling = SequenceLabeling.from_disk(train_dir, epoch)
    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extraction,
        duration=duration, step=step)

    # -- PREDICTION --
    predictions = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        predictions[uri] = aggregation.apply(dev_file)

    # initialize peak detection algorithm
    peak = Peak(alpha=threshold, min_duration=min_duration)

    for uri, prediction in predictions.items():
        hypothesis = peak.apply(prediction)
        save_seg(output_dir + uri + '.0.seg', uri, hypothesis)
def evaluate(protocol, train_dir, store_dir, subset='development',
             epoch=None, min_duration=1.0):
    """Report segmentation purity and coverage for a range of thresholds.

    Peak detection is run with 20 values of ``alpha`` evenly spaced in
    [0, 1]. For each value, purity and coverage accumulated over all files
    of the subset are printed and appended to ``<store_dir>/res.txt``.

    Parameters
    ----------
    protocol : object
        Must provide a method named after `subset` returning the files to
        process (each with ``'uri'`` and ``'annotation'`` entries).
    train_dir : str
        Directory containing the per-epoch weights. ``config.yml`` is
        looked up two directory levels above it.
    store_dir : str
        Output directory (created if needed).
    subset : str, optional
        Defaults to 'development'.
    epoch : int, optional
        Epoch of the model to load. Defaults to the last epoch for which
        a weights file exists.
    min_duration : float, optional
        Passed to ``Peak``. Defaults to 1.0.

    Raises
    ------
    ValueError
        If `epoch` is not smaller than the number of available epochs.
    """
    mkdir_p(store_dir)

    # -- LOAD MODEL --
    # count consecutive epochs (starting at 0) that have a weights file
    nb_epoch = 0
    while True:
        weights_h5 = LoggingCallback.WEIGHTS_H5.format(log_dir=train_dir,
                                                       epoch=nb_epoch)
        if not os.path.isfile(weights_h5):
            break
        nb_epoch += 1

    # os.path.join keeps the path relative when `train_dir` has fewer than
    # two directory components (concatenation would yield '/config.yml')
    config_dir = os.path.dirname(os.path.dirname(train_dir))
    config_yml = os.path.join(config_dir, 'config.yml')

    # NOTE(review): yaml.load is called without an explicit Loader; this
    # can build arbitrary objects and is rejected by recent PyYAML.
    # Consider yaml.safe_load once config files are known to be plain YAML.
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # -- FEATURE EXTRACTION --
    feature_extraction_name = config['feature_extraction']['name']
    features = __import__('pyannote.audio.features',
                          fromlist=[feature_extraction_name])
    FeatureExtraction = getattr(features, feature_extraction_name)
    feature_extraction = FeatureExtraction(
        **config['feature_extraction'].get('params', {}))

    # -- SEQUENCE GENERATOR --
    duration = config['sequences']['duration']
    step = config['sequences']['step']

    groundtruth = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        groundtruth[uri] = dev_file['annotation']

    # -- CHOOSE MODEL --
    # resolve the default first: comparing None with an int raises
    # TypeError on Python 3
    if epoch is None:
        epoch = nb_epoch - 1
    elif epoch >= nb_epoch:
        # epochs are numbered from 0, so `nb_epoch` itself has no weights
        raise ValueError('Epoch should be less than ' + str(nb_epoch))

    sequence_labeling = SequenceLabeling.from_disk(train_dir, epoch)
    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extraction,
        duration=duration, step=step)

    # -- PREDICTION --
    predictions = {}
    for dev_file in getattr(protocol, subset)():
        uri = dev_file['uri']
        predictions[uri] = aggregation.apply(dev_file)

    # one metric accumulator per threshold
    alphas = np.linspace(0, 1, 20)
    purity = [SegmentationPurity(parallel=False) for _ in alphas]
    coverage = [SegmentationCoverage(parallel=False) for _ in alphas]

    # -- SAVE RESULTS --
    for i, alpha in enumerate(alphas):
        # initialize peak detection algorithm
        peak = Peak(alpha=alpha, min_duration=min_duration)
        for uri, reference in groundtruth.items():
            # apply peak detection
            hypothesis = peak.apply(predictions[uri])
            # accumulate purity and coverage
            purity[i](reference, hypothesis)
            coverage[i](reference, hypothesis)

    TEMPLATE = '{alpha:g} {purity:.3f}% {coverage:.3f}%'
    # results are appended, so earlier runs are preserved in res.txt
    with open(store_dir + '/res.txt', 'a') as fp:
        for alpha, alpha_purity, alpha_coverage in zip(alphas, purity,
                                                       coverage):
            line = TEMPLATE.format(alpha=alpha,
                                   purity=100 * abs(alpha_purity),
                                   coverage=100 * abs(alpha_coverage))
            print(line)
            fp.write(line + '\n')
def apply(self, protocol_name, subset='test'):
    """Apply the tuned model to a subset and save raw and hard decisions.

    Raw predictions are stored as one HDF5 file per item under the
    directory given by ``self.APPLY_DIR``, along with a metadata file
    describing the sliding window. They are then read back through
    ``Precomputed``, binarized with the tuned onset/offset thresholds and
    written to the MDTM file given by ``self.HARD_MDTM``.

    Parameters
    ----------
    protocol_name : str
        Name handed to ``get_protocol``.
    subset : str, optional
        Name of the protocol method providing the files to process.
        Defaults to 'test'.
    """

    def write_sliding_window(h5_file, sliding_window):
        # sliding window and dimension information shared by the metadata
        # file and by every per-item file
        h5_file.attrs['start'] = sliding_window.start
        h5_file.attrs['duration'] = sliding_window.duration
        h5_file.attrs['step'] = sliding_window.step
        h5_file.attrs['dimension'] = 2

    apply_dir = self.APPLY_DIR.format(tune_dir=self.tune_dir_)
    mkdir_p(apply_dir)

    # load tuning results
    # NOTE(review): yaml.load is called without an explicit Loader; this
    # can build arbitrary objects and is rejected by recent PyYAML.
    # Consider yaml.safe_load once tuning files are known to be plain YAML.
    tune_yml = self.TUNE_YML.format(tune_dir=self.tune_dir_)
    with io.open(tune_yml, 'r') as fp:
        self.tune_ = yaml.load(fp)

    # load model for epoch 'epoch'
    epoch = self.tune_['epoch']
    sequence_labeling = SequenceLabeling.from_disk(self.train_dir_, epoch)

    # initialize sequence labeling
    duration = self.config_['sequences']['duration']
    step = self.config_['sequences']['step']
    aggregation = SequenceLabelingAggregation(
        sequence_labeling, self.feature_extraction_,
        duration=duration, step=step)

    # initialize protocol
    protocol = get_protocol(protocol_name, progress=True,
                            preprocessors=self.preprocessors_)

    for i, item in enumerate(getattr(protocol, subset)()):

        prediction = aggregation.apply(item)

        if i == 0:
            # create metadata file at root that contains
            # sliding window and dimension information.
            # The mode is explicit because h5py >= 3 opens files
            # read-only by default; 'a' matches the former default.
            path = Precomputed.get_config_path(apply_dir)
            with h5py.File(path, mode='a') as f:
                write_sliding_window(f, prediction.sliding_window)

        path = Precomputed.get_path(apply_dir, item)

        # create parent directory
        mkdir_p(dirname(path))

        # the context manager closes the file even if a write fails
        with h5py.File(path, mode='a') as f:
            write_sliding_window(f, prediction.sliding_window)
            f.create_dataset('features', data=prediction.data)

    # initialize binarizer
    onset = self.tune_['onset']
    offset = self.tune_['offset']
    binarize = Binarize(onset=onset, offset=offset)

    precomputed = Precomputed(root_dir=apply_dir)

    writer = MDTMParser()
    path = self.HARD_MDTM.format(apply_dir=apply_dir,
                                 protocol=protocol_name,
                                 subset=subset)
    with io.open(path, mode='w') as gp:
        for item in getattr(protocol, subset)():
            prediction = precomputed(item)
            segmentation = binarize.apply(prediction, dimension=1)
            writer.write(segmentation.to_annotation(),
                         f=gp, uri=item['uri'], modality='speaker')
def validate(self, protocol_name, subset='development'):
    """Watch the training directory and evaluate every epoch as it lands.

    For each epoch, starting at 0, this waits for the weights file to
    exist, computes the equal error rate averaged over the files of the
    subset, appends it to the validation text file and redraws the EER
    curve (PNG and EPS). The loop never terminates on its own; it is
    expected to be interrupted by the user.

    Parameters
    ----------
    protocol_name : str
        Name handed to ``get_protocol``; also used to build the
        validation directory.
    subset : str, optional
        Name of the protocol method providing the files to evaluate.
        Defaults to 'development'.
    """
    # prepare paths
    validate_dir = self.VALIDATE_DIR.format(train_dir=self.train_dir_,
                                            protocol=protocol_name)
    validate_txt = self.VALIDATE_TXT.format(validate_dir=validate_dir,
                                            subset=subset)
    validate_png = self.VALIDATE_PNG.format(validate_dir=validate_dir,
                                            subset=subset)
    validate_eps = self.VALIDATE_EPS.format(validate_dir=validate_dir,
                                            subset=subset)

    # create validation directory
    mkdir_p(validate_dir)

    # build validation set
    y = self._validation_set(protocol_name, subset=subset)

    # list of equal error rates, and current epoch
    eers, epoch = [], 0

    desc_format = ('EER = {eer:.2f}% @ epoch #{epoch:d} ::'
                   ' Best EER = {best_eer:.2f}% @ epoch #{best_epoch:d} :')

    # the window duration does not change between epochs
    duration = self.config_['sequences']['duration']
    step = duration  # hack to make things faster
    # step = self.config_['sequences']['step']

    progress_bar = tqdm(unit='epoch', total=1000)

    # the loop below only ends through an exception (e.g. Ctrl-C), so the
    # progress bar has to be closed in a `finally` clause to be closed
    # at all
    try:
        with open(validate_txt, mode='w') as fp:

            # watch and evaluate forever
            while True:

                weights_h5 = LoggingCallback.WEIGHTS_H5.format(
                    log_dir=self.train_dir_, epoch=epoch)

                # wait until weight file is available
                if not isfile(weights_h5):
                    time.sleep(60)
                    continue

                # load model for current epoch
                sequence_labeling = SequenceLabeling.from_disk(
                    self.train_dir_, epoch)

                # initialize sequence labeling
                aggregation = SequenceLabelingAggregation(
                    sequence_labeling, self.feature_extraction_,
                    duration=duration, step=step)
                aggregation.cache_preprocessed_ = False

                # estimate equal error rate (average of all files)
                eers_ = []

                protocol = get_protocol(protocol_name, progress=False,
                                        preprocessors=self.preprocessors_)
                file_generator = getattr(protocol, subset)()
                for current_file in file_generator:

                    identifier = get_unique_identifier(current_file)
                    uem = get_annotated(current_file)
                    y_true = y[identifier].crop(uem)[:, 1]

                    # skip files where label 0 or label 1 never occurs
                    counts = Counter(y_true)
                    if counts[0] * counts[1] == 0:
                        continue

                    y_pred = aggregation.apply(current_file).crop(
                        uem)[:, 1]

                    _, _, _, eer = det_curve(y_true, y_pred,
                                             distances=False)

                    eers_.append(eer)

                # NOTE(review): if every file was skipped, `eers_` is
                # empty and np.mean returns nan (with a warning).
                eer = np.mean(eers_)
                eers.append(eer)

                # save equal error rate to file
                fp.write(self.VALIDATE_TXT_TEMPLATE.format(epoch=epoch,
                                                           eer=eer))
                fp.flush()

                # keep track of best epoch so far
                best_epoch, best_eer = np.argmin(eers), np.min(eers)

                progress_bar.set_description(
                    desc_format.format(epoch=epoch, eer=100 * eer,
                                       best_epoch=best_epoch,
                                       best_eer=100 * best_eer))
                progress_bar.update(1)

                # plot
                fig = plt.figure()
                plt.plot(eers, 'b')
                plt.plot([best_epoch], [best_eer], 'bo')
                plt.plot([0, epoch], [best_eer, best_eer], 'k--')
                plt.grid(True)
                plt.xlabel('epoch')
                plt.ylabel('EER on {subset}'.format(subset=subset))
                TITLE = '{best_eer:.5g} @ epoch #{best_epoch:d}'
                title = TITLE.format(best_eer=best_eer,
                                     best_epoch=best_epoch,
                                     subset=subset)
                plt.title(title)
                plt.tight_layout()
                plt.savefig(validate_png, dpi=75)
                plt.savefig(validate_eps)
                plt.close(fig)

                # validate next epoch
                epoch += 1
    finally:
        progress_bar.close()