def train(dataset, medium_template, config_yml):
    """Train a stacked-LSTM sequence labeling model with two classes.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template : dict
        Passed as-is to `get_database`.
    config_yml : str
        Path to the YAML configuration file. Its parent directory is used
        as working directory: logs go to '<workdir>/<dataset>'.

    Raises
    ------
    NotImplementedError
        If the protocol object has no attribute named after `subset`.

    Notes
    -----
    Configuration keys read here:
    feature_extraction.{duration, mfcc, normalize},
    network.{lstm, dense, bidirectional},
    training.{hours_per_epoch, overlap, batch_size, nb_epoch, optimizer}.
    """

    # load configuration file
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load once it is confirmed that configuration files only
    # contain plain YAML types.
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch.
    # os.path.join (rather than workdir + '/' + dataset) keeps the path
    # relative when config_yml has no directory component: dirname is then
    # '' and plain concatenation would point at '/<dataset>'.
    log_dir = os.path.join(workdir, dataset)

    # -- DATASET --
    db, task, protocol_name, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'Protocol "{db}.{task}.{protocol}" has no "{subset}" '
            'subset.'.format(db=db, task=task, protocol=protocol_name,
                             subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- NETWORK STRUCTURE --
    network_config = config['network']
    # internal model structure
    lstm = network_config['lstm']
    dense = network_config['dense']
    # bi-directional
    bidirectional = network_config['bidirectional']

    # -- TRAINING --
    training_config = config['training']
    # number training set hours (speech + non speech) to use in each epoch
    # FIXME -- update ETAPE so that we can query this information directly
    hours_per_epoch = training_config['hours_per_epoch']
    # overlap ratio between each window
    overlap = training_config['overlap']
    # batch size
    batch_size = training_config['batch_size']
    # number of epochs
    nb_epoch = training_config['nb_epoch']
    # optimizer
    optimizer = training_config['optimizer']

    # labeling
    n_classes = 2
    design_model = StackedLSTM(n_classes=n_classes, lstm=lstm,
                               bidirectional=bidirectional, dense=dense)
    labeling = SequenceLabeling(design_model, optimizer=optimizer,
                                log_dir=log_dir)

    # segment generator for training
    step = duration * (1. - overlap)
    batch_generator = SpeechActivityDetectionBatchGenerator(
        feature_extractor, duration=duration, normalize=normalize,
        step=step, batch_size=batch_size)

    # log loss and accuracy during training and
    # keep track of best models for both metrics
    log = [('train', 'loss'), ('train', 'accuracy')]
    callback = LoggingCallback(log_dir=log_dir, log=log)

    # number of samples per epoch, rounded up to a multiple of batch_size
    samples_per_epoch = batch_size * int(
        np.ceil((3600 * hours_per_epoch / step) / batch_size))

    # input shape (n_frames, n_features)
    input_shape = batch_generator.get_shape()

    generator = batch_generator(file_generator, infinite=True)

    labeling.fit(input_shape, generator, samples_per_epoch, nb_epoch,
                 callbacks=[callback])
def test(dataset, medium_template, config_yml, weights_h5, output_dir):
    """Evaluate a trained sequence labeling model on a dataset subset.

    For every file of the subset, predictions are aggregated, binarized and
    scored (detection error rate, accuracy, precision, recall, f-measure).
    One line per file plus one corpus-level line is written to
    '<output_dir>/eval.<dataset>.<subset>.txt', and each binarized
    hypothesis is dumped to '<output_dir>/<uri>.json'.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template : dict
        Passed as-is to `get_database`.
    config_yml : str
        Path to the YAML configuration file.
    weights_h5 : str
        Path to the model weights. 'architecture.yml' is looked up two
        directory levels above this file.
    output_dir : str
        Directory where evaluation results and hypotheses are written.

    Raises
    ------
    NotImplementedError
        If the protocol object has no attribute named after `subset`.

    Notes
    -----
    Configuration keys read here:
    feature_extraction.{duration, mfcc, normalize},
    testing.overlap, testing.binarize.{onset, offset}.
    """

    # load configuration file
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load once it is confirmed that configuration files only
    # contain plain YAML types.
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # this is where model architecture was saved.
    # os.path.join keeps the path relative when the grand-parent directory
    # of weights_h5 is '' (string concatenation would yield
    # '/architecture.yml' at the filesystem root).
    architecture_yml = os.path.join(
        os.path.dirname(os.path.dirname(weights_h5)), 'architecture.yml')

    # -- DATASET --
    db, task, protocol_name, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'Protocol "{db}.{task}.{protocol}" has no "{subset}" '
            'subset.'.format(db=db, task=task, protocol=protocol_name,
                             subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- TESTING --
    testing_config = config['testing']
    # overlap ratio between each window
    overlap = testing_config['overlap']
    step = duration * (1. - overlap)

    # prediction smoothing: use the thresholds from the configuration file
    # (they used to be read and then ignored in favor of hard-coded 0.5)
    onset = testing_config['binarize']['onset']
    offset = testing_config['binarize']['offset']
    binarizer = Binarize(onset=onset, offset=offset)

    sequence_labeling = SequenceLabeling.from_disk(
        architecture_yml, weights_h5)

    aggregation = SequenceLabelingAggregation(
        sequence_labeling, feature_extractor, normalize=normalize,
        duration=duration, step=step)

    # all four metrics share the same collar
    collar = 0.500
    error_rate = DetectionErrorRate(collar=collar)
    accuracy = DetectionAccuracy(collar=collar)
    precision = DetectionPrecision(collar=collar)
    recall = DetectionRecall(collar=collar)

    LINE = '{uri} {e:.3f} {a:.3f} {p:.3f} {r:.3f} {f:.3f}\n'

    # NOTE(review): `dataset` already ends with `subset`, so the subset name
    # appears twice in the file name (and in the corpus-level uri below).
    # Kept unchanged so existing result paths remain valid.
    PATH = '{output_dir}/eval.{dataset}.{subset}.txt'
    path = PATH.format(output_dir=output_dir, dataset=dataset, subset=subset)

    # per-file hypothesis dump (separate name, so PATH is not rebound
    # inside the loop)
    HYPOTHESIS_PATH = '{output_dir}/{uri}.json'

    with open(path, 'w') as fp:

        header = '# uri error accuracy precision recall f_measure\n'
        fp.write(header)
        fp.flush()

        for current_file in file_generator:

            uri = current_file['uri']
            wav = current_file['medium']['wav']
            annotated = current_file['annotated']
            annotation = current_file['annotation']

            predictions = aggregation.apply(wav)
            hypothesis = binarizer.apply(predictions, dimension=1)

            # calling a metric returns the per-file value; the metric
            # objects are reused after the loop for the corpus-level values
            e = error_rate(annotation, hypothesis, uem=annotated)
            a = accuracy(annotation, hypothesis, uem=annotated)
            p = precision(annotation, hypothesis, uem=annotated)
            r = recall(annotation, hypothesis, uem=annotated)
            f = f_measure(p, r)

            line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
            fp.write(line)
            # flush after each file so partial results survive a crash
            fp.flush()

            hypothesis_path = HYPOTHESIS_PATH.format(
                output_dir=output_dir, uri=uri)
            dump_to(hypothesis, hypothesis_path)

        # average on whole corpus
        uri = '{dataset}.{subset}'.format(dataset=dataset, subset=subset)
        e = abs(error_rate)
        a = abs(accuracy)
        p = abs(precision)
        r = abs(recall)
        f = f_measure(p, r)
        line = LINE.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
        fp.write(line)
        fp.flush()
def generate_test(dataset, medium_template, config):
    """Build a fixed set of test sequences and their pairwise ground truth.

    All labeled sequences of the subset are extracted, `per_label` of them
    are drawn (with replacement) for every label, and the pairwise
    "same label" relation between the selected sequences is computed.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template : dict
        Passed as-is to `get_database`.
    config : dict
        Already-loaded configuration. Keys read here:
        feature_extraction.{duration, mfcc, normalize},
        testing.{overlap, per_label}.

    Returns
    -------
    X : np.ndarray
        Selected sequences (`per_label` per label, grouped by label).
    y_true : np.ndarray
        Boolean vector in the condensed form returned by `pdist`: True
        where the two sequences of a pair share the same label.

    Raises
    ------
    NotImplementedError
        If the protocol object has no attribute named after `subset`.

    Notes
    -----
    Reseeds numpy's global random generator with 1337 as a side effect.
    """

    # -- DATASET --
    db, task, protocol_name, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'Protocol "{db}.{task}.{protocol}" has no "{subset}" '
            'subset.'.format(db=db, task=task, protocol=protocol_name,
                             subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    overlap = config['testing']['overlap']
    per_label = config['testing']['per_label']

    # the generator is always built with batch_size=-1 here
    batch_generator = LabeledFixedDurationSequencesBatchGenerator(
        feature_extractor, duration=duration, normalize=normalize,
        step=(1 - overlap) * duration, batch_size=-1)

    # gather every batch of sequences and the corresponding labels
    X, y = [], []
    for sequences, labels in batch_generator(file_generator):
        X.append(sequences)
        y.append(labels)
    X = np.vstack(X)
    y = np.hstack(y)

    # replace labels by their index in the sorted array of unique labels
    unique, y = np.unique(y, return_inverse=True)

    # randomly (but deterministically) select 'per_label' samples from each
    # class, and only compute (positive vs. negative) distances for those
    # samples. this should ensure all speakers have the same weights
    np.random.seed(1337)

    # indices contains the list of indices of all sequences
    # to be used for later triplet selection
    indices = []

    for label in range(len(unique)):
        # randomly choose 'per_label' sequences from the set of available
        # sequences. sampling is done with replacement, so a label with
        # fewer than 'per_label' sequences still yields 'per_label' indices
        i = np.random.choice(np.where(y == label)[0],
                             size=per_label, replace=True)
        # append indices of selected sequences
        indices.append(i)

    # turn indices into a 1-dimensional numpy array.
    indices = np.hstack(indices)

    # selected sequences
    X = X[indices]

    # their pairwise similarity: labels are integers, so a Chebyshev
    # distance below 1 between two of them means they are equal
    y_true = pdist(y[indices, np.newaxis], metric='chebyshev') < 1

    return X, y_true
def train(dataset, medium_template, config_yml):
    """Train a TristouNet sequence embedding with the triplet loss.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template : dict
        Passed as-is to `get_database`.
    config_yml : str
        Path to the YAML configuration file. Its parent directory is used
        as working directory: logs go to '<workdir>/<dataset>'.

    Raises
    ------
    NotImplementedError
        If the protocol object has no attribute named after `subset`.

    Notes
    -----
    Configuration keys read here:
    feature_extraction.{duration, mfcc, normalize},
    network.{output_dim, lstm, pooling (optional, defaults to 'last'),
    dense, bidirectional, space},
    training.{batch_size, nb_epoch, optimizer},
    training.triplet_loss.{margin, per_fold, per_label, overlap}.
    """

    # load configuration file
    # NOTE(review): yaml.load without an explicit Loader can build arbitrary
    # Python objects and is rejected by recent PyYAML releases; consider
    # yaml.safe_load once it is confirmed that configuration files only
    # contain plain YAML types.
    with open(config_yml, 'r') as fp:
        config = yaml.load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch.
    # os.path.join (rather than workdir + '/' + dataset) keeps the path
    # relative when config_yml has no directory component: dirname is then
    # '' and plain concatenation would point at '/<dataset>'.
    log_dir = os.path.join(workdir, dataset)

    # -- DATASET --
    db, task, protocol_name, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'Protocol "{db}.{task}.{protocol}" has no "{subset}" '
            'subset.'.format(db=db, task=task, protocol=protocol_name,
                             subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    feature_config = config['feature_extraction']
    # input sequence duration
    duration = feature_config['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**feature_config['mfcc'])
    # normalization
    normalize = feature_config['normalize']

    # -- NETWORK STRUCTURE --
    network_config = config['network']
    # internal model structure
    output_dim = network_config['output_dim']
    lstm = network_config['lstm']
    pooling = network_config.get('pooling', 'last')
    dense = network_config['dense']
    # bi-directional
    bidirectional = network_config['bidirectional']
    space = network_config['space']

    # -- TRAINING --
    training_config = config['training']
    # batch size
    batch_size = training_config['batch_size']
    # number of epochs
    nb_epoch = training_config['nb_epoch']
    # optimizer
    optimizer = training_config['optimizer']

    # -- TRIPLET LOSS --
    triplet_config = training_config['triplet_loss']
    margin = triplet_config['margin']
    per_fold = triplet_config['per_fold']
    per_label = triplet_config['per_label']
    overlap = triplet_config['overlap']

    # embedding
    get_embedding = TristouNet(
        lstm=lstm, bidirectional=bidirectional, pooling=pooling,
        dense=dense, output_dim=output_dim, space=space)

    loss = TripletLoss(get_embedding, margin=margin)

    embedding = SequenceEmbedding(
        loss=loss, optimizer=optimizer, log_dir=log_dir)

    # triplet generator for training
    generator = TripletBatchGenerator(
        feature_extractor, file_generator, embedding,
        margin=margin, duration=duration, overlap=overlap,
        normalize=normalize, per_fold=per_fold, per_label=per_label,
        batch_size=batch_size)

    # log loss during training and keep track of best model
    log = [('train', 'loss')]
    callback = LoggingCallback(log_dir=log_dir, log=log,
                               get_model=loss.get_embedding)

    # estimated number of triplets per epoch
    # (rounded down to a multiple of batch_size)
    samples_per_epoch = per_label * (per_label - 1) * generator.n_labels
    samples_per_epoch = samples_per_epoch - (samples_per_epoch % batch_size)

    # input shape (n_samples, n_features)
    input_shape = generator.get_shape()

    embedding.fit(input_shape, generator, samples_per_epoch, nb_epoch,
                  callbacks=[callback])
# ---- </edit> ---------------------------------------------------------------

import sys

# sequence duration (in seconds), taken from the first command line argument
duration = float(sys.argv[1])

# results for each duration go to their own sub-directory of LOG_DIR
LOG_DIR += '/{duration:.1f}s'.format(duration=duration)

import numpy as np

# for reproducibility
np.random.seed(1337)

# feature extraction
from pyannote.audio.features.yaafe import YaafeMFCC
feature_extractor = YaafeMFCC(
    coefs=11,
    e=False, De=False, DDe=False,
    D=False, DD=False,
)

# ETAPE database
from pyannote.database import Etape
medium_template = {'wav': WAV_TEMPLATE}
database = Etape(medium_template=medium_template)

# experimental protocol (ETAPE TV subset)
protocol = database.get_protocol('SpeakerDiarization', 'TV')

# Gaussian divergence segmentation over windows of `duration` seconds
from pyannote.audio.segmentation import GaussianDivergenceSegmentation
segmentation = GaussianDivergenceSegmentation(
    feature_extractor,
    duration=duration,
    step=0.1,
)