Python YaafeMFCCの例、pyannote.audio.features.yaafe.YaafeMFCC Pythonの例

コード例 #1

0

ファイルを表示

ファイル: speech_activity_detection.py プロジェクト: GregGovit/pyannote-audio

def train(dataset, medium_template, config_yml):
    """Train a stacked-LSTM sequence labeling model for speech activity detection.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file. The directory containing this
        file is used as working directory: logs and weights go to
        '<workdir>/<dataset>'.

    Raises
    ------
    ValueError
        If `dataset` does not split into exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    """

    # load configuration file
    # safe_load: plain-data configuration needs no arbitrary object
    # construction, and yaml.load() without a Loader fails on PyYAML >= 6
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch
    log_dir = workdir + '/' + dataset

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'protocol "{dataset}" has no subset "{subset}"'.format(
                dataset=dataset, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    # input sequence duration
    duration = config['feature_extraction']['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**config['feature_extraction']['mfcc'])
    # normalization
    normalize = config['feature_extraction']['normalize']

    # -- NETWORK STRUCTURE --
    # internal model structure
    lstm = config['network']['lstm']
    dense = config['network']['dense']
    # bi-directional
    bidirectional = config['network']['bidirectional']

    # -- TRAINING --
    # number training set hours (speech + non speech) to use in each epoch
    # FIXME -- update ETAPE so that we can query this information directly
    hours_per_epoch = config['training']['hours_per_epoch']
    # overlap ratio between each window
    overlap = config['training']['overlap']
    # batch size
    batch_size = config['training']['batch_size']
    # number of epochs
    nb_epoch = config['training']['nb_epoch']
    # optimizer
    optimizer = config['training']['optimizer']

    # labeling (two output classes)
    n_classes = 2
    design_model = StackedLSTM(n_classes=n_classes,
                               lstm=lstm,
                               bidirectional=bidirectional,
                               dense=dense)

    labeling = SequenceLabeling(design_model,
                                optimizer=optimizer,
                                log_dir=log_dir)

    # segment generator for training
    # consecutive windows are shifted by the non-overlapping part of a window
    step = duration * (1. - overlap)
    batch_generator = SpeechActivityDetectionBatchGenerator(
        feature_extractor,
        duration=duration,
        normalize=normalize,
        step=step,
        batch_size=batch_size)

    # log loss and accuracy during training and
    # keep track of best models for both metrics
    log = [('train', 'loss'), ('train', 'accuracy')]
    callback = LoggingCallback(log_dir=log_dir, log=log)

    # number of samples per epoch: one window every `step` seconds over
    # `hours_per_epoch` hours, rounded up to a whole number of batches
    samples_per_epoch = batch_size * int(
        np.ceil((3600 * hours_per_epoch / step) / batch_size))

    # input shape (n_frames, n_features)
    input_shape = batch_generator.get_shape()

    generator = batch_generator(file_generator, infinite=True)

    labeling.fit(input_shape,
                 generator,
                 samples_per_epoch,
                 nb_epoch,
                 callbacks=[callback])

コード例 #2

0

ファイルを表示

ファイル: speech_activity_detection.py プロジェクト: GregGovit/pyannote-audio

def test(dataset, medium_template, config_yml, weights_h5, output_dir):
    """Evaluate a trained speech activity detection model on a dataset subset.

    For every file of the subset, predictions are aggregated, binarized and
    scored. One line per file (plus one final line for the whole subset) is
    written to '<output_dir>/eval.<dataset>.<subset>.txt', and each
    hypothesis is dumped to '<output_dir>/<uri>.json'.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file.
    weights_h5 : str
        Path to the model weights. The architecture file is looked up as
        'architecture.yml' two directory levels above this file.
    output_dir : str
        Directory receiving the evaluation report and hypotheses.

    Raises
    ------
    ValueError
        If `dataset` does not split into exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    """

    # load configuration file
    # safe_load: plain-data configuration needs no arbitrary object
    # construction, and yaml.load() without a Loader fails on PyYAML >= 6
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # this is where model architecture was saved
    architecture_yml = os.path.dirname(
        os.path.dirname(weights_h5)) + '/architecture.yml'

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'protocol "{dataset}" has no subset "{subset}"'.format(
                dataset=dataset, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    # input sequence duration
    duration = config['feature_extraction']['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**config['feature_extraction']['mfcc'])
    # normalization
    normalize = config['feature_extraction']['normalize']

    # -- TESTING --
    # overlap ratio between each window
    overlap = config['testing']['overlap']
    step = duration * (1. - overlap)

    # prediction smoothing
    # use the thresholds from the configuration file (they were previously
    # read but ignored in favour of hard-coded 0.5 values)
    onset = config['testing']['binarize']['onset']
    offset = config['testing']['binarize']['offset']
    binarizer = Binarize(onset=onset, offset=offset)

    sequence_labeling = SequenceLabeling.from_disk(architecture_yml,
                                                   weights_h5)

    aggregation = SequenceLabelingAggregation(sequence_labeling,
                                              feature_extractor,
                                              normalize=normalize,
                                              duration=duration,
                                              step=step)

    # all four metrics share the same collar
    collar = 0.500
    error_rate = DetectionErrorRate(collar=collar)
    accuracy = DetectionAccuracy(collar=collar)
    precision = DetectionPrecision(collar=collar)
    recall = DetectionRecall(collar=collar)

    line_template = '{uri} {e:.3f} {a:.3f} {p:.3f} {r:.3f} {f:.3f}\n'

    report_template = '{output_dir}/eval.{dataset}.{subset}.txt'
    report_path = report_template.format(
        output_dir=output_dir, dataset=dataset, subset=subset)

    hypothesis_template = '{output_dir}/{uri}.json'

    with open(report_path, 'w') as fp:

        header = '# uri error accuracy precision recall f_measure\n'
        fp.write(header)
        # flush after every line so partial results survive an interruption
        fp.flush()

        for current_file in file_generator:

            uri = current_file['uri']
            wav = current_file['medium']['wav']
            annotated = current_file['annotated']
            annotation = current_file['annotation']

            predictions = aggregation.apply(wav)
            hypothesis = binarizer.apply(predictions, dimension=1)

            # per-file scores, restricted to the annotated regions
            e = error_rate(annotation, hypothesis, uem=annotated)
            a = accuracy(annotation, hypothesis, uem=annotated)
            p = precision(annotation, hypothesis, uem=annotated)
            r = recall(annotation, hypothesis, uem=annotated)
            f = f_measure(p, r)

            line = line_template.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
            fp.write(line)
            fp.flush()

            hypothesis_path = hypothesis_template.format(
                output_dir=output_dir, uri=uri)
            dump_to(hypothesis, hypothesis_path)

        # average on whole corpus
        # NOTE(review): abs(metric) presumably returns the value accumulated
        # over all files scored above -- confirm against the metric classes
        uri = '{dataset}.{subset}'.format(dataset=dataset, subset=subset)
        e = abs(error_rate)
        a = abs(accuracy)
        p = abs(precision)
        r = abs(recall)
        f = f_measure(p, r)
        line = line_template.format(uri=uri, e=e, a=a, p=p, r=r, f=f)
        fp.write(line)
        fp.flush()

コード例 #3

0

ファイルを表示

ファイル: speaker_embedding.py プロジェクト: GregGovit/pyannote-audio

def generate_test(dataset, medium_template, config):
    """Build a fixed set of test sequences and their pairwise ground truth.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config : dict
        Already-loaded configuration; the 'feature_extraction' and 'testing'
        sections are read.

    Returns
    -------
    X : numpy array
        The selected sequences ('per_label' draws, with replacement, for
        each distinct label).
    y_true : numpy boolean array
        Condensed pairwise vector (as returned by `pdist`), True where the
        two sequences of a pair carry the same label.

    Raises
    ------
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    """

    # -- DATASET --
    db_name, task_name, protocol_name, subset = dataset.split('.')
    database = get_database(db_name, medium_template=medium_template)
    protocol = database.get_protocol(task_name, protocol_name)

    if not hasattr(protocol, subset):
        raise NotImplementedError('')

    files = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    extraction = config['feature_extraction']
    # input sequence duration
    duration = extraction['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**extraction['mfcc'])
    # normalization
    normalize = extraction['normalize']

    # -- TESTING --
    testing = config['testing']
    overlap = testing['overlap']
    per_label = testing['per_label']
    # looked up (so a missing key still fails here) but not used below:
    # the generator is given batch_size=-1
    _unused_batch_size = testing['batch_size']

    batch_generator = LabeledFixedDurationSequencesBatchGenerator(
        feature_extractor,
        duration=duration,
        normalize=normalize,
        step=(1 - overlap) * duration,
        batch_size=-1)

    # gather everything the generator yields, then stack into two arrays
    sequence_chunks = []
    label_chunks = []
    for sequences, labels in batch_generator(files):
        sequence_chunks.append(sequences)
        label_chunks.append(labels)
    X = np.vstack(sequence_chunks)
    y = np.hstack(label_chunks)

    # replace labels by their integer index among the distinct labels
    distinct_labels, y, _ = np.unique(
        y, return_inverse=True, return_counts=True)

    # fixed seed: the random selection below is the same on every call, and
    # drawing 'per_label' samples per class gives every label equal weight
    np.random.seed(1337)

    # for each label in turn, draw 'per_label' sequence indices (with
    # replacement), then concatenate into a 1-dimensional array
    indices = np.hstack([
        np.random.choice(np.flatnonzero(y == label),
                         size=per_label,
                         replace=True)
        for label in range(len(distinct_labels))
    ])

    # selected sequences
    X = X[indices]

    # pairwise similarity: integer label indices are less than 1 apart
    # only when they are equal
    selected_labels = y[indices, np.newaxis]
    y_true = pdist(selected_labels, metric='chebyshev') < 1

    return X, y_true

コード例 #4

0

ファイルを表示

ファイル: speaker_embedding.py プロジェクト: GregGovit/pyannote-audio

def train(dataset, medium_template, config_yml):
    """Train a TristouNet speaker embedding with the triplet loss.

    Parameters
    ----------
    dataset : str
        Dot-separated identifier of the form 'db.task.protocol.subset'.
    medium_template
        Forwarded as-is to `get_database(..., medium_template=...)`.
    config_yml : str
        Path to the YAML configuration file. The directory containing this
        file is used as working directory: logs and weights go to
        '<workdir>/<dataset>'.

    Raises
    ------
    ValueError
        If `dataset` does not split into exactly four dot-separated parts.
    NotImplementedError
        If the protocol has no attribute named after the requested subset.
    """

    # load configuration file
    # safe_load: plain-data configuration needs no arbitrary object
    # construction, and yaml.load() without a Loader fails on PyYAML >= 6
    with open(config_yml, 'r') as fp:
        config = yaml.safe_load(fp)

    # deduce workdir from path of configuration file
    workdir = os.path.dirname(config_yml)

    # this is where model weights are saved after each epoch
    log_dir = workdir + '/' + dataset

    # -- DATASET --
    db, task, protocol, subset = dataset.split('.')
    database = get_database(db, medium_template=medium_template)
    protocol = database.get_protocol(task, protocol)

    if not hasattr(protocol, subset):
        raise NotImplementedError(
            'protocol "{dataset}" has no subset "{subset}"'.format(
                dataset=dataset, subset=subset))

    file_generator = getattr(protocol, subset)()

    # -- FEATURE EXTRACTION --
    # input sequence duration
    duration = config['feature_extraction']['duration']
    # MFCCs
    feature_extractor = YaafeMFCC(**config['feature_extraction']['mfcc'])
    # normalization
    normalize = config['feature_extraction']['normalize']

    # -- NETWORK STRUCTURE --
    # internal model structure
    output_dim = config['network']['output_dim']
    lstm = config['network']['lstm']
    # 'pooling' is optional in the configuration and defaults to 'last'
    pooling = config['network'].get('pooling', 'last')
    dense = config['network']['dense']
    # bi-directional
    bidirectional = config['network']['bidirectional']
    space = config['network']['space']

    # -- TRAINING --
    # batch size
    batch_size = config['training']['batch_size']
    # number of epochs
    nb_epoch = config['training']['nb_epoch']
    # optimizer
    optimizer = config['training']['optimizer']

    # -- TRIPLET LOSS --
    margin = config['training']['triplet_loss']['margin']
    per_fold = config['training']['triplet_loss']['per_fold']
    per_label = config['training']['triplet_loss']['per_label']
    overlap = config['training']['triplet_loss']['overlap']

    # embedding
    get_embedding = TristouNet(lstm=lstm,
                               bidirectional=bidirectional,
                               pooling=pooling,
                               dense=dense,
                               output_dim=output_dim,
                               space=space)

    loss = TripletLoss(get_embedding, margin=margin)

    embedding = SequenceEmbedding(loss=loss,
                                  optimizer=optimizer,
                                  log_dir=log_dir)

    # triplet generator for training
    generator = TripletBatchGenerator(feature_extractor,
                                      file_generator,
                                      embedding,
                                      margin=margin,
                                      duration=duration,
                                      overlap=overlap,
                                      normalize=normalize,
                                      per_fold=per_fold,
                                      per_label=per_label,
                                      batch_size=batch_size)

    # log loss during training and keep track of best model
    log = [('train', 'loss')]
    callback = LoggingCallback(log_dir=log_dir,
                               log=log,
                               get_model=loss.get_embedding)

    # estimated number of triplets per epoch
    # (rounded down to a multiple of batch_size)
    samples_per_epoch = per_label * (per_label - 1) * generator.n_labels
    samples_per_epoch = samples_per_epoch - (samples_per_epoch % batch_size)

    # input shape (n_samples, n_features)
    input_shape = generator.get_shape()

    embedding.fit(input_shape,
                  generator,
                  samples_per_epoch,
                  nb_epoch,
                  callbacks=[callback])

コード例 #5

0

ファイルを表示

# ---- </edit> ---------------------------------------------------------------

# sequence duration (in seconds), taken from the first command-line argument
import sys
duration = float(sys.argv[1])

# append a per-duration sub-directory (one decimal place, e.g. '/2.0s');
# LOG_DIR is expected to be defined earlier in the script (not visible here)
LOG_DIR = LOG_DIR + '/{duration:.1f}s'.format(duration=duration)

import numpy as np
np.random.seed(1337)  # for reproducibility

# feature extraction
# NOTE(review): presumably 11 MFCC coefficients with energy and all
# derivatives disabled -- confirm against the YaafeMFCC documentation
from pyannote.audio.features.yaafe import YaafeMFCC
feature_extractor = YaafeMFCC(e=False,
                              De=False,
                              DDe=False,
                              coefs=11,
                              D=False,
                              DD=False)

# ETAPE database
# WAV_TEMPLATE is expected to be defined earlier in the script (not visible here)
medium_template = {'wav': WAV_TEMPLATE}
from pyannote.database import Etape
database = Etape(medium_template=medium_template)

# experimental protocol (ETAPE TV subset)
protocol = database.get_protocol('SpeakerDiarization', 'TV')

# segmentation built from the feature extractor above, with the
# command-line duration and a fixed step of 0.100
from pyannote.audio.segmentation import GaussianDivergenceSegmentation
segmentation = GaussianDivergenceSegmentation(feature_extractor,
                                              duration=duration,
                                              step=0.100)