Esempio n. 1
0
def load_labels(filelist, predictions, fps, datadir):
    """Load frame-wise boolean singing labels for every file in `filelist`.

    For each file name, the annotation file ``<datadir>/labels/<stem>.lab``
    is read, where ``<stem>`` is the file name without its last extension.
    Every non-blank line must consist of exactly three whitespace-separated
    fields ``start end label``; a segment is marked positive when its label
    is exactly ``'sing'``.

    Args:
        filelist: Iterable of file names.
        predictions: Mapping from file name to a sized object; its length
            determines how many frames are labelled for that file.
        fps: Frame rate used to convert frame indices into timestamps
            (``index / fps``).
        datadir: Dataset root directory containing the ``labels`` folder.

    Returns:
        A list with one entry per file: whatever ``create_aligned_targets``
        returns for that file's segments and timestamps.

    Raises:
        ValueError: If a non-blank annotation line does not have exactly
            three fields or its boundaries are not parseable as floats.
    """
    labels = []
    for fn in filelist:
        ffn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(ffn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(predictions[fn])) / float(fps)
        # Use the builtin `bool`: the `np.bool` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        labels.append(create_aligned_targets(segments, timestamps, bool))
    return labels
    def prepare_audio_batches(self,
                              sample_rate,
                              frame_len,
                              fps,
                              blocklen,
                              batchsize,
                              batch_data=True):
        """Load audio signals and their labels, optionally as random batches.

        Every entry of ``self.filelist`` is read from
        ``<self.datadir>/audio/<fn>`` with ``audio.read_ffmpeg`` (through
        ``cached``), and the matching annotation file
        ``<self.datadir>/labels/<stem>.lab`` is converted to boolean targets
        aligned to timestamps ``index / sample_rate``.

        Args:
            sample_rate: Passed to ``audio.read_ffmpeg`` and used as the
                time base for the label timestamps.
            frame_len: Forwarded to ``augment.grab_random_audio_excerpts``.
            fps: Forwarded to ``augment.grab_random_audio_excerpts``.
            blocklen: Forwarded to ``augment.grab_random_audio_excerpts``.
            batchsize: Forwarded to ``augment.grab_random_audio_excerpts``.
            batch_data: If true, return the excerpt generator; otherwise
                return the loaded signals and labels directly.

        Returns:
            The result of ``augment.grab_random_audio_excerpts`` when
            `batch_data` is true, else the tuple ``(spects, labels)``.
        """
        spects = []
        for fn in progress(self.filelist, 'File'):
            # A falsy `self.featuredir` yields a falsy cache name.
            # NOTE(review): prepare_batches uses the same
            # `<featuredir>/<fn>.npy` name for spectrograms -- confirm the
            # two are never pointed at the same cache directory.
            cache_fn = (self.featuredir
                        and os.path.join(self.featuredir, fn + '.npy'))
            spects.append(
                cached(cache_fn, audio.read_ffmpeg,
                       os.path.join(self.datadir, 'audio', fn), sample_rate))

        # - load and convert corresponding labels
        print("Loading labels...")
        labels = []
        for fn, spect in zip(self.filelist, spects):
            fn = os.path.join(self.datadir, 'labels',
                              fn.rsplit('.', 1)[0] + '.lab')
            with io.open(fn) as f:
                segments = [l.rstrip().split() for l in f if l.rstrip()]
            segments = [(float(start), float(end), label == 'sing')
                        for start, end, label in segments]
            # One timestamp per element of the loaded signal.
            timestamps = np.arange(len(spect)) / float(sample_rate)
            # Builtin `bool` instead of `np.bool`, which was removed in
            # NumPy 1.24.
            labels.append(create_aligned_targets(segments, timestamps, bool))

        if batch_data:
            return augment.grab_random_audio_excerpts(
                spects, labels, batchsize, sample_rate, frame_len, fps,
                blocklen)
        return spects, labels
    def prepare_batches(self,
                        sample_rate,
                        frame_len,
                        fps,
                        mel_bands,
                        mel_min,
                        mel_max,
                        blocklen,
                        batchsize,
                        batch_data=True):
        """Compute spectrograms and labels and build the training data feed.

        Spectrograms are obtained with ``audio.extract_spect`` (through
        ``cached``) for every entry of ``self.filelist``; frame-wise boolean
        labels are read from ``<self.datadir>/labels/<stem>.lab`` and aligned
        to timestamps ``index / fps``.  The return value depends on
        ``self.input_type``, ``self.augment`` and `batch_data`:

        * ``'stft'``: ``augment.grab_random_excerpts`` over the spectrograms,
          or ``(spects, labels)`` if `batch_data` is false.
        * ``'mel_spects'`` / ``'mel_spects_norm'`` without augmentation:
          random excerpts of the log-mel spectra (normalised with mean/std
          for ``'mel_spects_norm'``).  If `batch_data` is false, the tuple
          ``(mel_spects, labels)`` is returned instead, with every spectrum
          padded by ``blocklen // 2`` rows of ``log(1e-7)`` at both ends.
        * any other input type without augmentation: random excerpts of the
          unprocessed spectrograms.
        * with augmentation: the generator assembled by the nested
          ``create_datafeed``, possibly wrapped by
          ``augment.generate_in_background``.

        Args:
            sample_rate: Sample rate passed to the spectrogram extraction and
                the mel filterbank.
            frame_len: Frame length passed to the spectrogram extraction.
            fps: Frame rate of the spectrograms and of the label timestamps.
            mel_bands: Number of mel bands of the filterbank.
            mel_min: Lower frequency bound of the mel filterbank.
            mel_max: Upper frequency bound of the mel filterbank.
            blocklen: Excerpt length in frames.
            batchsize: Number of excerpts per mini-batch.
            batch_data: Whether to return a batch generator (true) or the
                full data (false) in the branches that support it.

        NOTE(review): the augmentation branch reads ``cfg``, which is not
        defined in this method (presumably a module-level configuration), and
        uses ``filterbank``, ``mean`` and ``istd``, which are only assigned
        for the two mel input types -- confirm it is never reached otherwise.
        """
        bin_nyquist = frame_len // 2 + 1
        bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate
        spects = []
        for fn in progress(self.filelist, 'File'):
            # A falsy `self.featuredir` yields a falsy cache name.
            cache_fn = (self.featuredir
                        and os.path.join(self.featuredir, fn + '.npy'))
            spects.append(
                cached(cache_fn, audio.extract_spect,
                       os.path.join(self.datadir, 'audio', fn), sample_rate,
                       frame_len, fps))

        # - load and convert corresponding labels
        print("Loading labels...")
        labels = []
        for fn, spect in zip(self.filelist, spects):
            fn = os.path.join(self.datadir, 'labels',
                              fn.rsplit('.', 1)[0] + '.lab')
            with io.open(fn) as f:
                segments = [l.rstrip().split() for l in f if l.rstrip()]
            segments = [(float(start), float(end), label == 'sing')
                        for start, end, label in segments]
            timestamps = np.arange(len(spect)) / float(fps)
            # Builtin `bool` instead of `np.bool`, which was removed in
            # NumPy 1.24.
            labels.append(create_aligned_targets(segments, timestamps, bool))

        if self.input_type == 'stft':
            print('Create dataset with stft output')
            if batch_data:
                return augment.grab_random_excerpts(spects, labels,
                                                    batchsize, blocklen)
            return spects, labels

        if self.input_type in ('mel_spects', 'mel_spects_norm'):
            # - prepare mel filterbank
            filterbank = audio.create_mel_filterbank(sample_rate, frame_len,
                                                     mel_bands, mel_min,
                                                     mel_max)
            filterbank = filterbank[:bin_mel_max].astype(floatX)

            # - precompute mel spectra, if needed, otherwise just define a
            #   generator
            mel_spects = (np.log(
                np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
                          for spect in spects)

            if not self.augment:
                mel_spects = list(mel_spects)
                del spects

            # - load mean/std or compute it, if not computed yet
            meanstd_file = os.path.join(os.path.dirname(__file__),
                                        '%s_meanstd.npz' % self.dataset)
            try:
                with np.load(meanstd_file) as f:
                    mean = f['mean']
                    std = f['std']
            except (IOError, KeyError):
                print("Computing mean and standard deviation...")
                mean, std = znorm.compute_mean_std(mel_spects)
                np.savez(meanstd_file, mean=mean, std=std)
            mean = mean.astype(floatX)
            istd = np.reciprocal(std).astype(floatX)

        # - prepare training data generator
        print("Preparing training data feed...")
        if not self.augment:
            # Without augmentation, we just precompute the (optionally
            # normalized) mel spectra and create a generator that returns
            # mini-batches of random excerpts
            if self.input_type == 'mel_spects':
                print('Creating batches of mel spects without znorm')
                if batch_data:
                    batches = augment.grab_random_excerpts(
                        mel_spects, labels, batchsize, blocklen)
                else:
                    # The padding must be as wide as the mel spectra, i.e.
                    # `mel_bands` columns (previously hard-coded to 80).
                    pad = np.tile((np.log(1e-7).astype(floatX)),
                                  (blocklen // 2, mel_bands))
                    mel_spects = [np.concatenate((pad, spect, pad), axis=0)
                                  for spect in mel_spects]
                    return mel_spects, labels
            elif self.input_type == 'mel_spects_norm':
                print('Creating batches of mel spects with znorm')
                if batch_data:
                    mel_spects = [(spect - mean) * istd
                                  for spect in mel_spects]
                    batches = augment.grab_random_excerpts(
                        mel_spects, labels, batchsize, blocklen)
                else:
                    # Pad in the log-mel domain first, then normalize, so
                    # the padding is normalized along with the data.
                    pad = np.tile((np.log(1e-7).astype(floatX)),
                                  (blocklen // 2, mel_bands))
                    mel_spects = (np.concatenate((pad, spect, pad), axis=0)
                                  for spect in mel_spects)
                    mel_spects = [(spect - mean) * istd
                                  for spect in mel_spects]
                    return mel_spects, labels
            else:
                print('Creating batches of stfts')
                batches = augment.grab_random_excerpts(spects, labels,
                                                       batchsize, blocklen)
        else:
            # For time stretching and pitch shifting, it pays off to preapply the
            # spline filter to each input spectrogram, so it does not need to be
            # applied to each mini-batch later.
            spline_order = cfg['spline_order']
            if spline_order > 1:
                from scipy.ndimage import spline_filter
                spects = [
                    spline_filter(spect, spline_order).astype(floatX)
                    for spect in spects
                ]

            # We define a function to create the mini-batch generator. This allows
            # us to easily create multiple generators for multithreading if needed.
            def create_datafeed(spects, labels):
                # With augmentation, as we want to apply random time-stretching,
                # we request longer excerpts than we finally need to return.
                max_stretch = cfg['max_stretch']
                batches = augment.grab_random_excerpts(
                    spects,
                    labels,
                    batchsize=batchsize,
                    frames=int(blocklen / (1 - max_stretch)))

                # We wrap the generator in another one that applies random time
                # stretching and pitch shifting, keeping a given number of frames
                # and bins only.
                max_shift = cfg['max_shift']
                batches = augment.apply_random_stretch_shift(
                    batches,
                    max_stretch,
                    max_shift,
                    keep_frames=blocklen,
                    keep_bins=bin_mel_max,
                    order=spline_order,
                    prefiltered=True)

                # We transform the excerpts to mel frequency and log magnitude.
                batches = augment.apply_filterbank(batches, filterbank)
                batches = augment.apply_logarithm(batches)

                # We apply random frequency filters
                max_db = cfg['max_db']
                batches = augment.apply_random_filters(batches,
                                                       filterbank,
                                                       mel_max,
                                                       max_db=max_db)

                # We apply normalization
                batches = augment.apply_znorm(batches, mean, istd)

                return batches

            # We start the mini-batch generator and augmenter in one or more
            # background threads or processes (unless disabled).
            bg_threads = cfg['bg_threads']
            bg_processes = cfg['bg_processes']
            if not bg_threads and not bg_processes:
                # no background processing: just create a single generator
                batches = create_datafeed(spects, labels)
            elif bg_threads:
                # multithreading: create a separate generator per thread
                batches = augment.generate_in_background(
                    [
                        create_datafeed(spects, labels)
                        for _ in range(bg_threads)
                    ],
                    num_cached=bg_threads * 5)
            elif bg_processes:
                # multiprocessing: single generator is forked along with processes
                batches = augment.generate_in_background(
                    [create_datafeed(spects, labels)] * bg_processes,
                    num_cached=bg_processes * 25,
                    in_processes=True)

        return batches
Esempio n. 4
0
def main():
    """Train the network on the training split and save its parameters.

    Steps, in order: parse the command line, compute (or load cached)
    spectrograms for the files listed in ``filelists/train``, read the
    frame-wise ``.lab`` annotations, derive log-mel spectra, load or compute
    the normalization statistics stored in ``<dataset>_meanstd.npz`` next to
    this script, set up the (optionally augmented) mini-batch generator,
    compile the Theano training function and run a fixed number of epochs.
    The final parameter values are written to ``options.modelfile`` with
    ``np.savez``.

    The process exits with status 1 if the accumulated training loss becomes
    non-finite.
    """
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    sample_rate = 22050
    frame_len = 1024
    fps = 70
    mel_bands = 80
    mel_min = 27.5
    mel_max = 8000
    blocklen = 115
    batchsize = 32

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__),
                           os.path.pardir, 'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        # A falsy `options.cache_spectra` yields a falsy cache name.
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn,
                             audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        # Builtin `bool` instead of `np.bool`, which was removed in
        # NumPy 1.24.
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                             mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max], filterbank),
                                    1e-7))
                  for spect in spects)
    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(
            mel_spects, labels, batchsize, blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = 2
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [spline_filter(spect, spline_order).astype(floatX)
                      for spect in spects]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = .3
            batches = augment.grab_random_excerpts(
                    spects, labels, batchsize=batchsize,
                    frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = .3
            batches = augment.apply_random_stretch_shift(
                    batches, max_stretch, max_shift,
                    keep_frames=blocklen, keep_bins=bin_mel_max,
                    order=spline_order, prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            batches = augment.apply_random_filters(batches, filterbank,
                                                   mel_max, max_db=10)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)

            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = 3
        bg_processes = 0
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                    [create_datafeed(spects, labels)
                     for _ in range(bg_threads)],
                    num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                    [create_datafeed(spects, labels)] * bg_processes,
                    num_cached=bg_processes * 25,
                    in_processes=True)

    print("Preparing training function...")
    # instantiate neural network
    input_var = T.tensor3('input')
    inputs = input_var.dimshuffle(0, 'x', 1, 2)  # insert "channels" dimension
    network = model.architecture(inputs, (None, 1, blocklen, mel_bands))

    # create cost expression
    target_var = T.vector('targets')
    targets = (0.02 + 0.96 * target_var)  # map 0 -> 0.02, 1 -> 0.98
    targets = targets.dimshuffle(0, 'x')  # turn into column vector
    outputs = lasagne.layers.get_output(network, deterministic=False)
    cost = T.mean(lasagne.objectives.binary_crossentropy(outputs, targets))

    # prepare and compile training function
    params = lasagne.layers.get_all_params(network, trainable=True)
    initial_eta = 0.01
    eta_decay = 0.85
    momentum = 0.95
    eta = theano.shared(lasagne.utils.floatX(initial_eta))
    updates = lasagne.updates.nesterov_momentum(cost, params, eta, momentum)
    print("Compiling training function...")
    train_fn = theano.function([input_var, target_var], cost, updates=updates)

    # run training loop
    print("Training:")
    epochs = 20
    epochsize = 2000
    batches = iter(batches)
    for epoch in range(epochs):
        err = 0
        # The loop variable only counts batches; the data comes from
        # `batches`.
        for _ in progress(
                range(epochsize), min_delay=.5,
                desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            err += train_fn(*next(batches))
            if not np.isfinite(err):
                print("\nEncountered NaN loss in training. Aborting.")
                sys.exit(1)
        print("Train loss: %.3f" % (err / epochsize))
        # Decay the learning rate once per epoch.
        eta.set_value(eta.get_value() * lasagne.utils.floatX(eta_decay))

    # save final network
    print("Saving final model")
    np.savez(modelfile, **{'param%d' % i: p for i, p in enumerate(
            lasagne.layers.get_all_param_values(network))})
Esempio n. 5
0
def main():
    """Compute labels and several summarized spectrogram variants per file.

    Steps, in order: parse the command line and configuration, collect the
    ``train``/``valid``/``test`` file lists (remembering which slice of the
    combined list belongs to which part), compute or load the spectrograms,
    read the frame-wise ``.lab`` annotations, and write the following files
    to ``options.outdir``:

    * ``<dataset>_gt.pkl``: pickled dict with the ``'labels'`` and the
      ``'splits'`` slices.
    * ``<dataset>_spect_sum.pkl``: ``save_spectral_sums`` of the spectra.
    * ``<dataset>_spect_mel_sum.pkl``: same for the mel spectra.
    * ``<dataset>_spect_mel_log_sum.pkl``: same for the log-mel spectra.
    * ``<dataset>_spect_mel_log_std_sum.pkl``: same for the log-mel spectra
      standardized with mean/std computed on the training slice only.
    """
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    outdir = options.outdir
    if options.load_spectra != 'memory' and not options.cache_spectra:
        parser.error('option --load-spectra=%s requires --cache-spectra' %
                     options.load_spectra)

    # read configuration files and immediate settings
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    # read some settings into local variables
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    filelist = []
    ranges = {}
    for part in 'train', 'valid', 'test':
        # Remember where this part starts in the combined file list.
        a = len(filelist)
        with io.open(
                os.path.join(datadir, 'filelists',
                             cfg.get('filelist.%s' % part, part))) as f:
            filelist.extend(l.rstrip() for l in f if l.rstrip())
        ranges[part] = slice(a, len(filelist))

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        # A falsy `options.cache_spectra` yields a falsy cache name.
        cache_fn = (options.cache_spectra
                    and os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(
            cached(cache_fn,
                   audio.extract_spect,
                   os.path.join(datadir, 'audio', fn),
                   sample_rate,
                   frame_len,
                   fps,
                   loading_mode=options.load_spectra))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        # Builtin `bool` instead of `np.bool`, which was removed in
        # NumPy 1.24.
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # compute and save different variants of summarized magnitudes
    print("Saving files...")

    # - ground truth
    outfile = os.path.join(outdir, '%s_gt.pkl' % options.dataset)
    print(outfile)
    with io.open(outfile, 'wb') as f:
        # protocol=-1 selects the highest pickle protocol available.
        pickle.dump({'labels': labels, 'splits': ranges}, f, protocol=-1)

    # - summarized spectra
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_sum.pkl' % options.dataset), spects)

    # - summarized mel spectra
    bank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                       mel_min, mel_max).astype(np.float32)
    # NOTE(review): `spect[:, ]` indexes the full first axis; presumably kept
    # so non-in-memory spectra are materialized before the dot product --
    # confirm against `cached`'s loading modes.
    spects = [np.dot(spect[:, ], bank) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_sum.pkl' % options.dataset), spects)

    # - summarized log-mel spectra
    spects = [np.log(np.maximum(1e-7, spect)) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_log_sum.pkl' % options.dataset),
        spects)

    # - summarized standardized log-mel spectra
    # Statistics come from the training slice only, then apply to all files.
    m, s = znorm.compute_mean_std(spects[ranges['train']], axis=0)
    spects = [((spect - m) / s).astype(np.float32) for spect in spects]
    save_spectral_sums(
        os.path.join(outdir, '%s_spect_mel_log_std_sum.pkl' % options.dataset),
        spects)
Esempio n. 6
0
def main():
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    if options.load_spectra != 'memory' and not options.cache_spectra:
        parser.error('option --load-spectra=%s requires --cache-spectra' %
                     options.load_spectra)

    # read configuration files and immediate settings
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))

    # read some settings into local variables
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    if cfg['filterbank'] == 'mel_learn':
        bin_mel_max = bin_nyquist
    else:
        bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(
            os.path.join(datadir, 'filelists',
                         cfg.get('filelist.train', 'train'))) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    if options.validate:
        with io.open(
                os.path.join(datadir, 'filelists',
                             cfg.get('filelist.valid', 'valid'))) as f:
            filelist_val = [l.rstrip() for l in f if l.rstrip()]
        filelist.extend(filelist_val)
    else:
        filelist_val = []

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra
                    and os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(
            cached(cache_fn,
                   audio.extract_spect,
                   os.path.join(datadir, 'audio', fn),
                   sample_rate,
                   frame_len,
                   fps,
                   loading_mode=options.load_spectra))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, np.bool))

    # - split off validation data, if needed
    if options.validate:
        spects_val = spects[-len(filelist_val):]
        spects = spects[:-len(filelist_val)]
        labels_val = labels[-len(filelist_val):]
        labels = labels[:-len(filelist_val)]

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just create a generator that returns
        # mini-batches of random excerpts
        batches = augment.grab_random_excerpts(spects, labels, batchsize,
                                               blocklen, bin_mel_max)
        batches = augment.generate_in_background([batches], num_cached=15)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = cfg['spline_order']
        if spline_order > 1 and options.load_spectra == 'memory':
            from scipy.ndimage import spline_filter
            spects = [
                spline_filter(spect, spline_order).astype(floatX)
                for spect in spects
            ]
            prefiltered = True
        else:
            prefiltered = False

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = cfg['max_stretch']
            batches = augment.grab_random_excerpts(
                spects,
                labels,
                batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = cfg['max_shift']
            batches = augment.apply_random_stretch_shift(
                batches,
                max_stretch,
                max_shift,
                keep_frames=blocklen,
                keep_bins=bin_mel_max,
                order=spline_order,
                prefiltered=prefiltered)

            # We apply random frequency filters
            max_db = cfg['max_db']
            batches = augment.apply_random_filters(batches, mel_max, max_db)

            # We apply random loudness changes
            max_loudness = cfg['max_loudness']
            if max_loudness:
                batches = augment.apply_random_loudness(batches, max_loudness)

            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = cfg['bg_threads']
        bg_processes = cfg['bg_processes']
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels) for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25,
                in_processes=True)

    print("Preparing training function...")
    # instantiate neural network
    input_var = T.tensor3('input')
    inputs = input_var.dimshuffle(0, 'x', 1, 2)  # insert "channels" dimension
    network = model.architecture(inputs, (None, 1, blocklen, bin_mel_max), cfg)
    print(
        "- %d layers (%d with weights), %f mio params" %
        (len(lasagne.layers.get_all_layers(network)),
         sum(hasattr(l, 'W') for l in lasagne.layers.get_all_layers(network)),
         lasagne.layers.count_params(network, trainable=True) / 1e6))
    print("- weight shapes: %r" % [
        l.W.get_value().shape for l in lasagne.layers.get_all_layers(network)
        if hasattr(l, 'W') and hasattr(l.W, 'get_value')
    ])

    # create cost expression
    target_var = T.vector('targets')
    targets = (0.02 + 0.96 * target_var)  # map 0 -> 0.02, 1 -> 0.98
    targets = targets.dimshuffle(0, 'x')  # turn into column vector
    outputs = lasagne.layers.get_output(network, deterministic=False)
    cost = T.mean(lasagne.objectives.binary_crossentropy(outputs, targets))
    if cfg.get('l2_decay', 0):
        cost_l2 = lasagne.regularization.regularize_network_params(
            network, lasagne.regularization.l2) * cfg['l2_decay']
    else:
        cost_l2 = 0

    # prepare and compile training function
    params = lasagne.layers.get_all_params(network, trainable=True)
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    patience = cfg.get('patience', 0)
    trials_of_patience = cfg.get('trials_of_patience', 1)
    patience_criterion = cfg.get(
        'patience_criterion',
        'valid_loss' if options.validate else 'train_loss')
    momentum = cfg['momentum']
    first_params = params[:cfg['first_params']]
    first_params_eta_scale = cfg['first_params_eta_scale']
    if cfg['learn_scheme'] == 'nesterov':
        learn_scheme = lasagne.updates.nesterov_momentum
    elif cfg['learn_scheme'] == 'momentum':
        learn_scheme = lasagne.update.momentum
    elif cfg['learn_scheme'] == 'adam':
        learn_scheme = lasagne.updates.adam
    else:
        raise ValueError('Unknown learn_scheme=%s' % cfg['learn_scheme'])
    eta = theano.shared(lasagne.utils.floatX(initial_eta))
    if not first_params or first_params_eta_scale == 1:
        updates = learn_scheme(cost + cost_l2, params, eta, momentum)
    else:
        grads = theano.grad(cost + cost_l2, params)
        updates = learn_scheme(grads[len(first_params):],
                               params[len(first_params):], eta, momentum)
        if first_params_eta_scale > 0:
            updates.update(
                learn_scheme(grads[:len(first_params)], first_params,
                             eta * first_params_eta_scale, momentum))
    print("Compiling training function...")
    train_fn = theano.function([input_var, target_var], cost, updates=updates)

    # prepare and compile validation function, if requested
    if options.validate:
        print("Compiling validation function...")
        import model_to_fcn
        network_test = model_to_fcn.model_to_fcn(network, allow_unlink=False)
        outputs_test = lasagne.layers.get_output(network_test,
                                                 deterministic=True)
        cost_test = T.mean(
            lasagne.objectives.binary_crossentropy(outputs_test, targets))
        val_fn = theano.function([input_var, target_var],
                                 [cost_test, outputs_test])

    # run training loop
    print("Training:")
    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)
    if options.save_errors:
        errors = []
    if first_params and cfg['first_params_log']:
        first_params_hist = []
    if patience > 0:
        best_error = np.inf
        best_state = get_state(network, updates)
    for epoch in range(epochs):
        # actual training
        err = 0
        for batch in progress(range(epochsize),
                              min_delay=.5,
                              desc='Epoch %d/%d: Batch ' %
                              (epoch + 1, epochs)):
            err += train_fn(*next(batches))
            if not np.isfinite(err):
                print("\nEncountered NaN loss in training. Aborting.")
                sys.exit(1)
            if first_params and cfg['first_params_log'] and (
                    batch % cfg['first_params_log'] == 0):
                first_params_hist.append(
                    tuple(param.get_value() for param in first_params))
                np.savez(
                    modelfile[:-4] + '.hist.npz', **{
                        'param%d' % i: param
                        for i, param in enumerate(zip(*first_params_hist))
                    })

        # report training loss
        print("Train loss: %.3f" % (err / epochsize))
        if options.save_errors:
            errors.append(err / epochsize)

        # compute and report validation loss, if requested
        if options.validate:
            val_err = 0
            preds = []
            max_len = int(fps * cfg.get('val.max_len', 30))
            for spect, label in zip(spects_val, labels_val):
                # pick excerpt of val.max_len seconds in center of file
                excerpt = slice(max(0, (len(spect) - max_len) // 2),
                                (len(spect) + max_len) // 2)
                # crop to maximum length and required spectral bins
                spect = spect[None, excerpt, :bin_mel_max]
                # crop to maximum length and remove edges lost in the network
                label = label[excerpt][blocklen // 2:-(blocklen // 2)]
                e, pred = val_fn(spect, label)
                val_err += e
                preds.append((pred[:, 0], label))
            print("Validation loss: %.3f" % (val_err / len(filelist_val)))
            from eval import evaluate
            _, results = evaluate(*zip(*preds))
            print("Validation error: %.3f" % (1 - results['accuracy']))
            if options.save_errors:
                errors.append(val_err / len(filelist_val))
                errors.append(1 - results['accuracy'])

        # update learning rate and/or apply early stopping, if needed
        if patience > 0:
            if patience_criterion == 'train_loss':
                cur_error = err / epochsize
            elif patience_criterion == 'valid_loss':
                cur_error = val_err / len(filelist_val)
            elif patience_criterion == 'valid_error':
                cur_error = 1 - results['accuracy']
            if cur_error <= best_error:
                best_error = cur_error
                best_state = get_state(network, updates)
                patience = cfg['patience']
            else:
                patience -= 1
                if patience == 0:
                    if eta_decay_every == 'trial_of_patience' and eta_decay != 1:
                        eta.set_value(eta.get_value() *
                                      lasagne.utils.floatX(eta_decay))
                    restore_state(network, updates, best_state)
                    patience = cfg['patience']
                    trials_of_patience -= 1
                    print("Lost patience (%d remaining trials)." %
                          trials_of_patience)
                    if trials_of_patience == 0:
                        break
        if eta_decay_every != 'trial_of_patience' and eta_decay != 1 and \
                (epoch + 1) % eta_decay_every == 0:
            eta.set_value(eta.get_value() * lasagne.utils.floatX(eta_decay))

    # save final network
    print("Saving final model")
    np.savez(
        modelfile, **{
            'param%d' % i: p
            for i, p in enumerate(lasagne.layers.get_all_param_values(network))
        })
    with io.open(modelfile + '.vars', 'wb') as f:
        f.writelines('%s=%s\n' % kv for kv in cfg.items())
    if options.save_errors:
        np.savez(modelfile[:-len('.npz')] + '.err.npz',
                 np.asarray(errors).reshape(epoch + 1, -1))
Esempio n. 7
0
def main():
    """Train the PyTorch CNN on random spectrogram excerpts.

    Command line options come from ``opts_parser()``; the configuration is
    read from the files in ``options.vars`` and then overridden by the
    assignments in ``options.var``.

    The function
      * computes (or loads cached) spectrograms and frame-wise boolean
        targets (``'sing'`` segments vs. everything else) for the files
        listed in ``<datadir>/filelists/train`` (and ``.../valid`` when
        ``options.validate`` is set),
      * loads or computes the mel-spectrum mean/std used for normalization,
      * builds the mini-batch generator (optionally with augmentation in
        background threads/processes),
      * trains ``model.CNNModel`` with Nesterov-momentum SGD and a step-wise
        learning rate decay, logging to TensorBoard under
        ``<modelfile>/runs``,
      * saves the weights to ``<modelfile>/model.pth``: with validation,
        whenever both validation loss and validation error improve;
        without validation, once after the last epoch.

    Raises:
        ValueError: if validation is requested but the validation file list
            is empty, or if no validation file is at least ``blocklen``
            frames long.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))

    cfg.update(config.parse_variable_assignments(options.var))

    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    # Remember where the training files end, so the later train/validation
    # split does not rely on negative slicing (which breaks for length 0).
    n_train = len(filelist)
    if options.validate:
        with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
            filelist_val = [l.strip() for l in f if l.strip()]
        if not filelist_val:
            raise ValueError("Validation requested, but the validation "
                             "file list is empty.")
        filelist.extend(filelist_val)
    else:
        filelist_val = []

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra
                    and os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(
            cached(cache_fn, audio.extract_spect,
                   os.path.join(datadir, 'audio', fn), sample_rate, frame_len,
                   fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        # builtin bool instead of the np.bool alias (removed in NumPy 1.24)
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                             mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - split off the validation files (empty lists when not validating)
    spects_val = spects[n_train:]
    labels_val = labels[n_train:]
    spects = spects[:n_train]
    labels = labels[:n_train]

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(
        np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
                  for spect in spects)

    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(mel_spects, labels, batchsize,
                                               blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = cfg['spline_order']
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [
                spline_filter(spect, spline_order).astype(floatX)
                for spect in spects
            ]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = cfg['max_stretch']
            batches = augment.grab_random_excerpts(
                spects,
                labels,
                batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = cfg['max_shift']
            batches = augment.apply_random_stretch_shift(batches,
                                                         max_stretch,
                                                         max_shift,
                                                         keep_frames=blocklen,
                                                         keep_bins=bin_mel_max,
                                                         order=spline_order,
                                                         prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            max_db = cfg['max_db']
            batches = augment.apply_random_filters(batches,
                                                   filterbank,
                                                   mel_max,
                                                   max_db=max_db)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)

            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = cfg['bg_threads']
        bg_processes = cfg['bg_processes']
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels) for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25,
                in_processes=True)

    ###########################################################################
    #-----------Main changes to code to make it work with pytorch-------------#
    ###########################################################################

    print("preparing training function...")
    mdl = model.CNNModel()
    mdl = mdl.to(device)

    # set up learning rate and learning rate parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    # set up loss
    criterion = torch.nn.BCELoss()

    # set up optimizer and step-wise learning rate decay
    optimizer = torch.optim.SGD(mdl.parameters(),
                                lr=eta,
                                momentum=momentum,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    # set up TensorBoard logging (this also creates the `modelfile` directory
    # that the checkpoints are written to)
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    # conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.

    if options.validate:
        from eval import evaluate
        # The normalized validation mel spectra do not change between epochs,
        # so they are computed once up front instead of once per epoch.
        mel_spects_val = [
            (np.log(
                np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
             - mean) * istd for spect in spects_val
        ]

    for epoch in range(epochs):
        loss_accum = 0
        mdl.train(True)

        # - Start the training for this epoch
        for _ in progress(range(epochsize),
                          min_delay=0.5,
                          desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            data = next(batches)
            # insert the "channels" dimension: (batch, 1, frames, bins)
            input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                      (0, 3, 1, 2))
            targets = data[1][:, np.newaxis].astype(np.float32)

            # map labels to make them softer: 0 -> 0.02, 1 -> 0.98
            targets = (0.02 + 0.96 * targets)
            optimizer.zero_grad()

            outputs = mdl(torch.from_numpy(input_data).to(device))
            loss = criterion(outputs, torch.from_numpy(targets).to(device))
            loss.backward()
            optimizer.step()
            loss_accum += loss.item()

        # - Compute the L-2 norm of the gradients of the last mini-batch.
        #   This is done after training so the value logged for this epoch
        #   stems from this epoch (and is not 0 for the first one).
        total_norm = 0.
        for p in mdl.parameters():
            if p.grad is not None:
                total_norm += p.grad.data.norm(2).item()**2
        total_norm = total_norm**0.5

        # - Compute validation loss and error if desired
        if options.validate:
            mdl.train(False)
            val_loss = 0
            num_iter = 0
            pred_chunks = []
            label_chunks = []
            offset = blocklen // 2  # targets refer to the excerpt centers

            # no gradients are needed for evaluation
            with torch.no_grad():
                for spect, label in zip(mel_spects_val, labels_val):
                    num_excerpts = len(spect) - blocklen + 1
                    if num_excerpts <= 0:
                        # file is shorter than a single excerpt
                        continue
                    # overlapping excerpts (hop size 1 frame) as a view
                    excerpts = np.lib.stride_tricks.as_strided(
                        spect,
                        shape=(num_excerpts, blocklen, spect.shape[1]),
                        strides=(spect.strides[0], spect.strides[0],
                                 spect.strides[1]))

                    # - Pass mini-batches through the network
                    for pos in range(0, num_excerpts, batchsize):
                        stop = min(pos + batchsize, num_excerpts)
                        input_data = np.transpose(
                            excerpts[pos:stop, :, :, np.newaxis],
                            (0, 3, 1, 2))
                        label_batch = label[offset + pos:offset + stop,
                                            np.newaxis].astype(np.float32)

                        pred = mdl(torch.from_numpy(input_data).to(device))
                        e = criterion(
                            pred, torch.from_numpy(label_batch).to(device))
                        pred_chunks.append(pred[:, 0].detach().cpu().numpy())
                        label_chunks.append(label_batch[:, 0])
                        val_loss += e.item()
                        num_iter += 1

            if not num_iter:
                raise ValueError("No validation file is at least blocklen=%d "
                                 "frames long." % blocklen)

            # flat float64 vectors of all predictions and targets
            preds = np.concatenate(pred_chunks).astype(np.float64)
            labs = np.concatenate(label_chunks).astype(np.float64)

            val_loss_avg = val_loss / num_iter
            print("Validation loss: %.3f" % val_loss_avg)
            _, results = evaluate(preds, labs)
            val_error = 1 - results['accuracy']
            print("Validation error: %.3f" % val_error)

            # keep the model only if both criteria improved
            if val_loss_avg < best_val_loss and val_error < best_val_error:
                torch.save(mdl.state_dict(),
                           os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss_avg
                best_val_error = val_error
                print('New saved model', best_val_loss, best_val_error)

        # Update the learning rate
        scheduler.step()

        train_loss = loss_accum / epochsize
        print('Training Loss per epoch', train_loss)

        # - Save parameters for examining
        writer.add_scalar('Training Loss', train_loss, epoch)
        writer.add_scalar('Gradient norm', total_norm, epoch)
        if options.validate:
            # validation statistics only exist when validation was run
            writer.add_scalar('Validation loss', val_loss_avg, epoch)
            writer.add_scalar('Validation error', val_error, epoch)
        for param_group in optimizer.param_groups:
            print(param_group['lr'])

    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))

    # flush pending TensorBoard events to disk
    writer.close()