# (Module-level imports are not shown in this scrape; these functions rely
#  on numpy as np, cupy as cp, chainer, chainer.optimizers,
#  librosa.util.find_files, and the project's networks / utils / const
#  modules.)
def EstimateChord(idx, dnnmodel, todir=False):
    #dnn = networks.FeatureDNN()
    #dnn = networks.ConvnetFeatExtractor()
    dnn = networks.FullCNNFeatExtractor()
    #dnn = networks.NoOperation()
    dnn.load(dnnmodel)
    dnn.to_gpu(0)
    decoder = networks.NBLSTMCRF()
    decoder.load()
    decoder.to_gpu(0)
    cqtfilelist = np.array(find_files(const.PATH_HCQT, ext="npy"))[idx]
    chainer.config.train = False
    chainer.config.enable_backprop = False
    for i, cqtfile in enumerate(cqtfilelist):
        cqt = utils.Embed(utils.PreprocessSpec(np.load(cqtfile)[:, :, :]), 1)
        chroma = dnn.GetFeature(cp.asarray(cqt)).data
        path = decoder.argmax(chroma)
        feat = cp.asnumpy(chroma)
        if todir:
            fname = cqtfile.split("/")[-1] + ".lab"
            alb = cqtfile.split("/")[-2]
            utils.SaveEstimatedLabelsFramewise(
                path, const.PATH_ESTIMATE_CROSS + alb + "/" + fname, feat)
        else:
            utils.SaveEstimatedLabelsFramewise(
                path, const.PATH_ESTIMATE + "%03d.lab" % i, feat)
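
# A minimal usage sketch for EstimateChord; the fold indices and model
# filename below are hypothetical, not from the original repo.
def _estimate_demo():
    test_idx = np.arange(100, 125)  # hypothetical held-out evaluation fold
    EstimateChord(test_idx, "convnet.model", todir=False)
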
def TrainConvnetExtractor(trainidx, epoch=20, saveas="convnet.model"):
    cqtfilelist = np.array(find_files(const.PATH_MIDIHCQT,
                                      ext="npz"))[trainidx]
    #midifilelist = find_files(const.PATH_MIDI,ext="mid")[:filecnt]
    chainer.config.train = True
    chainer.config.enable_backprop = True
    convnet = networks.FullCNNFeatExtractor()
    model = networks.ConvnetPredictor(convnet)
    model.to_gpu(0)
    opt = optimizers.AdaDelta()
    opt.setup(model)
    print("train set length: %d" % trainidx.size)
    print("start epochs...")
    S = []
    T = []

    for cqtfile in cqtfilelist:
        dat = np.load(cqtfile)
        spec = utils.PreprocessSpec(dat["spec"])[:const.CQT_H, :, :]
        targ = GetConvnetTargetFromPianoroll(dat["target"]).astype(np.int32)
        assert (spec.shape[1] == targ.shape[0])
        S.append(spec)
        T.append(targ)
    S = np.concatenate(S, axis=1)
    T = np.concatenate(T, axis=0)

    for ep in range(epoch):
        sum_loss = 0

        assert (S.shape[1] == T.shape[0])
        randidx = np.random.randint(0,
                                    S.shape[1] - const.CONV_TRAIN_SEQLEN - 1,
                                    S.shape[1] // const.CONV_TRAIN_SEQLEN * 4)
        for i in range(0, randidx.size - const.CONV_TRAIN_BATCH,
                       const.CONV_TRAIN_BATCH):
            x_batch = np.stack([
                S[:, randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :]
                for j in range(i, i + const.CONV_TRAIN_BATCH)
            ])
            t_batch = np.stack([
                T[randidx[j]:randidx[j] + const.CONV_TRAIN_SEQLEN, :]
                for j in range(i, i + const.CONV_TRAIN_BATCH)
            ])
            x_in = cp.asarray(x_batch)
            t_in = cp.asarray(t_batch)
            model.cleargrads()
            loss = model(x_in, t_in)
            loss.backward()
            opt.update()
            sum_loss += loss.data

        convnet.save(saveas)
        print("epoch: %d/%d  loss:%.04f" %
              (ep + 1, epoch, sum_loss / const.CONV_TRAIN_BATCH))

    convnet.save(saveas)
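
# Usage sketch for TrainConvnetExtractor: the extractor trains on the
# synthesized MIDI-HCQT corpus, so the index array refers to the npz files
# under const.PATH_MIDIHCQT. The 80/20 split here is a hypothetical choice.
def _train_extractor_demo():
    n = len(find_files(const.PATH_MIDIHCQT, ext="npz"))
    train_idx = np.random.permutation(n)[:int(n * 0.8)]
    TrainConvnetExtractor(train_idx, epoch=20, saveas="convnet.model")
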
Example #3
# (The scrape drops this script's header; the imports and the first
#  argument below are reconstructed, and the -f default is a placeholder.)
import os
import argparse
import numpy as np
import networks as N
import utils as U
import const as C
from librosa.core import load, cqt, note_to_hz

parser = argparse.ArgumentParser()
parser.add_argument("-f", type=str, default="fullcnn.model", action="store")
parser.add_argument("-d", type=str, default="nblstm_crf.model", action="store")
args = parser.parse_args()

audio_list = find_files("Datas/audios_estimation")

# load the trained CNN extractor and NBLSTM-CRF decoder once, outside the loop
cnn_feat_extractor = N.FullCNNFeatExtractor()
cnn_feat_extractor.load(args.f)
decoder = N.NBLSTMCRF()
decoder.load(args.d)

for audiofile in audio_list:
    fname = audiofile.split("/")[-1]
    print("Processing: %s" % fname)
    # load audio
    y, sr = load(audiofile, sr=C.SR)

    # extract Harmonic-CQT from audio
    fmin = note_to_hz("C1")
    hcqt = np.stack([
        np.abs(
            cqt(y,
                sr=C.SR,
                hop_length=C.H,
                n_bins=C.BIN_CNT,
                bins_per_octave=C.OCT_BIN,
                fmin=fmin * (h + 1),
                filter_scale=2,
                tuning=None)).T.astype(np.float32) for h in range(C.CQT_H)
    ])

    # extract features with the trained CNN extractor
    feat = cnn_feat_extractor.GetFeature(U.PreprocessSpec(hcqt)).data

    # decode the label sequence
    labels = decoder.argmax(feat)

    # write the result as a .lab file
    labfile = os.path.join("Datas/labs_estimated", fname + ".lab")
    U.SaveEstimatedLabelsFramewise(labels, labfile, feat)
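
# Shape sketch: np.stack above stacks C.CQT_H harmonically shifted CQTs on
# axis 0, so each hcqt has shape (C.CQT_H, n_frames, C.BIN_CNT). A tiny
# self-contained check on one second of random audio (illustration only):
_y = np.random.randn(C.SR).astype(np.float32)
_demo = np.stack([np.abs(
    cqt(_y, sr=C.SR, hop_length=C.H, n_bins=C.BIN_CNT,
        bins_per_octave=C.OCT_BIN, fmin=note_to_hz("C1") * (h + 1),
        filter_scale=2, tuning=None)).T.astype(np.float32)
    for h in range(C.CQT_H)])
assert _demo.shape[0] == C.CQT_H and _demo.shape[2] == C.BIN_CNT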
def TrainNStepCRF(idx,
                  epoch=20,
                  augment=0,
                  featmodel=const.DEFAULT_CONVNETFILE,
                  path_blstm="blstm.model",
                  savefile="nblstm_crf.model"):
    cqtfilelist = np.array(find_files(const.PATH_HCQT, ext="npy"))
    chordlablist = np.array(
        find_files(const.PATH_CHORDLAB, ext=["lab", "chords"]))
    if idx is not None:
        cqtfilelist = cqtfilelist[idx]
        chordlablist = chordlablist[idx]
    # run the CNN feature extraction below in inference mode
    chainer.config.train = False
    chainer.config.enable_backprop = False
    #dnn = networks.TripleDNNExtractor()
    #dnn = networks.FeatureDNN()
    dnn = networks.FullCNNFeatExtractor()
    #dnn = networks.NoOperation()
    #dnn = networks.ConvnetFeatExtractor()
    dnn.load(featmodel)
    dnn.to_gpu(0)

    rnn = networks.NBLSTMCRF()
    rnn.blstm.load(path_blstm)
    rnn.to_gpu(0)
    opt = optimizers.MomentumSGD()
    opt.setup(rnn)
    #opt.add_hook(optimizer.WeightDecay(0.001))
    X = []
    T = []
    for cqtfile, labfile in zip(cqtfilelist, chordlablist):
        cqt = utils.Embed(utils.PreprocessSpec(np.load(cqtfile)[:, :, :]), 1)
        feature = cp.asnumpy(dnn.GetFeature(cp.asarray(cqt)).data)
        lab = utils.LoadLabelArr(labfile)
        min_sz = min([feature.shape[0], lab.shape[0]])
        X.append(feature[:min_sz, :])
        T.append(lab[:min_sz])
    sizes = np.array([x.shape[0] for x in X], dtype="int32")
    print("start epoch:")
    # re-enable backprop for CRF training (the train flag stays False,
    # as in the original script)
    chainer.config.train = False
    chainer.config.enable_backprop = True
    last_loss = np.inf
    for ep in range(epoch):
        sum_loss = 0.0
        rand_songid = np.random.randint(
            len(X), size=np.sum(sizes) // const.DECODER_TRAIN_SEQLEN * 8)
        for i in range(0, rand_songid.size, const.DECODER_TRAIN_BATCH):
            xbatch = []
            tbatch = []
            for songid in rand_songid[i:i + const.DECODER_TRAIN_BATCH]:
                seq_len = sizes[songid]
                start = np.random.randint(
                    seq_len - const.DECODER_TRAIN_SEQLEN - 1)
                x_snip = X[songid][start:start + const.DECODER_TRAIN_SEQLEN, :]
                t_snip = T[songid][start:start + const.DECODER_TRAIN_SEQLEN]
                if augment > 0:
                    shift = np.random.randint(augment)
                    x_snip, t_snip = shift_data(x_snip, t_snip, shift)
                xbatch.append(Variable(cp.asarray(x_snip)))
                tbatch.append(Variable(cp.asarray(t_snip)))
            rnn.cleargrads()
            opt.update(rnn, xbatch, tbatch)
            sum_loss += rnn.loss.data

        print("epoch %d/%d loss=%.3f" % (ep + 1, epoch, sum_loss / 12800.0))
        rnn.save(savefile)
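
# Usage sketch for TrainNStepCRF, assuming the extractor saved by
# TrainConvnetExtractor and a pre-trained "blstm.model" already exist on
# disk; the fold split is hypothetical.
def _train_crf_demo():
    n = len(find_files(const.PATH_HCQT, ext="npy"))
    train_idx, test_idx = np.arange(n)[:-20], np.arange(n)[-20:]
    TrainNStepCRF(train_idx, epoch=20, featmodel="convnet.model",
                  path_blstm="blstm.model", savefile="nblstm_crf.model")
    EstimateChord(test_idx, "convnet.model")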
Example #5
# (The scrape drops this example's header and the start of this statement;
#  the imports and the enclosing call below are a best-guess reconstruction,
#  with "audio.wav" as a hypothetical input file.)
import numpy as np
import networks
import chromatemplate
import utils as U
import const as C
from librosa.core import load, cqt
from librosa.feature import chroma_cqt

wav, sr = load("audio.wav", sr=C.SR)
spec = U.Embed(U.PreprocessSpec(
    np.abs(cqt(wav,
               sr=C.SR,
               hop_length=C.H,
               n_bins=144,
               bins_per_octave=24,
               filter_scale=2,
               tuning=None)).T.astype(np.float32)), size=1)

#dat = np.load("/media/wuyiming/TOSHIBA EXT/midihcqt_12/000005.npy")
# spec_dnn is sliced and used below, so the precomputed 24-bin MIDI-HCQT
# load is uncommented here (machine-specific paths from the original):
dat_24 = np.load("/media/wuyiming/TOSHIBA EXT/midihcqt_24/000005.npz")
spec_dnn = U.Embed(U.PreprocessSpec(dat_24["spec"]), size=7)

spec = spec[:, :250, :]
spec_dnn = spec_dnn[:250, :]
cnn = networks.FullCNNFeatExtractor()
cnn.load("fullcnn_crossentropy_6000.model")

deepchroma = networks.FeatureDNN()
deepchroma.load(
    "/home/wuyiming/Projects/TranscriptionChordRecognition/dnn3500.model")

chroma_cnn = cnn.GetFeature(spec).data[:, 12:24].T
chroma_dnn = deepchroma.GetFeature(spec_dnn).data[:, 12:24].T
chroma = np.log(
    1 + chroma_cqt(wav, sr=C.SR, hop_length=C.H, bins_per_octave=24)[:, :250])

target = chromatemplate.GetConvnetTargetFromPianoroll(
    U.GetPianoroll(
        "/media/wuyiming/TOSHIBA EXT/AIST.RWC-MDB-P-2001.SMF_SYNC/RM-P051.SMF_SYNC.MID"
    ))
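
# The example ends here in the scrape; a plausible continuation compares the
# three chroma variants with the pianoroll-derived target. matplotlib and
# the target orientation are assumptions, not part of the original snippet.
import matplotlib.pyplot as plt

panels = [(chroma_cnn, "CNN chroma"), (chroma_dnn, "DNN chroma"),
          (chroma, "log CQT chroma"), (target[:250].T, "pianoroll target")]
fig, axes = plt.subplots(len(panels), 1, sharex=True)
for ax, (mat, title) in zip(axes, panels):
    ax.imshow(mat, aspect="auto", origin="lower")
    ax.set_title(title)
plt.tight_layout()
plt.show()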
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun  6 13:13:37 2018

@author: wuyiming
"""

import networks as N
from librosa.core import cqt, load, note_to_hz
import const as C
import numpy as np

cnn = N.FullCNNFeatExtractor()
cnn.load("fullcnn_crossentropy_6000.model")

y, sr = load("audio.wav", sr=C.SR)

fmin = note_to_hz("C1")
spec = np.stack([
    np.abs(
        cqt(y,
            sr=C.SR,
            hop_length=C.H,
            n_bins=C.BIN_CNT,
            bins_per_octave=C.OCT_BIN,
            fmin=fmin * (h + 1),
            filter_scale=2,
            tuning=None)).T.astype(np.float32) for h in range(C.CQT_H)
])
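
# A short continuation sketch: run the stacked Harmonic-CQT through the
# loaded extractor, as the estimation script above does. The utils import
# and the PreprocessSpec call are assumptions; the original example ends at
# the np.stack above.
import utils as U

feat = cnn.GetFeature(U.PreprocessSpec(spec)).data
print("feature shape:", feat.shape)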