Example #1
    def getAudio(self,max_duration=0,sampleRate=44100):
        if not os.path.exists(self.wav_path):
            print("file not found "+self.wav_path) 
            return False
        # Read audio data
        audio,sampleRate,bitrate = util.readAudioScipy(self.wav_path)

        if max_duration==0 or (self.noteEnd - self.noteStart) < max_duration:
            note = audio[int(self.noteStart*sampleRate):int(self.noteEnd*sampleRate)]
        else:
            note = audio[int(self.noteStart*sampleRate):int((self.noteStart+max_duration)*sampleRate)]

        #detect onset with RMS
        lengthData = len(note)
        hopsize=128
        lengthWindow=1024
        numberFrames = int(np.ceil(lengthData / np.double(hopsize)))
        
        energy = np.zeros([int(numberFrames), 1], dtype=float)
        for n in np.arange(numberFrames):
            beginFrame = int(n*hopsize)
            endFrame = int(beginFrame+lengthWindow)
            segment = note[int(beginFrame):int(endFrame)]
            energy[n] = np.sqrt( np.sum( np.power(segment,2)) / len(segment))
  
        onset = np.maximum(0,np.argmax(energy>0.01)-1)
        energy=None
        
        if onset>0:
            self.noteStart=self.noteStart+onset*float(hopsize)/sampleRate
            if max_duration==0 or (self.noteEnd - self.noteStart) < max_duration:
                note = audio[int(self.noteStart*sampleRate):int(self.noteEnd*sampleRate)]
            else:
                note = audio[int(self.noteStart*sampleRate):int((self.noteStart+max_duration)*sampleRate)]
        return note
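
A minimal, self-contained sketch of the RMS-based onset detection used above, assuming only numpy; the threshold (0.01), hop size and window length mirror the example, and trim_to_onset is a hypothetical helper name, not part of the repository.

import numpy as np

def trim_to_onset(note, hopsize=128, lengthWindow=1024, threshold=0.01):
    """Return the note trimmed to the first frame whose RMS exceeds the threshold."""
    numberFrames = int(np.ceil(len(note) / float(hopsize)))
    energy = np.zeros(numberFrames)
    for n in range(numberFrames):
        segment = note[n * hopsize:n * hopsize + lengthWindow]
        energy[n] = np.sqrt(np.mean(segment ** 2))
    # index of the first frame above threshold, backed off by one hop
    # (np.argmax returns 0 when no frame exceeds the threshold)
    onset = max(0, int(np.argmax(energy > threshold)) - 1)
    return note[onset * hopsize:]
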
Example #2
        db = kwargs.__getattribute__('db')
    # else:
    #     db='/home/marius/Documents/Database/iKala/'
    if kwargs.__getattribute__('feature_path'):
        feature_path = kwargs.__getattribute__('feature_path')
    else:
        feature_path = os.path.join(db, 'transforms', 't1')
    assert os.path.isdir(
        db
    ), "Please input the directory for the iKala dataset with --db path_to_iKala"

    tt = None
    for f in os.listdir(os.path.join(db, "Wavfile")):
        if f.endswith(".wav"):
            #read the audio file
            audioObj, sampleRate, bitrate = util.readAudioScipy(
                os.path.join(db, "Wavfile", f))
            if tt is None:
                #initialize the transform object which will compute the STFT
                tt = transformFFT(frameSize=1024,
                                  hopSize=512,
                                  sampleRate=sampleRate,
                                  window=blackmanharris)
                pitchhop = 0.032 * float(sampleRate)  #seconds to frames
            assert sampleRate == 44100, "Sample rate needs to be 44100"

            audio = np.zeros((audioObj.shape[0], 3))

            audio[:, 0] = audioObj[:, 0] + audioObj[:, 1]  #create mixture voice + accompaniment
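
For reference, a sketch of the same mixture construction without the repository's util.readAudioScipy, using scipy.io.wavfile directly; the file name is a placeholder and the code assumes a 16-bit stereo iKala-style wav with one stem per channel.

import numpy as np
from scipy.io import wavfile

sampleRate, audioObj = wavfile.read('Wavfile/10161_chorus.wav')     # placeholder path
audioObj = audioObj.astype(np.float32) / np.iinfo(np.int16).max     # assume 16-bit PCM, scale to [-1, 1]

audio = np.zeros((audioObj.shape[0], 3))
audio[:, 0] = audioObj[:, 0] + audioObj[:, 1]   # mixture = sum of the two channels
audio[:, 1] = audioObj[:, 0]                    # first stem (e.g. accompaniment)
audio[:, 2] = audioObj[:, 1]                    # second stem (e.g. voice)
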
Example #3
def train_auto(train,
               fun,
               transform,
               testdir,
               outdir,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\", writing the result in \"outdir\"

    Parameters
    ----------
    train : Callable, e.g. LargeDataset object
        The callable which generates training data for the network: inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained  
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string
        The directory where the files to be separated are located
    outdir : string
        The directory where the separated files are written
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass over all examples in the dataset)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network) 
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')

    eps = 1e-8
    alpha = 0.001
    beta = 0.01
    beta_voc = 0.03

    network2 = fun(input_var=input_var2,
                   batch_size=train.batch_size,
                   time_context=train.time_context,
                   feat_size=train.input_size)

    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network2, params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size, 1, train.time_context,
                                       train.input_size))

    voc = prediction2[:, 0:1, :, :] + eps * rand_num
    bas = prediction2[:, 1:2, :, :] + eps * rand_num
    dru = prediction2[:, 2:3, :, :] + eps * rand_num
    oth = prediction2[:, 3:4, :, :] + eps * rand_num

    mask1 = voc / (voc + bas + dru + oth)
    mask2 = bas / (voc + bas + dru + oth)
    mask3 = dru / (voc + bas + dru + oth)
    mask4 = oth / (voc + bas + dru + oth)

    vocals = mask1 * input_var2
    bass = mask2 * input_var2
    drums = mask3 * input_var2
    others = mask4 * input_var2

    train_loss_recon_vocals = lasagne.objectives.squared_error(
        vocals, target_var2[:, 0:1, :, :])
    alpha_component = alpha * lasagne.objectives.squared_error(
        vocals, target_var2[:, 1:2, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        vocals, target_var2[:, 2:3, :, :])
    train_loss_recon_neg_voc = beta_voc * lasagne.objectives.squared_error(
        vocals, target_var2[:, 3:4, :, :])

    train_loss_recon_bass = lasagne.objectives.squared_error(
        bass, target_var2[:, 1:2, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        bass, target_var2[:, 0:1, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        bass, target_var2[:, 2:3, :, :])
    train_loss_recon_neg = beta * lasagne.objectives.squared_error(
        bass, target_var2[:, 3:4, :, :])

    train_loss_recon_drums = lasagne.objectives.squared_error(
        drums, target_var2[:, 2:3, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        drums, target_var2[:, 0:1, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        drums, target_var2[:, 1:2, :, :])
    train_loss_recon_neg += beta * lasagne.objectives.squared_error(
        drums, target_var2[:, 3:4, :, :])

    vocals_error = train_loss_recon_vocals.sum()
    drums_error = train_loss_recon_drums.sum()
    bass_error = train_loss_recon_bass.sum()
    negative_error = train_loss_recon_neg.sum()
    negative_error_voc = train_loss_recon_neg_voc.sum()
    alpha_component = alpha_component.sum()

    loss = abs(vocals_error + drums_error + bass_error - negative_error -
               alpha_component - negative_error_voc)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    # val_updates=lasagne.updates.nesterov_momentum(loss1, params1, learning_rate=0.00001, momentum=0.7)

    train_fn = theano.function([input_var2, target_var2],
                               loss,
                               updates=updates,
                               allow_input_downcast=True)

    train_fn1 = theano.function([input_var2, target_var2], [
        vocals_error, bass_error, drums_error, negative_error, alpha_component,
        negative_error_voc
    ],
                                allow_input_downcast=True)

    predict_function2 = theano.function([input_var2],
                                        [vocals, bass, drums, others],
                                        allow_input_downcast=True)

    losser = []
    loss2 = []

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            vocals_err = 0
            drums_err = 0
            bass_err = 0
            negative_err = 0
            alpha_component = 0
            beta_voc = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                jump = inputs.shape[2]
                inputs = np.reshape(
                    inputs,
                    (inputs.shape[0], 1, inputs.shape[1], inputs.shape[2]))
                targets = np.ndarray(shape=(inputs.shape[0], 4,
                                            inputs.shape[2], inputs.shape[3]))
                #import pdb;pdb.set_trace()
                targets[:, 0, :, :] = target[:, :, :jump]
                targets[:, 1, :, :] = target[:, :, jump:jump * 2]
                targets[:, 2, :, :] = target[:, :, jump * 2:jump * 3]
                targets[:, 3, :, :] = target[:, :, jump * 3:jump * 4]
                target = None

                train_err += train_fn(inputs, targets)
                [
                    vocals_erre, bass_erre, drums_erre, negative_erre, alpha,
                    betae_voc
                ] = train_fn1(inputs, targets)
                vocals_err += vocals_erre
                bass_err += bass_erre
                drums_err += drums_erre
                negative_err += negative_erre
                beta_voc += betae_voc
                alpha_component += alpha
                train_batches += 1

            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err /
                                                      train_batches))
            print("  training loss for vocals:\t\t{:.6f}".format(
                vocals_err / train_batches))
            print("  training loss for bass:\t\t{:.6f}".format(bass_err /
                                                               train_batches))
            print("  training loss for drums:\t\t{:.6f}".format(drums_err /
                                                                train_batches))
            print("  Beta component:\t\t{:.6f}".format(negative_err /
                                                       train_batches))
            print("  Beta component for voice:\t\t{:.6f}".format(
                beta_voc / train_batches))
            print("  alpha component:\t\t{:.6f}".format(alpha_component /
                                                        train_batches))
            losser.append(train_err / train_batches)
            save_model(model, network2)

    if not skip_sep:

        logging.info("Separating")
        source = ['vocals', 'bass', 'drums', 'other']
        dev_directory = os.listdir(os.path.join(testdir, "Dev"))
        test_directory = os.listdir(os.path.join(
            testdir, "Test"))  #the Test directory is included as well
        dirlist = []
        dirlist.extend(dev_directory)
        dirlist.extend(test_directory)
        for f in sorted(dirlist):
            if not f.startswith('.'):
                if f in dev_directory:
                    song = os.path.join(testdir, "Dev", f, "mixture.wav")
                else:
                    song = os.path.join(testdir, "Test", f, "mixture.wav")
                audioObj, sampleRate, bitrate = util.readAudioScipy(song)

                assert sampleRate == 44100, "Sample rate needs to be 44100"

                audio = (audioObj[:, 0] + audioObj[:, 1]) / 2
                audioObj = None
                mag, ph = transform.compute_file(audio, phase=True)

                mag = scale_factor * mag.astype(np.float32)

                batches, nchunks = util.generate_overlapadd(
                    mag,
                    input_size=mag.shape[-1],
                    time_context=train.time_context,
                    overlap=train.overlap,
                    batch_size=train.batch_size,
                    sampleRate=sampleRate)
                output = []

                batch_no = 1
                for batch in batches:
                    batch_no += 1
                    start_time = time.time()
                    output.append(predict_function2(batch))

                output = np.array(output)
                mm = util.overlapadd_multi(output,
                                           batches,
                                           nchunks,
                                           overlap=train.overlap)

                #write audio files
                if f in dev_directory:
                    dirout = os.path.join(outdir, "Dev", f)
                else:
                    dirout = os.path.join(outdir, "Test", f)
                if not os.path.exists(dirout):
                    os.makedirs(dirout)
                for i in range(mm.shape[0]):
                    audio_out = transform.compute_inverse(
                        mm[i, :len(ph)] / scale_factor, ph)
                    if len(audio_out) > len(audio):
                        audio_out = audio_out[:len(audio)]
                    util.writeAudioScipy(
                        os.path.join(dirout, source[i] + '.wav'), audio_out,
                        sampleRate, bitrate)
                    audio_out = None
                audio = None

    return losser
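
The core of the network above can be summarised in a few lines of numpy: the model predicts one non-negative magnitude estimate per source, and each source is recovered by applying a ratio mask to the mixture magnitude. This is an illustrative sketch of the idea, not the repository's code; the array shapes are assumptions.

import numpy as np

def ratio_mask_separation(estimates, mixture_mag, eps=1e-8):
    """estimates: (nsources, time, freq) non-negative magnitude estimates.
    mixture_mag: (time, freq) mixture magnitude spectrogram.
    Returns per-source magnitudes obtained with ratio masks."""
    denom = estimates.sum(axis=0) + eps        # avoid division by zero
    masks = estimates / denom                  # masks sum to ~1 in every TF bin
    return masks * mixture_mag[None, :, :]     # broadcast the mixture over sources

# usage sketch with random data
est = np.random.rand(4, 30, 513)               # e.g. vocals, bass, drums, other
mix = est.sum(axis=0)
separated = ratio_mask_separation(est, mix)
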
Example #4
    dirlist.extend(os.listdir(os.path.join(mixture_directory, "Test")))
    dev_directory = os.listdir(os.path.join(mixture_directory, "Dev"))
    test_directory = os.listdir(os.path.join(mixture_directory, "Test"))
    for f in sorted(dirlist):
        if not f.startswith('.'):
            print("\033[1;34m" + "- Processing file: %s" % f + "\033[0;0m")
            for co in combo:
                c = np.array(co)

                if f in dev_directory:
                    song_dir = os.path.join(source_directory, "Dev", f)
                else:
                    song_dir = os.path.join(source_directory, "Test", f)

                #read the sources audio files
                vocals, sampleRate, bitrate = util.readAudioScipy(
                    os.path.join(song_dir, "vocals.wav"))
                if vocals.shape[1] > 1:
                    vocals[:, 0] = (vocals[:, 0] + vocals[:, 1]) / 2
                    vocals = vocals[:, 0]

                #initialize variables
                number_of_blocks = int(
                    len(vocals) / (float(sampleRate) * 30.0))
                last_block = int(len(vocals) % float(sampleRate))
                if tt is None:
                    #initialize the transform object which will compute the STFT
                    tt = transformFFT(frameSize=1024,
                                      hopSize=512,
                                      sampleRate=sampleRate,
                                      window=blackmanharris)
                nframes = int(np.ceil(len(vocals) / np.double(tt.hopSize))) + 2
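
A quick worked version of the block and frame arithmetic used above, assuming a 44.1 kHz signal and the same 512-sample hop; the track length is illustrative.

import numpy as np

sampleRate = 44100
hopSize = 512
n_samples = 3 * 60 * sampleRate                                   # e.g. a 3-minute track

number_of_blocks = int(n_samples / (float(sampleRate) * 30.0))    # 30-second blocks -> 6
nframes = int(np.ceil(n_samples / np.double(hopSize))) + 2        # STFT frames, plus 2 for padding
print(number_of_blocks, nframes)                                  # 6 15506
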
Example #5
def train_auto(train,
               fun,
               transform,
               testdir,
               outdir,
               testfile_list,
               testdir1,
               outdir1,
               testfile_list1,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\", writing the result in \"outdir\"

    Parameters
    ----------
    train : Callable, e.g. LargeDataset object
        The callable which generates training data for the network: inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string
        The directory where the files to be separated are located
    outdir : string
        The directory where the separated files are written
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass over all examples in the dataset)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')

    eps = 1e-18
    alpha = 0.001

    network2 = fun(input_var=input_var2,
                   batch_size=train.batch_size,
                   time_context=train.time_context,
                   feat_size=train.input_size)

    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network2, params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size, 1, train.time_context,
                                       train.input_size))

    s1 = prediction2[:, 0:1, :, :]
    s2 = prediction2[:, 1:2, :, :]
    s3 = prediction2[:, 2:3, :, :]
    s4 = prediction2[:, 3:4, :, :]

    mask1 = s1 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask2 = s2 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask3 = s3 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask4 = s4 / (s1 + s2 + s3 + s4 + eps * rand_num)

    source1 = mask1 * input_var2[:, 0:1, :, :]
    source2 = mask2 * input_var2[:, 0:1, :, :]
    source3 = mask3 * input_var2[:, 0:1, :, :]
    source4 = mask4 * input_var2[:, 0:1, :, :]

    train_loss_recon1 = lasagne.objectives.squared_error(
        source1, target_var2[:, 0:1, :, :])
    train_loss_recon2 = lasagne.objectives.squared_error(
        source2, target_var2[:, 1:2, :, :])
    train_loss_recon3 = lasagne.objectives.squared_error(
        source3, target_var2[:, 2:3, :, :])
    train_loss_recon4 = lasagne.objectives.squared_error(
        source4, target_var2[:, 3:4, :, :])

    error1 = train_loss_recon1.sum()
    error2 = train_loss_recon2.sum()
    error3 = train_loss_recon3.sum()
    error4 = train_loss_recon4.sum()

    loss = abs(error1 + error2 + error3 + error4)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2, target_var2],
                               loss,
                               updates=updates,
                               allow_input_downcast=True)

    train_fn1 = theano.function([input_var2, target_var2],
                                [error1, error2, error3, error4],
                                allow_input_downcast=True)

    predict_function2 = theano.function([input_var2],
                                        [source1, source2, source3, source4],
                                        allow_input_downcast=True)

    losser = []

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            err1 = 0
            err2 = 0
            err3 = 0
            err4 = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()

                jump = inputs.shape[2]
                targets = np.ndarray(shape=(inputs.shape[0], 4,
                                            inputs.shape[1], inputs.shape[2]))
                inputs = np.reshape(
                    inputs,
                    (inputs.shape[0], 1, inputs.shape[1], inputs.shape[2]))

                targets[:, 0, :, :] = target[:, :, :jump]
                targets[:, 1, :, :] = target[:, :, jump:jump * 2]
                targets[:, 2, :, :] = target[:, :, jump * 2:jump * 3]
                targets[:, 3, :, :] = target[:, :, jump * 3:jump * 4]
                target = None
                #gc.collect()

                train_err += train_fn(inputs, targets)
                [e1, e2, e3, e4] = train_fn1(inputs, targets)
                err1 += e1
                err2 += e2
                err3 += e3
                err4 += e4
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err /
                                                             train_batches))
            logging.info("  training loss for bassoon:\t\t{:.6f}".format(
                err1 / train_batches))
            logging.info("  training loss for clarinet:\t\t{:.6f}".format(
                err2 / train_batches))
            logging.info("  training loss for saxophone:\t\t{:.6f}".format(
                err3 / train_batches))
            logging.info("  training loss for violin:\t\t{:.6f}".format(
                err4 / train_batches))
            losser.append(train_err / train_batches)
            save_model(model, network2)

    if not skip_sep:

        logging.info("Separating")
        sources = ['bassoon', 'clarinet', 'saxphone', 'violin']
        sources_midi = ['bassoon', 'clarinet', 'saxophone', 'violin']

        for f in testfile_list:
            for i in range(len(sources)):
                filename = os.path.join(testdir, f,
                                        f + '-' + sources[i] + '.wav')
                audioObj, sampleRate, bitrate = util.readAudioScipy(filename)

                assert sampleRate == 44100, "Sample rate needs to be 44100"

                nframes = int(np.ceil(
                    len(audioObj) / np.double(tt.hopSize))) + 2
                if i == 0:
                    audio = np.zeros(audioObj.shape[0])
                    #melody = np.zeros((len(sources),1,nframes))
                audio = audio + audioObj
                audioObj = None

            mag, ph = transform.compute_file(audio, phase=True)
            mag = scale_factor * mag.astype(np.float32)

            batches, nchunks = util.generate_overlapadd(
                mag,
                input_size=mag.shape[-1],
                time_context=train.time_context,
                overlap=train.overlap,
                batch_size=train.batch_size,
                sampleRate=44100)
            output = []
            #output1=[]

            batch_no = 1
            for batch in batches:
                batch_no += 1
                start_time = time.time()
                output.append(predict_function2(batch))

            output = np.array(output)
            mm = util.overlapadd_multi(output,
                                       batches,
                                       nchunks,
                                       overlap=train.overlap)
            for i in range(len(sources)):
                audio_out = transform.compute_inverse(
                    mm[i, :len(ph)] / scale_factor, ph)
                if len(audio_out) > len(audio):
                    audio_out = audio_out[:len(audio)]
                util.writeAudioScipy(
                    os.path.join(outdir, f + '-' + sources[i] + '.wav'),
                    audio_out, sampleRate, bitrate)
                audio_out = None

        style = ['fast', 'slow', 'original']
        if not os.path.exists(outdir1):
            os.makedirs(outdir1)
        for s in style:
            for f in testfile_list1:
                for i in range(len(sources)):
                    filename = os.path.join(
                        testdir1, f,
                        f + '_' + s + '_' + sources_midi[i] + '.wav')
                    audioObj, sampleRate, bitrate = util.readAudioScipy(
                        filename)

                    assert sampleRate == 44100, "Sample rate needs to be 44100"

                    nframes = int(
                        np.ceil(len(audioObj) / np.double(tt.hopSize))) + 2

                    if i == 0:
                        audio = np.zeros(audioObj.shape[0])
                        #melody = np.zeros((len(sources),1,nframes))
                    audio = audio + audioObj
                    audioObj = None

                mag, ph = transform.compute_file(audio, phase=True)
                mag = scale_factor * mag.astype(np.float32)

                batches, nchunks = util.generate_overlapadd(
                    mag,
                    input_size=mag.shape[-1],
                    time_context=train.time_context,
                    overlap=train.overlap,
                    batch_size=train.batch_size,
                    sampleRate=44100)
                output = []

                batch_no = 1
                for batch in batches:
                    batch_no += 1
                    start_time = time.time()
                    output.append(predict_function2(batch))

                output = np.array(output)
                mm = util.overlapadd_multi(output,
                                           batches,
                                           nchunks,
                                           overlap=train.overlap)
                for i in range(len(sources)):
                    audio_out = transform.compute_inverse(
                        mm[i, :len(ph)] / scale_factor, ph)
                    if len(audio_out) > len(audio):
                        audio_out = audio_out[:len(audio)]
                    filename = os.path.join(
                        outdir1, f + '_' + s + '_' + sources_midi[i] + '.wav')
                    util.writeAudioScipy(filename, audio_out, sampleRate,
                                         bitrate)
                    audio_out = None

    return losser
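
All of these training loops unpack a single concatenated target array into one channel per source with the same slicing pattern. A compact sketch of that reshaping, assuming numpy arrays shaped (batch, time, freq) and (batch, time, freq * nsources) like the ones produced by the data generator; split_targets is a hypothetical helper name.

import numpy as np

def split_targets(inputs, target, nsources=4):
    """Reshape inputs to (batch, 1, time, freq) and split the concatenated
    target into (batch, nsources, time, freq)."""
    jump = inputs.shape[2]                                   # frequency bins per source
    x = inputs.reshape(inputs.shape[0], 1, inputs.shape[1], inputs.shape[2])
    y = np.empty((inputs.shape[0], nsources, inputs.shape[1], inputs.shape[2]),
                 dtype=inputs.dtype)
    for k in range(nsources):
        y[:, k, :, :] = target[:, :, k * jump:(k + 1) * jump]
    return x, y

# usage sketch with random data
x, y = split_targets(np.random.rand(32, 30, 513).astype(np.float32),
                     np.random.rand(32, 30, 513 * 4).astype(np.float32))
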
Example #6
    if kwargs.__getattribute__('feature_path'):
        feature_path = kwargs.__getattribute__('feature_path')
    else:
        feature_path=os.path.join(db,'transforms','t1')
    assert os.path.isdir(db), "Please input the directory for the DSD100 dataset with --db path_to_DSD"

    # mixture_directory=os.path.join(db,'Mixtures')
    # source_directory=os.path.join(db,'Sources')

    tt = None
    dirlist = os.listdir(os.path.join(db,"train"))
    for f in sorted(dirlist):
        if not f.startswith('.'):
            #read the mix audio file
            # mix_raw, sampleRate, bitrate = util.readAudioScipy(os.path.join(mixture_directory,"Dev",f,"mixture.wav"))
            mix_raw, sampleRate, bitrate = util.readAudioScipy(os.path.join(db,"train",f,"mixture.wav"))

            if mix_raw.shape[1]>1:
                mix_raw[:,0] = (mix_raw[:,0] + mix_raw[:,1]) / 2
                mix_raw = mix_raw[:,0]

            number_of_blocks=int(len(mix_raw)/(float(sampleRate)*30.0))
            last_block=int(len(mix_raw)%float(sampleRate))

            # read the LPF audio files
            lpf_100, sampleRate, bitrate = util.readAudioScipy(os.path.join(db,"train",f,"lpf_100.wav"))

            lpf_262, sampleRate, bitrate = util.readAudioScipy(os.path.join(db,"train",f,"lpf_262.wav"))

            #read the sources audio files
            vocals, sampleRate, bitrate = util.readAudioScipy(os.path.join(db,"train",f,"vocals.wav"))
    #print len(combo)

    #compute transform
    for f in os.listdir(db):
        if os.path.isdir(os.path.join(db, f)) and f[0].isdigit():
            if not f.startswith('.'):
                for s in range(len(style)):
                    if not os.path.exists(os.path.join(feature_path,
                                                       style[s])):
                        os.makedirs(os.path.join(feature_path, style[s]))
                    for co in combo:
                        c = np.array(co)
                        for i in range(len(sources)):
                            #read the audio file
                            sounds, sampleRate, bitrate = util.readAudioScipy(
                                os.path.join(
                                    db, f, f + '_' + style[s] + '_' +
                                    sources[i] + '.wav'))

                            if sampleRate != 44100:
                                print('sample rate is not consistent')

                            if i == 0:
                                tt = transformFFT(frameSize=4096,
                                                  hopSize=512,
                                                  sampleRate=44100,
                                                  window=blackmanharris)
                                nframes = int(
                                    np.ceil(
                                        len(sounds) /
                                        np.double(tt.hopSize))) + 2
                                size = int(
def train_auto(fun,transform,testdir,outdir,testfile_list,testdir1,outdir1,testfile_list1,num_epochs=30,model="1.pkl",scale_factor=0.3,load=False,skip_train=False,skip_sep=False,
    path_transform_in=None,nsamples=40,batch_size=32, batch_memory=50, time_context=30, overlap=25, nprocs=4,mult_factor_in=0.3,mult_factor_out=0.3,timbre_model_path=None):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\", writing the result in \"outdir\"

    Parameters
    ----------
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string
        The directory where the files to be separated are located
    outdir : string
        The directory where the separated files are written
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass over all examples in the dataset)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')

    #parameters for the score-informed separation
    nharmonics=20
    interval=50 #cents
    tuning_freq=440 #Hz

    eps=1e-18
    alpha=0.001

    input_size = int(float(transform.frameSize) / 2 + 1)

    network2 = fun(input_var=input_var2,batch_size=batch_size,time_context=time_context,feat_size=input_size,nchannels=4)

    if load:
        params=load_model(model)
        lasagne.layers.set_all_param_values(network2,params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(batch_size,1,time_context,input_size))

    s1=prediction2[:,0:1,:,:]
    s2=prediction2[:,1:2,:,:]
    s3=prediction2[:,2:3,:,:]
    s4=prediction2[:,3:4,:,:]

    mask1=s1/(s1+s2+s3+s4+eps*rand_num)
    mask2=s2/(s1+s2+s3+s4+eps*rand_num)
    mask3=s3/(s1+s2+s3+s4+eps*rand_num)
    mask4=s4/(s1+s2+s3+s4+eps*rand_num)

    input_var = input_var2[:,0:1,:,:] + input_var2[:,1:2,:,:] + input_var2[:,2:3,:,:] + input_var2[:,3:4,:,:]

    source1=mask1*input_var[:,0:1,:,:]
    source2=mask2*input_var[:,0:1,:,:]
    source3=mask3*input_var[:,0:1,:,:]
    source4=mask4*input_var[:,0:1,:,:]

    train_loss_recon1 = lasagne.objectives.squared_error(source1,target_var2[:,0:1,:,:])
    train_loss_recon2 = lasagne.objectives.squared_error(source2,target_var2[:,1:2,:,:])
    train_loss_recon3 = lasagne.objectives.squared_error(source3,target_var2[:,2:3,:,:])
    train_loss_recon4 = lasagne.objectives.squared_error(source4,target_var2[:,3:4,:,:])

    error1=train_loss_recon1.sum()
    error2=train_loss_recon2.sum()
    error3=train_loss_recon3.sum()
    error4=train_loss_recon4.sum()

    loss=abs(error1+error2+error3+error4)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2,target_var2], loss, updates=updates,allow_input_downcast=True)

    train_fn1 = theano.function([input_var2,target_var2], [error1,error2,error3,error4], allow_input_downcast=True)

    predict_function2=theano.function([input_var2],[source1,source2,source3,source4],allow_input_downcast=True)

    losser=[]
    min_loss = 1e14

    training_steps = 0

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):
            train = LargeDatasetMask2(path_transform_in=path_in, nsources=4, nsamples=nsamples, batch_size=batch_size, batch_memory=batch_memory, time_context=time_context, overlap=overlap, nprocs=nprocs,mult_factor_in=scale_factor,mult_factor_out=scale_factor,\
                sampleRate=transform.sampleRate,pitch_code='e', nharmonics=20, pitch_norm=127.,tensortype=theano.config.floatX,timbre_model_path=timbre_model_path)
            train_err = 0
            train_batches = 0
            err1=0
            err2=0
            err3=0
            err4=0
            start_time = time.time()
            for batch in range(train.iteration_size):

                inputs, target, masks = train()
                jump = inputs.shape[2]

                mask=np.empty(shape=(inputs.shape[0],4,inputs.shape[1],inputs.shape[2]),dtype=theano.config.floatX)
                mask[:,0,:,:]=masks[:,:,:jump] * inputs
                mask[:,1,:,:]=masks[:,:,jump:jump*2] * inputs
                mask[:,2,:,:]=masks[:,:,jump*2:jump*3] * inputs
                mask[:,3,:,:]=masks[:,:,jump*3:jump*4] * inputs
                masks=None

                targets=np.empty(shape=(inputs.shape[0],4,inputs.shape[1],inputs.shape[2]),dtype=theano.config.floatX)
                targets[:,0,:,:]=target[:,:,:jump]
                targets[:,1,:,:]=target[:,:,jump:jump*2]
                targets[:,2,:,:]=target[:,:,jump*2:jump*3]
                targets[:,3,:,:]=target[:,:,jump*3:jump*4]
                target=None

                inputs=None

                train_err+=train_fn(mask,targets)
                [e1,e2,e3,e4]=train_fn1(mask,targets)
                err1 += e1
                err2 += e2
                err3 += e3
                err4 += e4
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err/train_batches))
            logging.info("  training loss for bassoon:\t\t{:.6f}".format(err1/train_batches))
            logging.info("  training loss for clarinet:\t\t{:.6f}".format(err2/train_batches))
            logging.info("  training loss for saxophone:\t\t{:.6f}".format(err3/train_batches))
            logging.info("  training loss for violin:\t\t{:.6f}".format(err4/train_batches))
            losser.append(train_err / train_batches)
            #save_model(model,network2)
            # if (train_err/train_batches) < min_loss:
            #     min_loss = train_err/train_batches
            save_model(model,network2)

        # training_steps = training_steps + 1
        # num_epochs = int(np.ceil(float(num_epochs)/5.))

        # if losser[-1] > min_loss:
        #     params=load_model(model)
        #     lasagne.layers.set_all_param_values(network2,params,learning_rate=0.0001)

        # updates = lasagne.updates.adam(loss, params1)
        # train_fn = theano.function([input_var2,target_var2], loss, updates=updates,allow_input_downcast=True)


    if not skip_sep:

        logging.info("Separating")
        sources = ['bassoon','clarinet','saxphone','violin']
        sources_midi = ['bassoon','clarinet','saxophone','violin']

        train = LargeDatasetMask2(path_transform_in=path_in, nsources=4, batch_size=batch_size, batch_memory=batch_memory, time_context=time_context, overlap=overlap, nprocs=nprocs,mult_factor_in=scale_factor,mult_factor_out=scale_factor,\
                sampleRate=transform.sampleRate,pitch_code='e', nharmonics=20, pitch_norm=127.,tensortype=theano.config.floatX,timbre_model_path=timbre_model_path)

        for f in testfile_list:
            nelem_g=1
            for i in range(len(sources)):
                ng = util.getMidiNum(sources_midi[i]+'_b',os.path.join(testdir,f),0,40.0)
                nelem_g = np.maximum(ng,nelem_g)
            melody = np.zeros((len(sources),int(nelem_g),2*nharmonics+3))
            for i in range(len(sources)):
                filename=os.path.join(testdir,f,f+'-'+sources[i]+'.wav')
                audioObj, sampleRate, bitrate = util.readAudioScipy(filename)

                assert sampleRate == 44100,"Sample rate needs to be 44100"

                nframes = int(np.ceil(len(audioObj) / np.double(tt.hopSize))) + 2
                if i==0:
                    audio = np.zeros(audioObj.shape[0])
                audio = audio + audioObj
                audioObj=None

                tmp = util.expandMidi(sources_midi[i]+'_b',os.path.join(testdir,f),0,40.0,interval,tuning_freq,nharmonics,sampleRate,tt.hopSize,tt.frameSize,0.2,0.2,nframes,0.5)
                melody[i,:tmp.shape[0],:] = tmp
                tmp = None

            mag,ph=transform.compute_file(audio,phase=True)
            mag=scale_factor*mag.astype(np.float32)

            jump = mag.shape[-1]

            masks_temp = train.filterSpec(mag,melody,0,nframes)
            masks = np.ones((train.ninst,mag.shape[0],mag.shape[1]))
            masks[0,:,:]=masks_temp[:,:jump] * mag
            masks[1,:,:]=masks_temp[:,jump:jump*2] * mag
            masks[2,:,:]=masks_temp[:,jump*2:jump*3] * mag
            masks[3,:,:]=masks_temp[:,jump*3:jump*4] * mag
            mag = None
            masks_temp = None

            batches,nchunks = util.generate_overlapadd(masks,input_size=masks.shape[-1],time_context=train.time_context,overlap=train.overlap,batch_size=train.batch_size,sampleRate=44100)
            masks = None

            batch_no=1
            output=[]
            for batch in batches:
                batch_no+=1
                #start_time=time.time()
                output.append(predict_function2(batch))

            output=np.array(output)
            mm=util.overlapadd_multi(output,batches,nchunks,overlap=train.overlap)
            for i in range(len(sources)):
                audio_out=transform.compute_inverse(mm[i,:len(ph)]/scale_factor,ph)
                if len(audio_out)>len(audio):
                    audio_out=audio_out[:len(audio)]
                util.writeAudioScipy(os.path.join(outdir,f+'-'+sources[i]+'.wav'),audio_out,sampleRate,bitrate)
                audio_out=None

        # style = ['fast','slow','original']
        # style_midi = ['_fast20','_slow20','_original']
        # if not os.path.exists(outdir1):
        #     os.makedirs(outdir1)
        # for s in range(len(style)):
        #     for f in testfile_list1:
        #         nelem_g=1
        #         for i in range(len(sources)):
        #             ng = util.getMidiNum(sources_midi[i]+'_g'+style_midi[s],os.path.join(testdir1,f),0,40.0)
        #             nelem_g = np.maximum(ng,nelem_g)
        #         melody = np.zeros((len(sources),int(nelem_g),2*nharmonics+3))
        #         for i in range(len(sources)):
        #             filename=os.path.join(testdir1,f,f+'_'+style[s]+'_'+sources_midi[i]+'.wav')

        #             audioObj, sampleRate, bitrate = util.readAudioScipy(filename)

        #             assert sampleRate == 44100,"Sample rate needs to be 44100"

        #             nframes = int(np.ceil(len(audioObj) / np.double(tt.hopSize))) + 2

        #             if i==0:
        #                 audio = np.zeros(audioObj.shape[0])

        #             audio = audio + audioObj
        #             audioObj=None

        #             tmp = util.expandMidi(sources_midi[i]+'_g'+style_midi[s],os.path.join(testdir1,f),0,40.0,interval,tuning_freq,nharmonics,sampleRate,tt.hopSize,tt.frameSize,0.2,0.2,nframes)
        #             melody[i,:tmp.shape[0],:] = tmp
        #             tmp = None

        #         mag,ph=transform.compute_file(audio,phase=True)
        #         mag=scale_factor*mag.astype(np.float32)

        #         jump = mag.shape[-1]

        #         masks_temp = train.filterSpec(mag,melody,0,nframes)
        #         masks = np.ones((train.ninst,mag.shape[0],mag.shape[1]))
        #         masks[0,:,:]=masks_temp[:,:jump] * mag
        #         masks[1,:,:]=masks_temp[:,jump:jump*2] * mag
        #         masks[2,:,:]=masks_temp[:,jump*2:jump*3] * mag
        #         masks[3,:,:]=masks_temp[:,jump*3:jump*4] * mag
        #         mag = None
        #         masks_temp = None

        #         batches,nchunks = util.generate_overlapadd(masks,input_size=masks.shape[-1],time_context=train.time_context,overlap=train.overlap,batch_size=train.batch_size,sampleRate=44100)
        #         masks = None

        #         batch_no=1
        #         output=[]
        #         for batch in batches:
        #             batch_no+=1
        #             #start_time=time.time()
        #             output.append(predict_function2(batch))

        #         output=np.array(output)
        #         mm=util.overlapadd_multi(output,batches,nchunks,overlap=train.overlap)
        #         for i in range(len(sources)):
        #             audio_out=transform.compute_inverse(mm[i,:len(ph)]/scale_factor,ph)
        #             if len(audio_out)>len(audio):
        #                 audio_out=audio_out[:len(audio)]
        #             filename=os.path.join(outdir1,f+'_'+style[s]+'_'+sources_midi[i]+'.wav')
        #             util.writeAudioScipy(filename,audio_out,sampleRate,bitrate)
        #             audio_out=None

    return losser
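
In this score-informed variant the network does not see the raw mixture: each input channel is the mixture magnitude weighted by a score-derived mask for one instrument. A small numpy sketch of that weighting step; the flat mask layout follows the filterSpec output used above, but the shapes and the helper name are assumptions.

import numpy as np

def apply_score_masks(mag, masks_flat, nsources=4):
    """mag: (time, freq) mixture magnitude; masks_flat: (time, freq * nsources)
    score-derived masks concatenated along frequency.
    Returns (nsources, time, freq): the mixture weighted by each instrument's mask."""
    jump = mag.shape[-1]
    weighted = np.zeros((nsources, mag.shape[0], mag.shape[1]), dtype=mag.dtype)
    for k in range(nsources):
        weighted[k, :, :] = masks_flat[:, k * jump:(k + 1) * jump] * mag
    return weighted
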
Example #9
def train_auto(train,fun,transform,testdir,outdir,num_epochs=30,model="1.pkl",scale_factor=0.3,load=False,skip_train=False,skip_sep=False):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\", writing the result in \"outdir\"

    Parameters
    ----------
    train : Callable, e.g. LargeDataset object
        The callable which generates training data for the network: inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained  
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features.py)
    testdir : string
        The directory where the files to be separated are located
    outdir : string
        The directory where the separated files are written
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass over all examples in the dataset)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network) 
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')
    rand_num = T.tensor4('rand_num')
    
    eps=1e-8
    alpha=0.9
    beta_acc=0.005
    beta_voc=0.02

    network2 = fun(input_var=input_var2,batch_size=train.batch_size,time_context=train.time_context,feat_size=train.input_size)
    
    if load:
        params=load_model(model)
        lasagne.layers.set_all_param_values(network2,params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size,1,train.time_context,train.input_size))

    voc=prediction2[:,0:1,:,:]+eps*rand_num
    acco=prediction2[:,1:2,:,:]+eps*rand_num

    mask1=voc/(voc+acco)
    mask2=acco/(voc+acco)

    vocals=mask1*input_var2[:,0:1,:,:]
    acc=mask2*input_var2[:,0:1,:,:]
    
    train_loss_recon_vocals = lasagne.objectives.squared_error(vocals,target_var2[:,0:1,:,:])
    train_loss_recon_acc = alpha * lasagne.objectives.squared_error(acc,target_var2[:,1:2,:,:])    
    train_loss_recon_neg_voc = beta_voc * lasagne.objectives.squared_error(vocals,target_var2[:,1:2,:,:])
    train_loss_recon_neg_acc = beta_acc * lasagne.objectives.squared_error(acc,target_var2[:,0:1,:,:])

    vocals_error=train_loss_recon_vocals.sum()  
    acc_error=train_loss_recon_acc.sum()  
    negative_error_voc=train_loss_recon_neg_voc.sum()
    negative_error_acc=train_loss_recon_neg_acc.sum()
    
    loss=abs(vocals_error+acc_error-negative_error_voc)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2,target_var2], loss, updates=updates,allow_input_downcast=True)

    train_fn1 = theano.function([input_var2,target_var2], [vocals_error,acc_error,negative_error_voc,negative_error_acc], allow_input_downcast=True)

    predict_function2=theano.function([input_var2],[vocals,acc],allow_input_downcast=True)
    predict_function3=theano.function([input_var2],[prediction2[:,0:1,:,:],prediction2[:,1:2,:,:]],allow_input_downcast=True)

    losser=[]
    loss2=[]

    if not skip_train:

        logging.info("Training...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            vocals_err=0
            acc_err=0        
            beta_voc=0
            beta_acc=0
            start_time = time.time()
            for batch in range(train.iteration_size): 
                inputs, target = train()
                
                jump = inputs.shape[2]
                targets=np.ndarray(shape=(inputs.shape[0],2,inputs.shape[1],inputs.shape[2]))
                inputs=np.reshape(inputs,(inputs.shape[0],1,inputs.shape[1],inputs.shape[2]))          

                targets[:,0,:,:]=target[:,:,:jump]
                targets[:,1,:,:]=target[:,:,jump:jump*2]         
                target=None
        
                train_err+=train_fn(inputs,targets)
                [vocals_erre,acc_erre,betae_voc,betae_acc]=train_fn1(inputs,targets)
                vocals_err += vocals_erre
                acc_err += acc_erre           
                beta_voc+= betae_voc
                beta_acc+= betae_acc
                train_batches += 1
            
            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err/train_batches))
            logging.info("  training loss for vocals:\t\t{:.6f}".format(vocals_err/train_batches))
            logging.info("  training loss for acc:\t\t{:.6f}".format(acc_err/train_batches))
            logging.info("  Beta component for voice:\t\t{:.6f}".format(beta_voc/train_batches))
            logging.info("  Beta component for acc:\t\t{:.6f}".format(beta_acc/train_batches))
            losser.append(train_err / train_batches)
            save_model(model,network2)

    if not skip_sep:

        logging.info("Separating")
        for f in os.listdir(testdir):
            if f.endswith(".wav"):
                audioObj, sampleRate, bitrate = util.readAudioScipy(os.path.join(testdir,f))
                
                assert sampleRate == 44100,"Sample rate needs to be 44100"

                audio = audioObj[:,0] + audioObj[:,1]
                audioObj = None
                mag,ph=transform.compute_file(audio,phase=True)
         
                mag=scale_factor*mag.astype(np.float32)

                batches,nchunks = util.generate_overlapadd(mag,input_size=mag.shape[-1],time_context=train.time_context,overlap=train.overlap,batch_size=train.batch_size,sampleRate=sampleRate)
                output=[]

                batch_no=1
                for batch in batches:
                    batch_no+=1
                    start_time=time.time()
                    output.append(predict_function2(batch))

                output=np.array(output)
                bmag,mm=util.overlapadd(output,batches,nchunks,overlap=train.overlap)
                
                audio_out=transform.compute_inverse(bmag[:len(ph)]/scale_factor,ph)
                if len(audio_out)>len(audio):
                    audio_out=audio_out[:len(audio)]
                audio_out=essentia.array(audio_out)
                audio_out2= transform.compute_inverse(mm[:len(ph)]/scale_factor,ph) 
                if len(audio_out2)>len(audio):
                    audio_out2=audio_out2[:len(audio)]  
                audio_out2=essentia.array(audio_out2) 
                #write audio files
                util.writeAudioScipy(os.path.join(outdir,f.replace(".wav","-voice.wav")),audio_out,sampleRate,bitrate)
                util.writeAudioScipy(os.path.join(outdir,f.replace(".wav","-music.wav")),audio_out2,sampleRate,bitrate)
                audio_out=None 
                audio_out2=None   

    return losser  
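
Example #9 trains with a per-source reconstruction loss minus a weighted "dissimilarity" term that penalises the vocal estimate for explaining the accompaniment target. A hedged numpy sketch of that objective with the same alpha/beta weights; the example computes it symbolically with Theano, so the plain-numpy function below is illustrative only.

import numpy as np

def voice_acc_loss(vocals, acc, target_voc, target_acc, alpha=0.9, beta_voc=0.02):
    """Squared-error reconstruction for vocals and accompaniment, minus a term
    discouraging the vocal estimate from matching the accompaniment target."""
    vocals_error = np.sum((vocals - target_voc) ** 2)
    acc_error = alpha * np.sum((acc - target_acc) ** 2)
    negative_error_voc = beta_voc * np.sum((vocals - target_acc) ** 2)
    return abs(vocals_error + acc_error - negative_error_voc)
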
Example #10
    source_directory = os.path.join(db, 'Sources')

    instrument_activation = np.ones((5, 4))
    for i in range(5):
        for j in range(4):
            if i == j:
                instrument_activation[i][j] = 0

    tt = None
    dirlist = os.listdir(os.path.join(mixture_directory, "Dev"))
    for f in sorted(dirlist):
        for ins in range(5):
            if not f.startswith('.'):
                print("\033[1;34m" + "- Processing file: %s" % f + "\033[0;0m")
                #read the sources audio files
                bass, sampleRate, bitrate = util.readAudioScipy(
                    os.path.join(source_directory, "Dev", f, "bass.wav"))
                if bass.shape[1] > 1:
                    bass[:, 0] = (bass[:, 0] + bass[:, 1]) / 2
                    bass = bass[:, 0]
                if instrument_activation[ins, 0] == 0:  ##
                    bass = np.zeros(len(bass))
                    print(" * Without BASS")

                if instrument_activation[ins, 1] == 1:  ##
                    drums, sampleRate, bitrate = util.readAudioScipy(
                        os.path.join(source_directory, "Dev", f, "drums.wav"))
                    if drums.shape[1] > 1:
                        drums[:, 0] = (drums[:, 0] + drums[:, 1]) / 2
                        drums = drums[:, 0]
                else:
                    drums = np.zeros(len(bass))
                    ng = util.getMidiNum(sources_midi[i] + '_g',
                                         os.path.join(db, f), 0, 40.0)
                    nelem_g = np.maximum(ng, nelem_g)
                    nb = util.getMidiNum(sources_midi[i] + '_b',
                                         os.path.join(db, f), 0, 40.0)
                    nelem_b = np.maximum(nb, nelem_b)
                melody_g = np.zeros(
                    (len(sources), int(nelem_g), 2 * nharmonics + 3))
                melody_b = np.zeros(
                    (len(sources), int(nelem_b), 2 * nharmonics + 3))
                melody_e = np.zeros(
                    (len(sources), int(nelem_b), 2 * nharmonics + 3))

                for i in range(len(sources)):
                    #read the audio file
                    audioObj, sampleRate, bitrate = util.readAudioScipy(
                        os.path.join(db, f, f + '-' + sources[i] + '.wav'))

                    if i == 0:
                        tt = transformFFT(frameSize=4096,
                                          hopSize=512,
                                          sampleRate=44100,
                                          window=blackmanharris)
                        nframes = int(len(audioObj) / tt.hopSize)
                        audio = np.zeros((audioObj.shape[0], len(sources) + 1))

                    audio[:, 0] = audio[:, 0] + audioObj
                    audio[:, i + 1] = audioObj
                    audioObj = None

                    tmp = util.expandMidi(sources_midi[i] + '_g',
                                          os.path.join(db, f), 0, 40.0,
Example #12
tr = transformMEL(bins=43, frameSize=1024, hopSize=512)

path_to_irmas = '/home/js/dataset/IRMAS/'
feature_dir_train = os.path.join(path_to_irmas, 'features', 'Training')
if not os.path.exists(feature_dir_train):
    os.makedirs(feature_dir_train)

d = os.path.join(path_to_irmas, 'Training')
instruments = sorted(
    filter(lambda x: os.path.isdir(os.path.join(d, x)), os.listdir(d)))

for count, inst in enumerate(instruments):
    for f in os.listdir(os.path.join(d, inst)):
        if os.path.isfile(os.path.join(d, inst, f)) and f.endswith('.wav'):
            audio, sampleRate, bitrate = util.readAudioScipy(
                os.path.join(d, inst, f))
            tr.compute_transform(audio.sum(axis=1),
                                 out_path=os.path.join(
                                     feature_dir_train,
                                     f.replace('.wav', '.data')),
                                 suffix='_mel_',
                                 sampleRate=sampleRate)
            util.saveTensor(np.array([count], dtype=float),
                            out_path=os.path.join(feature_dir_train,
                                                  f.replace('.wav', '.data')),
                            suffix='_label_')

suffix_in = '_mel_'
suffix_out = '_label_'
file_list = [
    f for f in os.listdir(feature_dir_train)
Example #13
def train_auto(fun,
               train,
               transform,
               testdir,
               outdir,
               num_epochs=30,
               model="1.pkl",
               scale_factor=0.3,
               load=False,
               skip_train=False,
               skip_sep=False,
               chunk_size=60,
               chunk_overlap=2,
               nsamples=40,
               batch_size=32,
               batch_memory=50,
               time_context=30,
               overlap=25,
               nprocs=4,
               mult_factor_in=0.3,
               mult_factor_out=0.3):
    """
    Trains a network built with \"fun\" with the data generated with \"train\"
    and then separates the files in \"testdir\", writing the result in \"outdir\"

    Parameters
    ----------
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features (see compute_features_DSD100.py)
    testdir : string
        The directory where the files to be separated are located
    outdir : string
        The directory where the separated files are written
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass over all examples in the dataset)
    model : string, optional
        The path where to save the trained model (theano tensor containing the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor
    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """

    logging.info("Building Autoencoder")
    input_var = T.tensor4('inputs')
    input_mask = T.tensor4('input_mask')
    target_var = T.tensor4('targets')

    theano_rng = RandomStreams(128)

    eps = 1e-12

    sources = ['vocals', 'bass', 'drums', 'other']

    nchannels = int(train.channels_in)
    nsources = int(train.channels_out / train.channels_in)

    print('nchannels: {}'.format(nchannels))
    print('nsources: {}'.format(nsources))

    input_size = int(float(transform.frameSize) / 2 + 1)

    rand_num = theano_rng.normal(size=(batch_size, nsources, time_context,
                                       input_size),
                                 avg=0.0,
                                 std=0.1,
                                 dtype=theano.config.floatX)

    net = fun(input_var=input_var,
              batch_size=batch_size,
              time_context=time_context,
              feat_size=input_size,
              nchannels=nchannels,
              nsources=nsources)
    network = net['l_out']
    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network, params)

    prediction = lasagne.layers.get_output(network, deterministic=True)

    sourceall = []
    errors_insts = []
    loss = 0

    sep_chann = []

    # prediction example for 2 sources in 2 channels:
    # 0, 1 source 0 in channel 0 and 1
    # 2, 3 source 1 in channel 0 and 1
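    # Soft-mask separation per channel: the per-source predictions are normalised
    # by their sum over sources, and each source estimate is the mask applied to
    # that mixture channel:
    #   mask_s   = pred_s / (sum_k pred_k + eps * noise)
    #   source_s = mask_s * mixture_j + eps * noise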
    for j in range(nchannels):
        #print "j: ", j
        masksum = T.sum(prediction[:, j::nchannels, :, :], axis=1)
        temp = T.tile(masksum.dimshuffle(0, 'x', 1, 2), (1, nsources, 1, 1))
        mask = prediction[:, j::nchannels, :, :] / (temp + eps * rand_num)
        source = mask * T.tile(input_var[:, j:j + 1, :, :],
                               (1, nsources, 1, 1)) + eps * rand_num
        sourceall.append(source)

        sep_chann.append(source)
        train_loss_recon = lasagne.objectives.squared_error(
            source, target_var[:, j::nchannels, :, :])

        errors_inst = abs(train_loss_recon.sum(axis=(0, 2, 3)))

        errors_insts.append(errors_inst)

        loss = loss + abs(train_loss_recon.sum())

    params1 = lasagne.layers.get_all_params(network, trainable=True)

    updates = lasagne.updates.adadelta(loss, params1)

    train_fn_mse = theano.function([input_var, target_var],
                                   loss,
                                   updates=updates,
                                   allow_input_downcast=True)
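    # train_fn_mse performs one Adadelta update and returns the total reconstruction
    # loss; train_fn1 only evaluates the per-source, per-channel errors for logging.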

    train_fn1 = theano.function([input_var, target_var],
                                errors_insts,
                                allow_input_downcast=True)

    #----------NEW ILD LOSS CONDITION----------
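    # The ILD (interaural level difference) term compares the mean level ratio
    # between the two channels of the separated sources with the same quantity
    # computed from the ground truth:
    #   alpha = 20 * log10(|channel_0 / channel_1|),
    # averaged over batch, sources and time, leaving one value per frequency bin.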

    rand_num2 = theano_rng.normal(
        size=(batch_size, nsources, time_context, input_size),
        avg=0.0,
        std=0.1,
        dtype=theano.config.floatX)  # nsources as the first dimension?

    #estimate

    interaural_spec_est = sep_chann[0] / (sep_chann[1] + eps * rand_num2)

    alpha_est = 20 * np.log10(abs(interaural_spec_est + eps * rand_num2))
    alpha_est_mean = alpha_est.mean(axis=(0, 1, 2))

    #groundtruth

    interaural_spec_gt = target_var[:, 0::nchannels, :, :] / (
        target_var[:, 1::nchannels, :, :] + eps * rand_num2)

    alpha_gt = 20 * np.log10(abs(interaural_spec_gt + eps * rand_num2))
    alpha_gt_mean = alpha_gt.mean(
        axis=(0, 1, 2))  # this should be a one-dimensional vector

    train_loss_ild = lasagne.objectives.squared_error(alpha_est_mean,
                                                      alpha_gt_mean)

    loss = loss + (abs(train_loss_ild.sum()) / 500)

    #------------------------------------------

    predict_function = theano.function([input_var],
                                       sourceall,
                                       allow_input_downcast=True)

    losser = []

    if not skip_train:
        logging.info("Training stage 1 (mse)...")
        for epoch in range(num_epochs):

            train_err = 0
            train_batches = 0
            errs = np.zeros((nchannels, nsources))
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                train_err += train_fn_mse(inputs, target)
                errs += np.array(train_fn1(inputs, target))
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err /
                                                             train_batches))
            for j in range(nchannels):
                for i in range(nsources):
                    logging.info("  training loss for " + sources[i] +
                                 " in mic " + str(j) +
                                 ":\t\t{:.6f}".format(errs[j][i] /
                                                      train_batches))

            model_noILD = model[:-4] + '_noILD' + model[-4:]
            print 'model_noILD: ', model_noILD
            save_model(model_noILD, network)
            losser.append(train_err / train_batches)


        #NEW ILD TRAINING---------------------------------------------------------
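        # Stage 2: reload the weights saved after the MSE stage and fine-tune the
        # network with the combined MSE + down-weighted ILD objective for
        # num_epochs / 2 epochs.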

        params = load_model(model_noILD)
        lasagne.layers.set_all_param_values(network, params)
        params1 = lasagne.layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.adadelta(loss, params1)
        train_fn_ILD = theano.function([input_var, target_var],
                                       loss,
                                       updates=updates,
                                       allow_input_downcast=True)

        logging.info("Training stage 2 (ILD)...")

        for epoch in range(int(num_epochs / 2)):

            train_err = 0
            train_batches = 0

            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()

                train_err += train_fn_ILD(inputs, target)
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            logging.info("  training loss:\t\t{:.6f}".format(train_err /
                                                             train_batches))

            save_model(model, network)
            losser.append(train_err / train_batches)

    if not skip_sep:

        logging.info("Separating")

        subsets = ['Dev', 'Test']
        for sub in subsets:
            for d in sorted(os.listdir(os.path.join(db, 'Mixtures', sub))):
                print os.path.join(os.path.sep, db, 'Mixtures', sub, d,
                                   'mixture.wav')
                audio, sampleRate, bitrate = util.readAudioScipy(
                    os.path.join(os.path.sep, db, 'Mixtures', sub, d,
                                 'mixture.wav'))
                nsamples = audio.shape[0]
                sep_audio = np.zeros((nsamples, len(sources), audio.shape[1]))
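                # Separation: compute magnitude and phase of the mixture, split the
                # scaled magnitude into overlapping chunks matching the network input,
                # predict the sources per chunk, overlap-add the chunks back together
                # and invert each source with the mixture phase of that channel.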

                mag, ph = transform.compute_transform(audio, phase=True)
                mag = scale_factor * mag.astype(np.float32)
                #print 'mag.shape: ', mag.shape, 'batch_size: ', train.batch_size
                nframes = mag.shape[-2]

                batches_mag, nchunks = util.generate_overlapadd(
                    mag,
                    input_size=mag.shape[-1],
                    time_context=train.time_context,
                    overlap=train.overlap,
                    batch_size=train.batch_size,
                    sampleRate=sampleRate)
                mag = None

                output = []
                for b in range(len(batches_mag)):
                    output.append(predict_function(batches_mag[b]))
                output = np.array(output)

                for j in range(audio.shape[1]):
                    mm = util.overlapadd_multi(np.swapaxes(
                        output[:, j:j + 1, :, :, :, :], 1, 3),
                                               batches_mag,
                                               nchunks,
                                               overlap=train.overlap)
                    for i in range(len(sources)):
                        audio_out = transform.compute_inverse(
                            mm[i, :ph.shape[1], :] / scale_factor, ph[j])
                        # if len(sep_audio[:i,j])<len(audio_out):
                        #     print len(sep_audio), len(audio_out), len(audio_out)-len(sep_audio[:i,j])
                        #     sep_audio = np.concatenate(sep_audio,np.zeros(len(audio_out)-len(sep_audio[:i,j])))
                        #     print len(sep_audio), len(audio_out), len(audio_out)-len(sep_audio[:i,j])
                        sep_audio[:, i, j] = audio_out[:len(sep_audio)]

                print 'Saving separation: ', outdir
                if not os.path.exists(os.path.join(outdir)):
                    os.makedirs(os.path.join(outdir))
                    print 'Creating model folder'
                if not os.path.exists(os.path.join(outdir, 'Sources')):
                    os.makedirs(os.path.join(outdir, 'Sources'))
                    print 'Creating Sources folder: ', os.path.join(
                        outdir, 'Sources')
                if not os.path.exists(os.path.join(outdir, 'Sources', sub)):
                    os.makedirs(os.path.join(outdir, 'Sources', sub))
                    print 'Creating subset folder'
                if not os.path.exists(os.path.join(outdir, 'Sources', sub, d)):
                    os.makedirs(os.path.join(outdir, 'Sources', sub, d))
                    print 'Creating song folder', os.path.join(
                        outdir, 'Sources', sub, d)
                for i in range(len(sources)):
                    print 'Final audio file: ', i, os.path.join(
                        outdir, 'Sources', sub, d, sources[i] + '.wav'
                    ), 'nsamples: ', nsamples, 'len sep_audio :', len(
                        sep_audio)
                    util.writeAudioScipy(
                        os.path.join(outdir, 'Sources', sub, d,
                                     sources[i] + '.wav'),
                        sep_audio[:nsamples, i, :], sampleRate, bitrate)

    return losser
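The ILD term above can be read independently of the Theano graph: for every frequency bin, take the level ratio in dB between the two channels of the separated sources, average it over batch, sources and time, and penalise the squared difference between the estimated and the ground-truth curves. Below is a minimal NumPy sketch of that computation; the array shapes, the helper name ild_loss and the eps value are illustrative assumptions, not part of the original code.

import numpy as np

def ild_loss(est_ch0, est_ch1, gt_ch0, gt_ch1, eps=1e-12):
    """Squared error between estimated and ground-truth mean ILD curves.

    All four inputs are magnitude spectrograms with shape
    (batch, nsources, time, freq). Illustrative sketch only.
    """
    # interaural level difference in dB, per time-frequency bin
    alpha_est = 20.0 * np.log10(np.abs(est_ch0 / (est_ch1 + eps)) + eps)
    alpha_gt = 20.0 * np.log10(np.abs(gt_ch0 / (gt_ch1 + eps)) + eps)
    # average over batch, sources and time -> one value per frequency bin
    alpha_est_mean = alpha_est.mean(axis=(0, 1, 2))
    alpha_gt_mean = alpha_gt.mean(axis=(0, 1, 2))
    # squared error between the two ILD curves
    return np.sum((alpha_est_mean - alpha_gt_mean) ** 2)

In the training function above, the corresponding Theano expression is added to the per-channel MSE reconstruction loss with a weight of 1/500 before the second training stage.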
Example #14
0
    'bw3_1' : tf.Variable(tf.random_normal([256]), name='bw3_1'),
    'bw4_1' : tf.Variable(tf.random_normal([512]), name='bw4_1'),
    'bfc1': tf.Variable(tf.random_normal([512]), name='bfc1'),
    'bfc2': tf.Variable(tf.random_normal([num_classes]), name='bfc2')
}

x = tf.placeholder(tf.float32, [None, feature_dim])
keep_prob = tf.placeholder(tf.float32)
keep_prob2 = tf.placeholder(tf.float32)
y_conv = cnn(x, weights, biases, keep_prob, keep_prob2)
y_conv_softmax = tf.nn.softmax(y_conv)
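# y_conv_softmax holds the per-class probabilities produced by the CNN for each
# input feature vector.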

lengthWindow = 1024
hopsize = 512
tr = transformMEL(bins=43, frameSize=lengthWindow, hopSize=hopsize)
audio, sampleRate, bitrate = util.readAudioScipy(audio_file_name) 
melspec = tr.compute_transform2(audio.sum(axis=1), sampleRate=sampleRate)
# print('melspec.shape: ', melspec.shape) # (1294, 43)
sec_per_spectrogram = float(hopsize) / sampleRate * batch_size  # 1.49
num_batch = int(melspec.shape[0]/batch_size)
# print('num_batch: ', num_batch) # 10
melspec_tensor = np.zeros((num_batch,feature_dim))
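# Each row of melspec_tensor is one classification window: batch_size consecutive
# mel frames flattened into a single feature vector.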
instruments_result = []

for i in range(0, num_batch):
    # guard against slicing past the end of the spectrogram
    if (i + 1) * batch_size > melspec.shape[0]:
        break
    # plt.imshow(melspec[i*batch_size:(i+1)*batch_size].T,interpolation='none', origin='lower')
    # plt.show()
    melspec_tensor[i, :] = melspec[i*batch_size:(i+1)*batch_size].reshape(-1, feature_dim)