Ejemplo n.º 1
0
def read_music(path):
    """Build a spectrogram data set from up to two random WAV files in *path*.

    The directory listing is shuffled and the first two names ending in
    ``.wav`` are loaded.  Each file is band-pass filtered, cut into
    consecutive pieces of ``20 * 44100`` samples (the sample rate returned by
    ``read`` is only handed to the filter), reduced to one channel by
    averaging columns 0 and 1, and converted with
    ``Spectrogram.pretty_spectrogram``.  The very first sample of a file and
    any trailing partial piece are not used.

    Relies on the module-level settings ``lowcut``, ``highcut``, ``fft_size``,
    ``step_size`` and ``spec_thresh``.

    Args:
        path: Directory that contains the WAV files.

    Returns:
        All spectrograms stacked and reshaped to ``[-1, 6880, 1024, 1]``.
    """
    second = 44100
    chunk_len = second * 20  # samples per slice
    max_files = 2

    file_list = os.listdir(path)
    np.random.shuffle(file_list)

    music_file = []
    for name in file_list:
        # Raw string and end anchor: only names that really end in ".wav"
        # qualify (the unanchored pattern also accepted e.g. "x.wav.bak").
        if re.match(r'.+?\.wav$', name):
            # os.path.join works whether or not `path` ends with a separator.
            music_file.append(os.path.join(path, name))
            print(path, ",", name)
            if len(music_file) >= max_files:
                break

    print("Strart normalize...")

    print("Start slice...")
    dataSet = []
    for music in music_file:
        rate, song = read(music)
        data = np.asarray(
            Spectrogram.butter_bandpass_filter(song,
                                               lowcut,
                                               highcut,
                                               rate,
                                               order=1))

        # Slice k covers samples (k-1)*chunk_len+1 .. k*chunk_len inclusive,
        # i.e. sample 0 is skipped and an incomplete last slice is dropped.
        for stop in range(chunk_len, len(data), chunk_len):
            stereo = data[stop - chunk_len + 1:stop + 1, :2]
            mono = np.mean(stereo, axis=1)
            wav_spectrogram = Spectrogram.pretty_spectrogram(
                mono.astype('float32'),
                fft_size=fft_size,
                step_size=step_size,
                log=True,
                thresh=spec_thresh)
            dataSet.append(wav_spectrogram)

    print("Slice Finished....")
    dataSet = np.reshape(dataSet, [-1, 6880, 1024, 1])
    print(np.shape(dataSet))
    return dataSet
Ejemplo n.º 2
0
    def test(self):
        """Restore the saved model and write three style/generated WAV pairs.

        For each of three rounds ``self.ds_train`` is shuffled in place, its
        first entry is used as the style spectrogram, ``self.result`` is
        evaluated for ``self.con_dataset`` plus that style, and both the style
        and the generated spectrogram are inverted to audio and written to
        ``output_resultTest_*_NNNN.wav`` in the working directory.
        """
        rounds = 3
        spec_shape = [6880, 1024]
        gain = 10000000  # scale factor applied to the inverted audio

        init_op = tf.global_variables_initializer()
        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True

        with tf.Session(config=session_config) as sess:
            sess.run(init_op)
            self.saver.restore(sess, self.infogan_model_path)

            for idx in range(0, rounds):
                content_batch = self.con_dataset

                # Shuffles self.ds_train itself, not a copy.
                style_pool = self.ds_train
                np.random.shuffle(style_pool)
                style_spec = np.reshape(style_pool[0], [-1, 6880, 1024, 1])

                generated = sess.run(
                    self.result,
                    feed_dict={self.content_music: content_batch,
                               self.images: style_spec})
                generated_spec = np.reshape(generated, spec_shape)
                generated_audio = Spectrogram.invert_pretty_spectrogram(
                    generated_spec,
                    fft_size=self.fft_size,
                    step_size=self.step_size,
                    log=True,
                    n_iter=10)
                generated_audio = generated_audio * gain

                print("Strat generate Music")
                style_audio = Spectrogram.invert_pretty_spectrogram(
                    np.reshape(style_spec, spec_shape),
                    fft_size=self.fft_size,
                    step_size=self.step_size,
                    log=True,
                    n_iter=10)
                style_audio = style_audio * gain

                write("output_resultTest_Style_{:04d}.wav".format(idx), 44100, style_audio)
                write("output_resultTest_generated_{:04d}.wav".format(idx), 44100, generated_audio)

            print("Test finish!")
Ejemplo n.º 3
0
    def __init__(self, avg=.9998):
        """Create the STFT front end, two convolutions and the linear head.

        Uses the module-level settings ``n_fft``, ``freq_bins``, ``regions``
        and ``m``.

        Args:
            avg: Stored unchanged as ``self.avg``.
        """
        super(Model, self).__init__()

        # Log-frequency STFT computed on the fly
        stft_options = dict(sr=44100,
                            n_fft=n_fft,
                            freq_bins=freq_bins,
                            fmin=50,
                            fmax=6000,
                            freq_scale='log',
                            pad_mode='constant',
                            center=True)
        self.spec_layer = Spectrogram.STFT(**stft_options)
        self.n_bins = freq_bins

        # Convolution geometry
        freq_kernel, freq_stride = (128, 1), (2, 1)
        freq_channels, time_channels = 128, 256
        self.CNN_freq_kernel_size = freq_kernel
        self.CNN_freq_kernel_stride = freq_stride

        self.CNN_freq = nn.Conv2d(1, freq_channels,
                                  kernel_size=freq_kernel,
                                  stride=freq_stride)
        self.CNN_time = nn.Conv2d(freq_channels, time_channels,
                                  kernel_size=(1, regions),
                                  stride=(1, 1))

        # Output length of CNN_freq along its first kernel axis (no padding):
        # 1 + (bins - kernel) // stride
        self.region_v = 1 + (self.n_bins - freq_kernel[0]) // freq_stride[0]
        self.linear = torch.nn.Linear(time_channels * self.region_v, m, bias=False)

        self.avg = avg
    def __init__(self):
        """Create the CQT front end and a zero-initialised linear layer.

        Uses the module-level settings ``n_bins``, ``bins_per_octave``,
        ``regions`` and ``m``.
        """
        super(Model, self).__init__()

        # Constant-Q transform computed on the fly
        self.cqt_layer = Spectrogram.CQT2019(
            sr=44100,
            fmin=55,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            pad_mode='constant',
        )

        # Bias-free linear layer whose weights all start at zero
        in_features = n_bins * regions
        self.linear = torch.nn.Linear(in_features, m, bias=False)
        torch.nn.init.constant_(self.linear.weight, 0)
    def __init__(self):
        """Create the STFT front end and a zero-initialised linear layer.

        Uses the module-level settings ``n_fft``, ``freq_bins``, ``regions``
        and ``m``.
        """
        super(Model, self).__init__()

        # Log-frequency STFT computed on the fly
        self.STFT_layer = Spectrogram.STFT(
            sr=44100,
            n_fft=n_fft,
            freq_bins=freq_bins,
            fmin=50,
            fmax=6000,
            freq_scale='log',
            pad_mode='constant',
            center=True,
        )
        self.n_bins = freq_bins

        # Bias-free linear layer whose weights all start at zero
        in_features = self.n_bins * regions
        self.linear = torch.nn.Linear(in_features, m, bias=False)
        torch.nn.init.constant_(self.linear.weight, 0)
Ejemplo n.º 6
0
    def con_read(self, path):
        """Return the spectrogram of the first 20-second slice of a WAV file.

        The audio is band-pass filtered with ``self.lowcut``/``self.highcut``,
        samples ``1 .. 20 * 44100`` (the very first sample is skipped) are
        reduced to one channel by averaging columns 0 and 1, and the result is
        converted with ``Spectrogram.pretty_spectrogram``.

        Only the first slice is ever returned, so later slices are no longer
        converted.  NOTE(review): this assumes ``pretty_spectrogram`` has no
        side effects that callers rely on -- confirm.

        Args:
            path: Path of the WAV file to read.

        Returns:
            The spectrogram reshaped to ``[-1, 6880, 1024, 1]``.

        Raises:
            IndexError: If the audio does not contain one complete slice.
        """
        rate, song = read(path)
        data = np.asarray(
            Spectrogram.butter_bandpass_filter(song, self.lowcut, self.highcut, rate, order=1))
        second = 44100
        chunk_len = second * 20  # samples per slice

        print("Start slice...")
        wav_spectrogram = None
        # A complete slice needs samples 1 .. chunk_len, i.e. chunk_len + 1 samples.
        if len(data) > chunk_len:
            mono = np.mean(data[1:chunk_len + 1, :2], axis=1)
            wav_spectrogram = Spectrogram.pretty_spectrogram(mono.astype('float32'), fft_size=self.fft_size,
                                                             step_size=self.step_size, log=True, thresh=self.spec_thresh)
        print("Slice Finished....")

        if wav_spectrogram is None:
            # Same exception type the old `dataSet[0]` lookup produced.
            raise IndexError("audio in %r is shorter than one %d-sample slice" % (path, chunk_len))

        x = np.reshape(wav_spectrogram, [-1, 6880, 1024, 1])
        print(np.shape(x),"-----")
        return x
Ejemplo n.º 7
0
    def __init__(self, avg=.9998):
        """Create the Mel-spectrogram front end and a zero-initialised linear layer.

        Uses the module-level settings ``fs``, ``n_fft``, ``n_mels``, ``htk``,
        ``center``, ``regions`` and ``m``.

        Args:
            avg: Stored unchanged as ``self.avg``.
        """
        super(Model, self).__init__()

        # Mel spectrogram computed on the fly
        mel_options = dict(sr=fs, n_fft=n_fft, n_mels=n_mels, htk=htk,
                           fmin=50, fmax=6000, center=center)
        self.mel_layer = Spectrogram.MelSpectrogram(**mel_options)

        # Bias-free linear layer whose weights all start at zero
        in_features = n_mels * regions
        self.linear = torch.nn.Linear(in_features, m, bias=False)
        torch.nn.init.constant_(self.linear.weight, 0)

        self.avg = avg
    def __init__(self, avg=.9998):
        """Create the Mel-spectrogram front end, two convolutions and the linear head.

        Uses the module-level settings ``fs``, ``n_fft``, ``n_mels``, ``htk``,
        ``center``, ``regions`` and ``m``.

        Args:
            avg: Stored unchanged as ``self.avg``.
        """
        super(Model, self).__init__()

        # Mel spectrogram computed on the fly
        self.spec_layer = Spectrogram.MelSpectrogram(
            sr=fs,
            n_fft=n_fft,
            n_mels=n_mels,
            htk=htk,
            fmin=50,
            fmax=6000,
            center=center,
        )

        # Convolution geometry
        freq_kernel, freq_stride = (128, 1), (2, 1)
        freq_channels, time_channels = 128, 256
        self.CNN_freq_kernel_size = freq_kernel
        self.CNN_freq_kernel_stride = freq_stride

        self.CNN_freq = nn.Conv2d(1, freq_channels,
                                  kernel_size=freq_kernel,
                                  stride=freq_stride)
        self.CNN_time = nn.Conv2d(freq_channels, time_channels,
                                  kernel_size=(1, regions),
                                  stride=(1, 1))

        # Output length of CNN_freq along its first kernel axis (no padding):
        # 1 + (bins - kernel) // stride
        self.region_v = 1 + (n_mels - freq_kernel[0]) // freq_stride[0]
        self.linear = torch.nn.Linear(time_channels * self.region_v, m, bias=False)

        self.avg = avg
Ejemplo n.º 9
0
def run():
    """Mark commercial candidates per recording and print detection statistics.

    For every regular file in ``data_path`` the audio is decoded in pieces of
    ``stepsize`` seconds.  For each piece the spectral flatness of the
    magnitude spectrogram is thresholded with the values returned by
    ``get_threshold`` for the channel and hour encoded in the file name
    (``<channel>-h<hour>.<ext>``), the area between the remaining candidates
    is marked, compared against the ``.label`` annotation and written to a
    ``.betweenSilentFrames`` text file.  With ``show_figures`` enabled the
    intermediate curves are plotted.  Finally a TP/FP/TN/FN table with
    precision, recall, F1 and accuracy is printed, one row per processed
    piece plus a ``total`` row.

    This is Python 2 code (``raw_input``; binary-mode file written with
    ``str``).
    """
    fs = 22050
    ws = 512
    hs = 496
    data_path = "/home/user/Desktop/masterarbeit/data/" # where the mp3-files are
    annotation_path = data_path + "annotations/"
    output_path = data_path + "betweenSilentFrames/" # where the marked areas are written
    stepsize = 3600 # in seconds
    max_size = 3600 # in seconds (1h=3600s)
    y_axis_max_size = 0.8001 # maximal value of y axis (to get a uniform scale over all plots)
    y_axis_min_size = 0.0 # minimal value of y axis (to get a uniform scale over all plots)
    x_axis_max_size = 160040 # maximal value of x axis (to get a uniform scale over all plots)
    x_axis_min_size = 0 # minimal value of x axis (to get a uniform scale over all plots)
    len_per_hour = 160040 #fixed for plotting regions

    show_figures = 0

    # Divide as floats before rounding up (integer division would floor
    # first) and keep the subplot count an int.
    steps = int(np.ceil(max_size / float(stepsize)))

    summary = []

    tmp = [f for f in os.listdir(data_path) if os.path.isfile(os.path.join(data_path, f))]# take only non-folders
    files = natural_sort(tmp)

    for file_num, f in enumerate(files):
        abs_f = data_path + f
        print("############ " + f + " ###### " + str(file_num+1) + "/" + str(len(files)) + " ############")

        # read annotion file
        annotation_file = annotation_path + os.path.splitext(f)[0] + ".label"
        anno = read_commercial_annotations(annotation_file, commercial_id='2;')

        #make one figure per file
        if show_figures:
            plt.ion() # turns on interactive mode
            plt.figure() # create a new figure

        for (j, step) in enumerate(range(0, max_size, stepsize)):
            # convert file and make spectrogram
            signal = np.frombuffer(audio_converter.decode_to_memory(abs_f, sample_rate=fs, skip=step, maxlen=stepsize), dtype=np.float32)
            magspec = abs(Spectrogram.spectrogram(signal, ws=ws, hs=hs))
            print("magpsec shape: %s" % (magspec.shape,))
            magspec_without_last = magspec[:,0:magspec.shape[1]-1] # remove the last entry because it always contains 0

            # spectral flatness
            sflatness = spectral_flatness(magspec_without_last, axis=0)

            # extract local maxima
            # --> maybe smooth it before numpy.convolve() ... http://wiki.scipy.org/Cookbook/SignalSmooth
            #sflatness_smoothed = smooth(sflatness, window_len=11)
            #sflatness_local_maxima = extract_local_maxima(sflatness_smoothed, order=2)

            # mask out all non-candidates
            channel = f.split('-')[0]
            daytime = int(((f.split('-')[1]).split('h')[1]).split('.')[0])
            cutoff_th, fixed_pass_th, variable_pass_th_var, top_n_largest = get_threshold(channel, daytime)
            (variable_pass_th, sflatness_masked) = filter_below_threshold(sflatness,
                        cutoff_th, fixed_pass_th, variable_pass_th_variance=variable_pass_th_var, top_n_largest=top_n_largest)

            # mark the area between candidates (fill value 0.6) for scoring
            sflatness_area = masked_area_between_points(sflatness_masked, 3.0, 90.0, 0.6)

            # calculate statistics
            tp, fp, tn, fn = check_results_mask(sflatness_area, anno, decision_rate=0.2)
            summary.append( (os.path.splitext(f)[0], tp, fp, tn, fn) )

            # mark the area again with fill value 1.0 for the exported file
            sflatness_area = masked_area_between_points(sflatness_masked, 3.0, 90.0, 1.0)

            tmp_time = np.linspace(0.0, 3600.0, num=sflatness_area.shape[0], endpoint=False)

            # `with` closes the file even if a write fails.
            # NOTE(review): the name depends on the file only, so with more
            # than one step per file each step overwrites the previous one.
            out_name = output_path + f.replace(".mp3", ".betweenSilentFrames")
            with open(out_name, 'wb') as out_f:
                for i in range(sflatness_area.shape[0]):
                    out_f.write('{0:.8f}'.format(tmp_time[i]).rstrip('0').rstrip('.') + "," + repr(sflatness_area[i]) + "\n")

            # plot data
            if show_figures:
                # set ticks, ticklabels in seconds
                length = magspec.shape[1]
                length_sec = Spectrogram.frameidx2time(length, ws=ws, hs=hs, fs=fs)
                tickdist_seconds = 120 # one tick every n seconds
                tickdist_labels_in_minutes = 60 # for seconds use 1; for minutes 60
                numticks = length_sec/tickdist_seconds
                tick_per_dist = int(round(length / numticks))
                xtickrange = range(length)[::tick_per_dist]
                xticklabels = ["%d"%((round(Spectrogram.frameidx2time(i, ws=ws, hs=hs, fs=fs)+j*stepsize)/tickdist_labels_in_minutes)) for i in xtickrange]

                #first subplot
                plt.subplot(3,steps,j+1)
                plt.plot(sflatness, alpha=0.8, linewidth=1)
                plt.axhline(y=cutoff_th, linewidth=2.5, color='g')
                plt.axhline(y=fixed_pass_th, linewidth=2.5, color='r')
                plt.axhline(y=variable_pass_th, linewidth=2.5, color='k')
                print_annotations(plt, anno, len_per_hour, max_size, j*len(sflatness), (j+1)*len(sflatness), y_axis_max_size)
                plt.xticks(xtickrange, xticklabels, rotation=70, fontsize=11)
                plt.ylim(ymin=y_axis_min_size, ymax=y_axis_max_size) # set constant y scale
                plt.xlim(xmin=x_axis_min_size, xmax=x_axis_max_size)
                plt.yticks(np.arange(0.0, y_axis_max_size, 0.1))

                #second subplot
                plt.subplot(3,steps,steps+j+1)
                plt.plot(sflatness_masked, alpha=0.7, linewidth=2.5)
                print_annotations(plt, anno, len_per_hour, max_size, j*len(sflatness), (j+1)*len(sflatness), y_axis_max_size)
                plt.xticks(xtickrange, xticklabels, rotation=70, fontsize=11)
                plt.ylim(ymin=y_axis_min_size, ymax=y_axis_max_size) # set constant y scale
                plt.xlim(xmin=x_axis_min_size, xmax=x_axis_max_size)
                plt.yticks(np.arange(0.0, y_axis_max_size, 0.1))

                #third subplot; index follows the same pattern as the rows
                #above (equals the former fixed 3 when steps == 1)
                plt.subplot(3,steps,2*steps+j+1)
                plt.plot(sflatness_area, alpha=0.7, linewidth=2.5)
                print_annotations(plt, anno, len_per_hour, max_size, j*len(sflatness), (j+1)*len(sflatness), y_axis_max_size)
                plt.xticks(xtickrange, xticklabels, rotation=70, fontsize=11)
                plt.fill_between(range(sflatness_area.shape[0]), 0, sflatness_area, facecolor='blue', alpha=0.3)
                plt.ylim(ymin=y_axis_min_size, ymax=y_axis_max_size) # set constant y scale
                plt.xlim(xmin=x_axis_min_size, xmax=x_axis_max_size)
                plt.yticks(np.arange(0.0, y_axis_max_size, 0.1))

        if show_figures:
            plt.suptitle("File: " + os.path.splitext(f)[0])
            plt.show()

    # calculating statistics
    summary_names = (np.array(summary)[:,:1])
    # plain int instead of the abstract np.integer type
    summary_values = (np.array(summary)[:,1:]).astype(int)

    total_tp = summary_values.sum(0)[0]
    total_fp = summary_values.sum(0)[1]
    total_tn = summary_values.sum(0)[2]
    total_fn = summary_values.sum(0)[3]

    summary_names = np.vstack((summary_names, np.array([ 'total' ])))
    summary_values = np.vstack((summary_values, np.array([ total_tp, total_fp, total_tn, total_fn ])))

    print("NAME      \t TP \t FP \t TN \t FN \t PREC \t RECL \t F1 \t ACC ")
    for idx, entry in enumerate(summary_values):
        tp = entry[0]
        fp = entry[1]
        tn = entry[2]
        fn = entry[3]
        # the small constant keeps the divisions defined when a count is zero
        prec = tp / (tp+fp+0.0001)
        recall = tp / (tp+fn+0.0001)
        fmeasure = 2.0 * (prec * recall) / (prec + recall+0.0001)
        accuracy = (tp+tn) / (tp+tn+fp+fn+0.0001)
        print ( str(summary_names[idx]) + " \t " + str(tp) + " \t " + str(fp) + " \t " +
                    str(tn) + " \t " + str(fn) + " \t " + str(round(prec,3)) + " \t " +
                    str(round(recall,3)) + " \t " + str(round(fmeasure,3)) + " \t " + str(round(accuracy,3)) )

    if show_figures:
        raw_input('Press Enter to continue...')
Ejemplo n.º 10
0
    def train(self):
        """Run the adversarial training loop and checkpoint the model.

        Every iteration performs one encoder, one discriminator and one
        generator RMSProp step on a batch from ``self.ds_train`` paired with
        ``self.con_dataset``, writes the merged summaries and records the
        generator loss.  Every 100 steps the losses are printed; every 500
        steps a sample is inverted to audio, written to
        ``output_generated_NNNN.wav`` and the model is saved.  After the last
        epoch the model is saved once more and the recorded generator losses
        go to ``Train_Gen_cost.txt``.
        """
        disc_step = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_dis).minimize(
            self.D_loss, var_list=self.d_vars)
        gen_step = tf.train.RMSPropOptimizer(learning_rate=self.gen_learning_rate).minimize(
            self.g_loss, var_list=self.g_vars)
        enc_step = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_dis).minimize(
            self.encode_loss, var_list=self.e_vars)

        init_op = tf.global_variables_initializer()

        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True

        with tf.Session(config=session_config) as sess:
            sess.run(init_op)

            # Checkpoint written right after initialisation, before any training.
            self.saver.save(sess=sess, save_path=self.infogan_model_path)

            merged_summaries = tf.summary.merge_all()
            writer = tf.summary.FileWriter(self.log_dir, sess.graph)

            epoch = 0
            step = 0
            batch_idx = 0
            gen_losses = []
            print("Start Training...")
            while epoch <= self.max_epoch:

                last_batch = len(self.ds_train)/self.batch_size - 1
                print("Echo time: ", epoch)

                np.random.shuffle(self.ds_train)

                while batch_idx < len(self.ds_train)/self.batch_size:
                    step += 1
                    print("iterate time: ", step)

                    style_batch = np.reshape(
                        util.get_Next_Batch(self.ds_train, self.batch_size, last_batch, batch_idx),
                        [-1, 6880, 1024, 1])
                    noise = np.random.normal(size=[self.batch_size, self.sample_size])

                    # Feed dictionaries shared by the runs of this iteration
                    pair_feed = {self.content_music: self.con_dataset, self.images: style_batch}
                    noisy_feed = dict(pair_feed)
                    noisy_feed[self.z_p] = noise
                    gen_feed = dict(pair_feed)
                    gen_feed[self.global_train_step] = step

                    # Encoder, then discriminator, then generator update
                    sess.run(enc_step, feed_dict=pair_feed)
                    if step % 1 == 0:
                        sess.run(disc_step, feed_dict=pair_feed)
                        disc_loss = sess.run(self.D_loss, feed_dict=pair_feed)
                    sess.run(gen_step, feed_dict=gen_feed)

                    summary_record = sess.run(merged_summaries, feed_dict=noisy_feed)
                    writer.add_summary(summary_record, step)

                    batch_idx += 1

                    gen_loss = sess.run(self.g_loss, feed_dict=noisy_feed)
                    print(gen_loss, "-----G_lOSS")
                    gen_losses.append(gen_loss)

                    if step % 100 == 0:
                        disc_loss = sess.run(self.D_loss, feed_dict=pair_feed)
                        gen_loss = sess.run(self.g_loss, feed_dict=noisy_feed)
                        enc_loss = sess.run(self.encode_loss, feed_dict=noisy_feed)
                        gen_lr = sess.run(self.gen_learning_rate,
                                          feed_dict={self.global_train_step: step})
                        print("EPOCH %d step %d: D: loss = %.7f G: loss=%.7f Encode: loss=%.7f Gen_LearningRate:%.7f" % (epoch, step, disc_loss, gen_loss, enc_loss, gen_lr))

                    if np.mod(step, 500) == 0:
                        sample_spec = np.reshape(sess.run(self.x_p, feed_dict=pair_feed),
                                                 [6880, 1024])
                        sample_audio = Spectrogram.invert_pretty_spectrogram(
                            sample_spec,
                            fft_size=self.fft_size,
                            step_size=self.step_size,
                            log=True,
                            n_iter=10)
                        sample_audio = sample_audio * 10000000
                        write("output_generated_{:04d}.wav".format(step), 44100, sample_audio)

                        self.saver.save(sess, self.infogan_model_path)

                epoch += 1
                batch_idx = 0

            save_path = self.saver.save(sess, self.infogan_model_path)
            np.savetxt("Train_Gen_cost.txt", gen_losses)
            print("Model saved in file: %s" % save_path)
Ejemplo n.º 11
0
    def con_train(self):
        """Train the style encoder/generator pair and checkpoint the model.

        Every iteration performs one encoder and one generator Adam step on a
        batch from ``self.ds_train``, then evaluates and prints both losses
        and records the generator loss.  Every 500 steps a reconstruction is
        inverted to audio, written to ``output_Style_generated_NNNN.wav`` and
        the model is saved.  After the last epoch the model is saved once more
        and the recorded generator losses go to ``Style_train_Gen_cost.txt``.
        """
        enc_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(
            self.style_en_lost, var_list=self.sty_e_vars)
        gen_step = tf.train.AdamOptimizer(learning_rate=self.Style_gen_learning_rate).minimize(
            self.style_gen_lost, var_list=self.sty_g_vars)

        init_op = tf.global_variables_initializer()

        session_config = tf.ConfigProto()
        session_config.gpu_options.allow_growth = True
        with tf.Session(config=session_config) as sess:
            print("Start .....")
            sess.run(init_op)
            print("Finish Init....")

            # Checkpoint written right after initialisation, before any training.
            self.saver.save(sess=sess, save_path=self.infogan_model_path)

            epoch = 0
            step = 0
            batch_idx = 0
            gen_losses = []
            # One noise sample, drawn once and reused for every step
            noise = np.random.normal(size=[self.batch_size, self.sample_size])
            print("Start Training...")
            while epoch <= self.con_ecoh:
                print("echo: ", epoch)

                last_batch = len(self.ds_train) / self.batch_size - 1

                while batch_idx < len(self.ds_train) / self.batch_size:
                    step += 1
                    print("iterate time: ", step)
                    style_batch = np.reshape(
                        util.get_Next_Batch(self.ds_train, self.batch_size, last_batch, batch_idx),
                        [-1, 6880, 1024, 1])

                    # Feed dictionaries shared by the runs of this iteration
                    image_feed = {self.images: style_batch}
                    enc_feed = {self.images: style_batch, self.z_p: noise}
                    gen_feed = dict(enc_feed)
                    gen_feed[self.global_style_train_step] = step

                    sess.run(enc_step, feed_dict=enc_feed)
                    sess.run(gen_step, feed_dict=gen_feed)

                    enc_loss = sess.run(self.style_en_lost, feed_dict=image_feed)
                    gen_loss = sess.run(self.style_gen_lost, feed_dict=image_feed)

                    print("Encoder_loss:", enc_loss)
                    print("Generator_loss:", gen_loss)
                    gen_losses.append(gen_loss)
                    batch_idx += 1

                    if np.mod(step, 500) == 0:
                        sample_spec, _ = sess.run([self.con_x_tilde, self.sty_encoder],
                                                  feed_dict=image_feed)
                        print(np.shape(sample_spec))
                        sample_spec = np.reshape(sample_spec, [6880, 1024])
                        sample_audio = Spectrogram.invert_pretty_spectrogram(
                            sample_spec,
                            fft_size=self.fft_size,
                            step_size=self.step_size,
                            log=True,
                            n_iter=10)
                        sample_audio = sample_audio * 10000000
                        write("output_Style_generated_{:04d}.wav".format(step), 44100, sample_audio)
                        self.saver.save(sess, self.infogan_model_path)
                epoch += 1
                batch_idx = 0
            save_path = self.saver.save(sess, self.infogan_model_path)
            np.savetxt("Style_train_Gen_cost.txt", gen_losses)
            print("Model saved in file: %s" % save_path)
Ejemplo n.º 12
0
def run():
    """Plot loudness against annotated commercial blocks, then exit.

    For each name in ``filenames`` the second column of
    ``LoudnessTotal/<name>.LoudnessTotal`` is loaded, averaged over clips of
    ``clip_size_seconds`` and drawn together with the raw curve.  The regions
    returned by ``read_commercial_annotations`` are shaded grey.  The figure
    is saved as a PNG, shown, and the interpreter is then terminated, so this
    function never returns.

    The former spectrogram/energy experiment that followed the exit call was
    unreachable and referenced undefined names (``start``, ``stop``); it has
    been removed.
    """
    import sys  # local import: only needed for the final sys.exit()

    data_path = "/home/user/Desktop/masterarbeit/data/"  # where the mp3-files are
    #filenames = [ 'RTL-h5', 'RTL-h17', 'Sat1-h16' ]
    filenames = ['RTL-h5', 'RTL-h17', 'ZDF-h17']

    total_len_sec = 3600.0
    total_len_min = total_len_sec / 60.0

    clip_size_seconds = 30.0
    yMax = 70.0

    plt.rcParams.update({'font.size': 14})

    f, axarr = plt.subplots(len(filenames), sharex=True)

    for file_num, filename in enumerate(filenames):
        loudness = np.loadtxt(data_path + "LoudnessTotal/" + filename +
                              ".LoudnessTotal",
                              delimiter=',')[:, 1]
        annotation_file = data_path + "annotations_block/" + filename + ".label"

        # calculate mean
        clip_size = int(clip_size_seconds *
                        (loudness.shape[0] / total_len_sec))
        print(str(clip_size_seconds) + " seconds = " + str(clip_size) + " frames")
        # Drop the tail so the length is a multiple of clip_size; `//` keeps
        # the slice bound an integer under true division as well.
        usable = (loudness.shape[0] // clip_size) * clip_size
        loudness_shortend = loudness[:usable]
        loudness_reshaped = loudness_shortend.reshape(-1, clip_size)
        loudness_clip = np.mean(loudness_reshaped, axis=1)

        axis = axarr[file_num]
        axis.plot(np.linspace(0, total_len_min, loudness_clip.shape[0]),
                  loudness_clip,
                  color='blue',
                  linewidth=2.0)
        axis.plot(np.linspace(0, total_len_min, loudness.shape[0]),
                  loudness,
                  color='blue',
                  linewidth=0.2,
                  alpha=0.10)
        axis.set_ylim(0, yMax)
        axis.set_title("Hour: " + ntpath.basename(filename))
        axis.set_ylabel("Loudness")
        # fill between the clip means and the overall mean loudness
        axis.fill_between(
            np.linspace(0, total_len_min, loudness_clip.shape[0]),
            loudness_clip, np.mean(loudness))

        # read annotion file
        anno = read_commercial_annotations(annotation_file, commercial_id='2;')
        # shade every annotated region; a[0] and a[0] + a[1] are rescaled
        # from seconds to minutes
        for a in anno:
            a_start = a[0] * total_len_min / total_len_sec
            a_end = (a[0] + a[1]) * total_len_min / total_len_sec
            axis.fill_between([a_start, a_end],
                              0,
                              yMax,
                              facecolor='gray',
                              alpha=0.4)

    f.subplots_adjust(hspace=0.25)
    plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
    plt.xlabel("Time [min]")
    f.set_size_inches(13, 11)

    plt.savefig('/home/user/Desktop/loudness_during_commercial_block.png',
                bbox_inches='tight')
    plt.show()

    # Terminate the program here, as the original exit() call did.
    sys.exit()
def _write_between_silent_frames(out_path, rms_area, duration):
    """Write ``rms_area`` to ``out_path`` as ``time,value`` lines.

    One line is written per element of ``rms_area``; the time stamps are
    spread evenly over ``[0, duration)`` and printed with up to eight
    decimals (trailing zeros and a trailing dot are stripped).
    """
    timestamps = np.linspace(0.0, float(duration), num=rms_area.shape[0],
                             endpoint=False)
    with open(out_path, 'wb') as out_f:
        for timestamp, value in zip(timestamps, rms_area):
            out_f.write('{0:.8f}'.format(timestamp).rstrip('0').rstrip('.') +
                        "," + repr(value) + "\n")


def _write_summary_table(result_file, summary):
    """Write one TP/FP/TN/FN/precision/recall/F1/accuracy row per entry.

    ``summary`` is a list of ``(name, tp, fp, tn, fn)`` tuples; a final
    ``total`` row holding the column sums is appended.
    """
    table = np.array(summary)
    names = np.vstack((table[:, :1], np.array(['total'])))
    counts = table[:, 1:].astype(int)
    counts = np.vstack((counts, counts.sum(0)))

    result_file.write("NAME      \t TP \t FP \t TN \t FN \t PREC \t RECL \t F1 \t ACC\n")
    for name, (tp, fp, tn, fn) in zip(names, counts):
        # the small constants keep the ratios defined when a denominator is 0
        prec = tp / (tp + fp + 0.0001)
        recall = tp / (tp + fn + 0.0001)
        fmeasure = 2.0 * (prec * recall) / (prec + recall + 0.0001)
        accuracy = (tp + tn) / (tp + tn + fp + fn + 0.0001)
        result_file.write(str(name) + " \t " + str(tp) + " \t " + str(fp) + " \t " +
                          str(tn) + " \t " + str(fn) + " \t " + str(round(prec, 3)) + " \t " +
                          str(round(recall, 3)) + " \t " + str(round(fmeasure, 3)) + " \t " +
                          str(round(accuracy, 3)) + "\n")


def run():
    """Evaluate the RMS based "between silent frames" detector on all files.

    For each value of ``max_comm_length`` in a fixed list, every file in
    ``data_path`` is decoded step by step, turned into a magnitude
    spectrogram and reduced to its RMS energy (normalised by its maximum).
    Candidate frames are selected with ``filter_above_threshold`` using the
    per channel/daytime thresholds from ``get_threshold``, the area between
    candidates is marked with ``masked_area_between_points`` and the result
    is compared with the annotations by ``check_results_mask``.

    Side effects:
      * writes ``<file>.betweenSilentFrames`` below
        ``working_dir + 'data/betweenSilentFrames/'`` for every file,
      * appends one result table per ``max_comm_length`` to ``result.txt``,
      * when ``show_figures`` is set, saves and shows a three panel figure
        per file and waits for Enter after each ``max_comm_length``.
    """
    fs = 22050
    ws = 512
    hs = 256
    working_dir = "/home/user/repos/silentframes/"
    data_path = working_dir + "data/"  # where the mp3-files are
    annotation_path = data_path + "annotations/"
    stepsize = 3600  # in seconds
    max_size = 3600  # in seconds (1h=3600s)
    elements_per_hour = int((fs / (hs * 1.0)) * max_size)
    # fixed axis limits so that all plots share a uniform scale
    y_axis_max_size = 0.7001
    y_axis_max_size_rms = 0.00151
    y_axis_min_size = 0.0
    x_axis_max_size = elements_per_hour
    x_axis_min_size = 0
    min_comm_length = 3.0
    result_file_name = "result.txt"

    show_figures = 0

    def plot_step(step_idx, rms, rms_masked, rms_area, thresholds, anno):
        """Draw the three panel figure for one processed step and return it."""
        cutoff_th, fixed_pass_th, variable_pass_th = thresholds
        first_frame = step_idx * len(rms)
        last_frame = (step_idx + 1) * len(rms)
        fig, axarr = plt.subplots(3, sharex=True)

        # first subplot: RMS together with the three thresholds
        axarr[0].plot(rms, alpha=0.8, linewidth=1)
        axarr[0].axhline(y=cutoff_th, linewidth=2.5, color='g')
        axarr[0].axhline(y=fixed_pass_th, linewidth=2.5, color='r')
        axarr[0].axhline(y=variable_pass_th, linewidth=2.5, color='k')
        print_annotations(axarr[0], anno, elements_per_hour, max_size,
                          first_frame, last_frame, y_axis_max_size_rms)
        axarr[0].set_ylim(ymin=y_axis_min_size, ymax=y_axis_max_size_rms)
        # x limits use the x-axis constants (the y maximum was passed here)
        axarr[0].set_xlim(xmin=x_axis_min_size, xmax=x_axis_max_size)
        axarr[0].set_yticks(np.arange(0.0, y_axis_max_size_rms, 0.0005))
        axarr[0].set_title("Before peak picking (step: 1c)")
        axarr[0].set_ylabel("RMS")

        # second subplot: selected candidates
        axarr[1].plot(reset_value(rms_masked, 0.0, 1.0), alpha=0.7, linewidth=2.5)
        print_annotations(axarr[1], anno, elements_per_hour, max_size,
                          first_frame, last_frame, y_axis_max_size_rms)
        axarr[1].set_ylim(ymin=y_axis_min_size, ymax=y_axis_max_size_rms)
        axarr[1].set_xlim(xmin=x_axis_min_size, xmax=x_axis_max_size)
        axarr[1].set_yticks(np.arange(0.0, y_axis_max_size_rms, 0.0005))
        axarr[1].set_title("After peak picking (step: 1c)")
        axarr[1].set_ylabel("RMS")

        # third subplot: PROTOTYPE, only meaningful for whole hours
        axarr[2].plot(rms_area, alpha=0.7, linewidth=2.5)
        print_annotations(axarr[2], anno, elements_per_hour, max_size,
                          first_frame, last_frame, y_axis_max_size)
        axarr[2].fill_between(range(rms_area.shape[0]), 0, rms_area,
                              facecolor='blue', alpha=0.3)
        axarr[2].set_ylim(ymin=y_axis_min_size, ymax=y_axis_max_size)
        axarr[2].set_xlim(xmin=x_axis_min_size, xmax=x_axis_max_size)
        axarr[2].set_yticks([y_axis_min_size, y_axis_max_size])
        axarr[2].set_yticklabels(['false', 'true'])
        axarr[2].set_title("At the end (step: 2)")
        axarr[2].set_ylabel("Commercial")
        return fig

    # take only non-folders
    tmp = [f for f in os.listdir(data_path) if os.path.isfile(os.path.join(data_path, f))]
    files = natural_sort(tmp)

    with open(result_file_name, 'w') as result_file:
        for max_comm_length in [4.0, 15.0, 30.0, 45.0, 60.0, 75.0, 90.0,
                                120.0, 200.0, 600.0, 3600.0]:
            print("#######################################################################################")
            print("### max_comm_length: " + str(max_comm_length) + " ###########################################################")
            print("#######################################################################################")

            summary = []

            for file_num, f in enumerate(files):
                abs_f = data_path + f
                print("############ " + f + " ###### " + str(file_num + 1) + "/" + str(len(files)) + " ############")

                # read annotation file
                annotation_file = annotation_path + os.path.splitext(f)[0] + ".label"
                anno = read_commercial_annotations(annotation_file, commercial_id='2;')

                # file names look like "<channel>-h<hour>.<ext>"
                channel = f.split('-')[0]
                daytime = int(((f.split('-')[1]).split('h')[1]).split('.')[0])

                if show_figures:
                    plt.rcParams.update({'font.size': 14})

                fig = None
                for (j, step) in enumerate(range(0, max_size, stepsize)):
                    # convert file and make spectrogram
                    signal = np.frombuffer(
                        audio_converter.decode_to_memory(abs_f, sample_rate=fs, skip=step, maxlen=stepsize),
                        dtype=np.float32)
                    magspec_without_last = abs(Spectrogram.spectrogram(signal, ws=ws, hs=hs))[:, :-1]
                    signal = None  # unlink so the garbage collector can free the memory
                    print("magpsec shape w/o last element (always contains 0): %s" % (magspec_without_last.shape,))

                    # RMS, normalised by its maximum (computed once)
                    rms_raw = rms_energy(magspec_without_last)
                    rms = rms_raw / max(rms_raw)
                    magspec_without_last = None  # unlink, see above

                    # mask out all non-candidates
                    cutoff_th, fixed_pass_th, variable_pass_th_var, top_n_largest = get_threshold(channel, daytime)
                    (variable_pass_th, rms_masked) = filter_above_threshold(
                        rms, cutoff_th, fixed_pass_th,
                        variable_pass_th_variance=variable_pass_th_var,
                        top_n_smallest=top_n_largest)
                    print("variable_pass_th: \t\t" + str(variable_pass_th))

                    # set area between candidates to 1.0
                    rms_area = masked_area_between_points(elements_per_hour, rms_masked,
                                                          min_comm_length, max_comm_length, 1.0)

                    # calculate statistics
                    tp, fp, tn, fn = check_results_mask(elements_per_hour, rms_area, anno, decision_rate=0.2)
                    summary.append((os.path.splitext(f)[0], tp, fp, tn, fn))

                    # NOTE: the file is rewritten on every step; with
                    # stepsize == max_size there is exactly one step per file.
                    _write_between_silent_frames(
                        working_dir + 'data/betweenSilentFrames/' + f.replace(".mp3", ".betweenSilentFrames"),
                        rms_area, max_size)

                    # plot data (the figure gets its own name so that the
                    # file name ``f`` is not overwritten)
                    if show_figures:
                        fig = plot_step(j, rms, rms_masked, rms_area,
                                        (cutoff_th, fixed_pass_th, variable_pass_th), anno)

                if show_figures and fig is not None:
                    fig.subplots_adjust(hspace=0.25)
                    plt.setp([a.get_xticklabels() for a in fig.axes[:-1]], visible=False)
                    plt.xticks(np.linspace(0, elements_per_hour, num=7), [0, 10, 20, 30, 40, 50, 60])
                    plt.xlabel("Time [min]")
                    fig.set_size_inches(13, 11)

                    plt.savefig('/home/user/Desktop/bsf_classification_process.png', bbox_inches='tight')
                    plt.show()

            # calculating statistics
            _write_summary_table(result_file, summary)

            # parameters of this run (thresholds are those of the last file)
            result_file.write("################################################\n")
            result_file.write("cutoff_th: \t\t" + str(cutoff_th) + "\n")
            result_file.write("fixed_pass_th: \t\t" + str(fixed_pass_th) + "\n")
            result_file.write("variable_pass_th_var: \t" + str(variable_pass_th_var) + "\n")
            result_file.write("top_n_largest: \t\t" + str(top_n_largest) + "\n")
            result_file.write("min_comm_length: \t" + str(min_comm_length) + "\n")
            result_file.write("max_comm_length: \t" + str(max_comm_length) + "\n")
            result_file.write("window_size: \t\t" + str(ws) + "\n")
            result_file.write("hop_size: \t\t" + str(hs) + "\n")
            result_file.write("\n\n\n")
            result_file.flush()

            if show_figures:
                raw_input('Press Enter to continue...')  # Python 2 builtin
Ejemplo n.º 14
0
def _open_fresh(path):
    """Delete ``path`` if it already exists and open it in binary append mode."""
    if os.path.exists(path):
        os.remove(path)
    return open(path, 'ab')


def _write_time_series(out_f, start, stop, values):
    """Write ``values`` to ``out_f`` as ``time,value`` lines.

    The time stamps are spread evenly over ``[start, stop)``, one per value,
    and printed with up to eight decimals (trailing zeros and a trailing dot
    are stripped).
    """
    timestamps = np.linspace(float(start), float(stop), num=len(values),
                             endpoint=False)
    for timestamp, value in zip(timestamps, values):
        out_f.write('{0:.8f}'.format(timestamp).rstrip('0').rstrip('.') +
                    "," + repr(value) + "\n")


def _silent_run_lengths(silent_frames, start_frame, stop_frame):
    """Return the lengths of all runs of silent frames inside a window.

    A frame counts as silent when its entry in ``silent_frames`` equals 1.
    A run that reaches the end of the window is included. ``[0]`` is
    returned when the window contains no silent frame, so that median and
    maximum are always defined.
    """
    run_lengths = []
    current = 0
    for idx in range(start_frame, stop_frame):
        if silent_frames[idx] == 1:
            current += 1
        elif current:
            run_lengths.append(current)
            current = 0
    if current:  # the window ended in the middle of a silent run
        run_lengths.append(current)
    return run_lengths or [0]


def run():
    """Compute clip-level silent-frame features for every file in ``data_path``.

    Each file is decoded in steps of ``stepsize`` seconds (to limit memory
    use). A frame is marked silent when its RMS energy is below
    ``silence_threshold``. For a window of ``clip_windowsize`` frames centred
    on every ``clip_stepsize``-th frame three features are computed: the
    number of silent frames, and the median and maximum length of the runs of
    consecutive silent frames.

    Side effects: creates one output directory per feature (plus ``sf/`` for
    the raw silent-frame flags) below ``data_path`` and writes one
    ``time,value`` file per input file into each of them; an existing output
    file is replaced.
    """
    fs = 22050  # sampling rate
    ws = 1024  # window size
    hs = 512  # hop size
    data_path = "/home/user/Desktop/masterarbeit/data/"  # where the mp3-files are
    silence_threshold = 1.0  # classified as silence if rms < silence_threshold

    clip_windowsize = 352
    clip_stepsize = 9

    # add configuration to the feature names (used as directory and suffix)
    config_suffix = "_" + str(int(silence_threshold * 10)) + "e1_" + str(
        clip_stepsize) + "_" + str(clip_windowsize)
    feature_names = [
        base_name + config_suffix
        for base_name in ("sfCount", "sfConsecutive_Median", "sfConsecutive_Max")
    ]
    for feature_name in feature_names:
        print(feature_name)

    # make sure the output directories exist and remember their paths
    feature_descriptions = []
    for feature_name in feature_names:
        feature_output_dir = data_path + feature_name + "/"
        if not os.path.exists(feature_output_dir):
            os.makedirs(feature_output_dir)
        feature_descriptions.append([feature_name, feature_output_dir])
    # only silent frames (just as reference)
    feature_output_dir_sf = data_path + "sf/"
    if not os.path.exists(feature_output_dir_sf):
        os.makedirs(feature_output_dir_sf)

    # process hours in steps (otherwise too much memory is used!)
    stepsize = 900  # in seconds
    max_size = 3600  # in seconds (1h=3600s)

    # window extent around the clip centre; an odd window gets the extra
    # frame on the right-hand side
    half_window = clip_windowsize // 2
    odd_extra = clip_windowsize % 2

    tmp = [
        f for f in os.listdir(data_path)
        if os.path.isfile(os.path.join(data_path, f))
    ]  # take only non-folders
    files = natural_sort(tmp)

    for file_num, f in enumerate(files):
        abs_f = data_path + f
        print("############ " + f + " ###### " + str(file_num + 1) + "/" + str(
            len(files)) + " ##############################")

        out_fs = []
        out_f_sf = None
        try:
            # replace existing output files
            for feature_name, feature_output_dir in feature_descriptions:
                out_fs.append(_open_fresh(
                    feature_output_dir + f.replace(".mp3", "." + feature_name)))
            # only silent frames (just as reference)
            out_f_sf = _open_fresh(feature_output_dir_sf + f.replace(".mp3", ".sf"))

            for step in range(0, max_size, stepsize):
                # convert file and make spectrogram
                signal = np.frombuffer(audio_converter.decode_to_memory(
                    abs_f, sample_rate=fs, skip=step, maxlen=stepsize),
                                       dtype=np.float32)
                magspec = abs(Spectrogram.spectrogram(signal, ws=ws, hs=hs))
                print("magspec shape: %s" % (magspec.shape, ))

                # calculate (frame-level) feature: silence frames
                rms = rms_energy(magspec)
                silent_frames = np.where(rms < silence_threshold, 1.0, 0.0)
                _write_time_series(out_f_sf, step, step + stepsize, silent_frames)

                # clip-level features; np.empty needs an integer shape
                num_frames = silent_frames.shape[0]
                num_clips = int(np.ceil(num_frames * 1.0 / clip_stepsize))
                clip_feature = np.empty((len(feature_names), num_clips))
                for i in range(num_clips):
                    center = i * clip_stepsize
                    # clamp the window to prevent index out of bounds
                    start_frame = max(0, center - half_window)
                    stop_frame = min(num_frames, center + half_window + odd_extra)

                    run_lengths = _silent_run_lengths(silent_frames, start_frame, stop_frame)
                    clip_feature[0][i] = silent_frames[start_frame:stop_frame].sum()
                    clip_feature[1][i] = np.median(run_lengths)
                    clip_feature[2][i] = np.max(run_lengths)

                # write every feature row to its own output file
                for out_f, feature_row in zip(out_fs, clip_feature):
                    _write_time_series(out_f, step, step + stepsize, feature_row)
        finally:
            for out_f in out_fs:
                out_f.close()
            if out_f_sf is not None:
                out_f_sf.close()
Ejemplo n.º 15
0
def run():
    """Plot spectrogram, spectral flatness and arithmetic mean of an audio file.

    The file named by the command line arguments is decoded (optionally only
    the part between ``args.start`` and ``args.stop``), its magnitude
    spectrogram is saved to ``/tmp/magspec.npz`` and six stacked panels are
    shown: magnitude spectrogram, log magnitude spectrogram, spectral
    flatness (geometric mean / arithmetic mean per frame), spectral flatness
    with values below 0.3 set to 0, arithmetic mean, and arithmetic mean with
    values above 0.001 set to 1.
    """
    args = parse_arguments()
    if not args.start:
        args.start = 0

    decode_kwargs = {'sample_rate': args.fs}
    if args.stop:
        decode_kwargs.update(skip=args.start, maxlen=args.stop - args.start)
    elif args.start:
        # honour the start offset also when no stop is given; otherwise the
        # whole file is decoded while the title still claims args.start
        decode_kwargs['skip'] = args.start
    signal = np.frombuffer(
        audio_converter.decode_to_memory(args.filename, **decode_kwargs),
        dtype=np.float32)

    magspec = abs(Spectrogram.spectrogram(signal, ws=args.ws, hs=args.hs))
    print("magpsec shape: %s" % (magspec.shape,))

    # save magspec to /tmp/
    np.savez('/tmp/magspec.npz', magspec)

    # set ticks, ticklabels in seconds
    length = magspec.shape[1]
    length_sec = Spectrogram.frameidx2time(length, ws=args.ws, hs=args.hs, fs=args.fs)
    tickdist_seconds = 60  # one tick every n seconds
    tickdist_labels_in_minutes = 60  # for seconds use 1; for minutes 60
    numticks = length_sec / tickdist_seconds
    tick_per_dist = int(round(length / numticks))
    xtickrange = range(length)[::tick_per_dist]
    xticklabels = [
        "%d" % (round(Spectrogram.frameidx2time(i, ws=args.ws, hs=args.hs, fs=args.fs)) /
                tickdist_labels_in_minutes) for i in xtickrange
    ]

    plt.subplot(611)
    plt.imshow(magspec, aspect='auto', origin='lower', interpolation='nearest')
    plt.xticks(xtickrange, xticklabels, rotation=70, fontsize=8)
    plt.title("magnitude spectrogram")

    plt.subplot(612)
    plt.imshow(np.log(magspec + 1), aspect='auto', origin='lower', interpolation='nearest')
    plt.xticks(xtickrange, xticklabels, rotation=70, fontsize=8)
    plt.title("log magnitude spectrogram")

    # x limits of the spectrogram panel, reused so that the curves line up
    spectrogram_xscale = plt.xlim()

    def plot_curve(position, values, title):
        """Plot ``values`` in panel ``position`` with the shared x axis."""
        plt.subplot(position)
        plt.plot(values)
        plt.xlim(spectrogram_xscale)
        plt.xticks(xtickrange, xticklabels, rotation=70, fontsize=8)
        plt.title(title)

    # spectral flatness
    magspec_without_last = magspec[:, :-1]  # the last entry always contains 0
    print("calculating geomethric mean")
    gmean = stats.gmean(magspec_without_last, axis=0)
    print("calculating arithmetic mean")
    amean = np.mean(magspec_without_last, axis=0)
    spectral_flatness = gmean / amean

    plot_curve(613, spectral_flatness, "spectral flatness")

    spectral_flatness[spectral_flatness < 0.3] = 0.0
    plot_curve(614, spectral_flatness, "spectral flatness > 0.3")

    plot_curve(615, amean, "arithmetic mean")

    amean[amean > 0.001] = 1.0
    plot_curve(616, amean, "arithmetic mean < 0.001")

    # divide by 60.0 so that an integer start is not truncated to whole
    # minutes before rounding to one decimal
    plt.suptitle("File: " + ntpath.basename(args.filename) + "  |  Time: " +
                 str(round(args.start / 60.0, 1)) + " - " +
                 str(round((args.start + length_sec) / 60.0, 1)) + " [min]")
    plt.show()