Example #1
    def analyze(cls, events: List[CorpusEvent],
                metadata: Metadata) -> List[CorpusEvent]:
        if not FeatureUtils.is_valid_audio(events, metadata):
            raise FeatureError(
                f"Feature '{cls.__name__}' does not support content of "
                f"type {metadata.content_type.__class__.__name__}")

        metadata: AudioMetadata = typing.cast(AudioMetadata, metadata)
        # TODO: Pass rather than hard-code
        yin_frames: np.ndarray = librosa.yin(metadata.foreground_data,
                                             fmin=50,
                                             fmax=4186,
                                             sr=metadata.sr,
                                             frame_length=2048,
                                             hop_length=metadata.hop_length)
        yin_midipitches: np.ndarray = np.round(
            12 * np.log2(yin_frames / 8.175798915643707))
        for event in events:
            onset_frame: int = librosa.time_to_frames(
                event.onset, sr=metadata.sr, hop_length=metadata.hop_length)
            end_frame: int = librosa.time_to_frames(
                event.onset + event.duration,
                sr=metadata.sr,
                hop_length=metadata.hop_length)
            hist, _ = np.histogram(yin_midipitches[onset_frame:end_frame],
                                   bins=128,
                                   range=(0, 128))
            pitch: int = int(np.argmax(hist))
            event.set_feature(cls(value=pitch))

        return events
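A note on the hard-coded constant above: 8.175798915643707 Hz is the frequency of MIDI note 0, so the expression 12 * log2(f / 8.175798915643707) is simply the Hz-to-MIDI conversion. A minimal sketch (assuming only numpy and librosa are available) confirming that it matches librosa's own converter:

import numpy as np
import librosa

f0 = np.array([261.626, 440.0, 880.0])  # C4, A4, A5 in Hz
midi_manual = np.round(12 * np.log2(f0 / 8.175798915643707))
midi_librosa = np.round(librosa.hz_to_midi(f0))
assert np.array_equal(midi_manual, midi_librosa)  # 60, 69, 81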
Example #2
def main():
    # Track beats using time series input

    song = 'song.mp3'
    y, sr = librosa.load(song)

    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)


    onset_env = librosa.onset.onset_strength(y, sr=sr,
                                             aggregate=np.median)
    tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env,
                                           sr=sr)

    onset_max = np.argmax(onset_env)

    starting_beat = find_nearest(beats, onset_max)
    print(starting_beat)
    beat_range = (beats[starting_beat], beats[starting_beat + 8])

    # start_time = librosa.frames_to_time(beat_range[0], sr=sr)
    # end_time = librosa.frames_to_time(beat_range[1], sr=sr)
    # print(start_time, end_time)
    # mel_spectrogram(mp3=song, start_time=start_time, end_time=end_time)

    play_by_seconds(song, 108, 120)
    print(librosa.time_to_frames(108, sr=sr))
    print(librosa.time_to_frames(120, sr=sr))
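The snippet above calls find_nearest, which is not defined in this excerpt. A plausible one-liner, assuming it should return the index of the beat frame closest to the strongest onset:

import numpy as np

def find_nearest(array, value):
    # hypothetical helper (not part of the excerpt): index of the element in
    # `array` that is closest to `value`
    return int(np.abs(np.asarray(array) - value).argmin())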
Example #3
def extract_vocal(path):
	#print(path)
	y, sr = librosa.load(path)
	#print(y)
	S_full, phase = librosa.magphase(librosa.stft(y))
	#print(S_full)
	idx = slice(*librosa.time_to_frames([30, 35], sr=sr))
	S_filter = librosa.decompose.nn_filter(S_full,
                                       aggregate=np.median,
                                       metric='cosine',
                                       width=int(librosa.time_to_frames(2, sr=sr)))
	S_filter = np.minimum(S_full, S_filter)

	margin_i, margin_v = 2, 10
	power = 2

	mask_i = librosa.util.softmask(S_filter,
	                               margin_i * (S_full - S_filter),
	                               power=power)

	mask_v = librosa.util.softmask(S_full - S_filter,
	                               margin_v * S_filter,
	                               power=power)
	S_foreground = mask_v * S_full
	S_background = mask_i * S_full
	D_foreground = S_foreground * phase
	y_foreground = librosa.istft(D_foreground)

	D_background = S_background * phase
	y_background = librosa.istft(D_background)
	#print(y_foreground)
	
	maxv = np.iinfo(np.int16).max
	scipy.io.wavfile.write("foreground.wav", sr, (y_foreground* maxv).astype(np.int16))
Example #4
    def vocal_removal(self, y, sr):
        """
            https://librosa.github.io/librosa_gallery/auto_examples/plot_vocal_separation.html
        """
        idx = slice(*librosa.time_to_frames([0, 10], sr=sr))
        S_full, phase = librosa.magphase(librosa.stft(y))
        S_filter = librosa.decompose.nn_filter(S_full,
                                               aggregate=np.median,
                                               metric='cosine',
                                               width=int(librosa.time_to_frames(2, sr=sr)))
        S_filter = np.minimum(S_full, S_filter)
        margin_i, margin_v = 2, 10
        power = 2
        mask_i = librosa.util.softmask(S_filter,
                                       margin_i * (S_full - S_filter),
                                       power=power)

        mask_v = librosa.util.softmask(S_full - S_filter,
                                       margin_v * S_filter,
                                       power=power)

        S_foreground = mask_v * S_full
        S_background = mask_i * S_full

        # Convert back to audio
        audio_minus_vocals = librosa.core.istft(S_background[:, idx])

        return audio_minus_vocals
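For reference, the 0-10 second slice above maps times to frame indices through the usual relation frame = floor(t * sr / hop_length). A quick check using librosa's default hop length of 512 (the sample rate below is only illustrative):

import numpy as np
import librosa

sr = 22050
frames = librosa.time_to_frames([0, 10], sr=sr)  # default hop_length=512
assert np.array_equal(frames, np.floor(np.array([0, 10]) * sr / 512).astype(int))
print(frames)  # [0 430]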
Example #5
def initialize_activations_with_onset_templates(signal, mid, pitches, tol_on, tol_off, max_time=None):
    _n_features, n_samples = signal.S.shape
    H_init = numpy.zeros((len(pitches) * 2, n_samples))
    onset_components_offset = len(pitches)
    current_time = 0
    on_since = {}
    for msg in mid:
        if msg.is_meta:
            continue
        current_time += msg.time
        if msg.type == 'note_on':
            on_since[msg.note] = current_time
            component = pitch_to_component(msg.note, pitches)
            start_frame, end_frame = librosa.time_to_frames([current_time - tol_on, current_time + tol_on], sr=signal.sr, hop_length=signal.fft_hop_length)
            start_frame = max(start_frame, 0)
            end_frame = min(end_frame, n_samples - 1)
            H_init[onset_components_offset + component, start_frame:end_frame] = 1
        elif msg.type == 'note_off':
            note_on_since = on_since.pop(msg.note)
            component = pitch_to_component(msg.note, pitches)
            start_frame, end_frame = librosa.time_to_frames([note_on_since - tol_on, current_time + tol_off], sr=signal.sr, hop_length=signal.fft_hop_length)
            start_frame = max(start_frame, 0)
            end_frame = min(end_frame, n_samples - 1)
            H_init[component, start_frame:end_frame] = 1
        if max_time is not None and current_time > max_time:
            break

    return H_init
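pitch_to_component is not defined in this excerpt; one plausible definition, assuming pitches is the ordered list of MIDI note numbers being modelled, so that each note maps to its row in the activation matrix:

def pitch_to_component(note, pitches):
    # hypothetical helper (not part of the excerpt): row index of a MIDI note
    # in the activation matrix
    return list(pitches).index(note)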
Example #6
def load_siamese_data(csvfilepath, num_train_artist):
    '''Builds training data from a CSV of per-track vocal segments.
    Args:
        csvfilepath: CSV with 'artist_index', 'track_id' and 'vocal_segments' columns.
        num_train_artist: number of training artists (used for logging).
    Return:
        track_list, y_list and a dict mapping artist -> track -> vocal segment start frames.
    '''
        
    artist_tracks_segments = {} # dict of artist to tracks to vocal segments 
    with open(csvfilepath, 'r') as csv_file : 
        csv_reader = csv.DictReader(csv_file)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                line_count += 1
            curr_artist = int(row['artist_index'])
            path_to_feat = config.id7d_to_path[config.idmsd_to_id7d[row['track_id']]].replace('.mp3','.npy')
            start_frames = librosa.time_to_frames(ast.literal_eval(row['vocal_segments']), sr=config.sr, hop_length=config.hop_length, n_fft=config.n_fft)
            if start_frames[0] < 0:
                start_frames[0] = 0
            
            try:
                artist_tracks_segments[curr_artist][path_to_feat] = start_frames
            except KeyError:
                artist_tracks_segments[curr_artist] = {}
                artist_tracks_segments[curr_artist][path_to_feat] = start_frames



    track_list = [] 
    y_list = [] 

    with open(csvfilepath, 'r') as csv_file : 
        singer_list = np.arange(num_train_artist)
        print ('num_singers:', len(singer_list))
        csv_reader = csv.DictReader(csv_file)

        line_count = 0 
        for row in csv_reader:
            if line_count == 0 :
                line_count +=1
            curr_artist_id = int(row['artist_index'])
            path_to_feat = config.id7d_to_path[config.idmsd_to_id7d[row['track_id']]].replace('.mp3', '.npy')

            start_frames = librosa.time_to_frames(ast.literal_eval(row['vocal_segments']), sr=config.sr, hop_length=config.hop_length, n_fft=config.n_fft)
            
            # train with all vocal segments 
            for i in range(len(start_frames)):
                
                if start_frames[i] < 0:
                    start_frames[i] = 0

                track_list.append((path_to_feat, start_frames[i]))
                
                y_list.append(curr_artist_id)


    track_list = np.array(track_list)
    y_list = np.array(y_list)

    
    return track_list, y_list, artist_tracks_segments 
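The try/except used above to create the nested dict entry can also be written with dict.setdefault; the following one-liner (a sketch, same behaviour) would replace the whole try/except block inside the loop:

artist_tracks_segments.setdefault(curr_artist, {})[path_to_feat] = start_frames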
Example #7
def invert_test(file_name, input_length=3, input_overlap=False):

    sr = config.SR
    # convert seconds to frames
    n_frames = librosa.time_to_frames(input_length, sr=config.SR, n_fft=config.FFT_SIZE, hop_length=config.FFT_HOP) + 1
    if not input_overlap:
        overlap = n_frames
    else:
        overlap = librosa.time_to_frames(input_overlap, sr=config.SR, n_fft=config.FFT_SIZE, hop_length=config.FFT_HOP)

    # batching data
    print('Computing spectrogram (w/ librosa) and tags (w/ tensorflow)..', end =" ")
    batch, spectrogram = batch_data(file_name, n_frames, overlap)


    audio_rep = (np.power(10.0, spectrogram) - 1.0) / 10000.0  # undo the log10(10000 * x + 1) compression
    audio_rep = audio_rep.T

    audio_out = librosa.feature.inverse.mel_to_audio(M=audio_rep,
                                               sr=sr,
                                               hop_length=config.FFT_HOP,
                                               n_fft=config.FFT_SIZE)
    sd.play(audio_out, sr)
    #librosa.save(audio_file_out, sr=config.SR)
    #audio_rep = audio_rep.astype(np.float16)
    #audio_rep = np.log10(10000 * audio_rep + 1)
    return audio_out
def get_phrase_intervals(file_id, y,sr, r, w, w_p_ratio,  period_threshold, peak_window, tempo):
    #param:
    #   y = waveform
    #   sr = sample rate
    #   r = radius/resolution of diagonal cut
    #   w = checkerboard window size in seconds,  w<=r
    #   w_p_ratio = ratio used for peak picking, unknown
    #   period_threshold = threshold for filtering by period
    #   fpb = frames per beat for phrase search
    
    #TEST CONSTANTS
    if (w>r):
        sys.exit('Window Resolution Mismatch')
    sample_length = librosa.samples_to_time([len(y)],sr)[0] #s
    w_f = librosa.time_to_frames([w],hop_length=256)[0]
    f  = extract_features(y)
    #frames per beat
    fpb = librosa.time_to_frames([1/(tempo/60)],hop_length=256)[0]
    #LOAD OR CREATE S-MATRIX & NOVELTY VECTOR
    s_matrix = init_smatrix(file_id,f,r,  sample_length)
    novelty = init_novelty_vector(file_id, w, w_f,  sample_length, s_matrix)
    
    w_p = w_f/w_p_ratio
    peaks = librosa.util.peak_pick(novelty, w_p, w_p, w_p, w_p, peak_window, w_p)
    
    return filter_by_period(peaks, period_threshold, fpb)
def load_label(audio_spec, audio_label_file):
    ''' Process and load label for the given audio
    Args:
        audio_spec : melgram of the audio. Shape=(n_bins, total_frames) ex.(80,14911)
        audio_label_file : path to the label file ex. './jamendo/jamendo_lab/02 - The Louis...lab' 
    Return :
        lab : list of ground truth annotations per frame. Shape=(total_frames, )
    '''
    with open(audio_label_file, 'r') as f:
        total_frames = audio_spec.shape[1]
        label = np.zeros((total_frames, ))

        for line in f:
            l = line.strip('\n').split(' ')
            start = librosa.time_to_frames(float(l[0]),
                                           sr=SR,
                                           hop_length=HOP_LENGTH)
            end = librosa.time_to_frames(float(l[1]),
                                         sr=SR,
                                         hop_length=HOP_LENGTH)

            is_vocal = 1 if l[2] == 'sing' or l[2] == '1' else 0
            # label[start:end] = int(is_vocal)
            label[start[0]:end[0]] = int(is_vocal)  # gwm 23/1/2019

    return label
Example #10
def changeTempo(current_tempo, onset_times, desired_tempo):

    hi_hat, _ = librosa.load('./Thrown/test_sounds/sfx/closed_hi_hat.wav')
    drum_hit, _ = librosa.load('./Thrown/test_sounds/sfx/drum_hit.wav')

    desired_tempo_timing = 60 / desired_tempo
    current_tempo_timing = 60 / current_tempo

    scale_factor = desired_tempo_timing / current_tempo_timing

    onset_frames1 = librosa.time_to_frames(onset_times, sr=sr)

    clicks1 = librosa.clicks(frames=onset_frames1,
                             sr=sr,
                             click_duration=.01,
                             length=len(t[2][4]),
                             click=hi_hat)

    sf.write('./Thrown/test_sounds/thrown_w_beat.wav', clicks1 + t[1][2], sr)

    scaled_onset_times = []
    for i in range(0, len(onset_times)):
        scaled_onset_times.append(onset_times[i] * scale_factor)

    scaled_beat_frames = []
    for i in range(0, len(t[1][8])):
        scaled_beat_frames.append(t[1][8][i] * scale_factor)

    scaled_beat_times = []
    for i in range(0, len(t[1][6])):
        scaled_beat_times.append(t[1][6][i] * scale_factor)

    onset_frames2 = librosa.time_to_frames(scaled_onset_times, sr=sr)

    print(onset_frames2)

    scaled_sample = librosa.effects.time_stretch(t[1][2], 1 / scale_factor)

    clicks2 = librosa.clicks(frames=onset_frames2,
                             sr=sr,
                             click_duration=.01,
                             length=len(scaled_sample),
                             click=hi_hat)
    clicks3 = librosa.clicks(frames=scaled_beat_frames,
                             sr=sr,
                             click_duration=.01,
                             length=len(scaled_sample),
                             click=drum_hit)

    librosa.output.write_wav('./Thrown/test_sounds/thrown_altered_beat.wav',
                             clicks2 + clicks3 + scaled_sample, sr)
    sf.write('./Thrown/test_sounds/altered_beat.wav', clicks2 + clicks3, sr)

    plt.figure(figsize=(14, 5))
    plt.title("Removing Percussive Sample, Scalability of Tempo Events")
    plt.vlines(scaled_onset_times, -1, 1, color='c', linestyles='dashed')
    plt.vlines(scaled_beat_times, -1, 1, color='y', linestyles='dashed')
    plt.ylim(-1, 1)
    plt.xlim(0, 5)
Example #11
 def set_windowing(self, width, stride, *option):  # called in train.py
     """Setup windowing process (argument values in seconds)."""
     self.width = librosa.time_to_frames(width, self.samp_rate)
     model = str(option[0])
     # bulbul requires minimum resolution of 106
     if model == "bulbul = pepeiao.models:bulbul" and self.width < 106:
         self.width = 106
     self.stride = librosa.time_to_frames(stride, self.samp_rate)
     _LOGGER.info('Set width to %d columns', self.width)
     _LOGGER.info('Set stride to %d columns', self.stride)
Example #12
def notes_matrix_to_annotation(notes, nframes):
    binary_annotation_matrix = np.zeros((48, nframes))
    full_annotation_matrix = np.zeros((48, nframes, 6))
    for note in notes:
        starting_frame = librosa.time_to_frames(note[0])
        duration_frames = librosa.time_to_frames(note[1])
        ending_frame = starting_frame + duration_frames
        note_value, string = int(note[2]) - 35, int(note[3])
        binary_annotation_matrix[note_value, starting_frame:ending_frame] = 1
        full_annotation_matrix[note_value, starting_frame:ending_frame,
                               string] = 1
    return binary_annotation_matrix, full_annotation_matrix
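notes_matrix_to_annotation relies on librosa's defaults (sr=22050, hop_length=512), i.e. roughly 43 frames per second. A quick illustration of what the calls above return with those defaults:

import librosa

print(librosa.time_to_frames(1.0))  # 43 with the default sr=22050, hop_length=512
print(librosa.time_to_frames(2.5))  # 107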
def patch_label(start,
                end,
                time_windows,
                annotate,
                binary=False,
                threshold=None):
    """Labeling a patch given annotation
    Args:
        start(float): start time of a patch (in second)
        end(float): end time of a patch (in second)
        time_windows(float): time windows for average (in milliseconds)
        annotation(DataFrame): annotation dataframe for a specific song
        songname(string): song_name(string)
    Returns:
        label(pd.DataFrame):
            column: instrument
            index: label
            eg:
                        S01   S02
                label    1   0.93
    This code has been largely borrowed from https://github.com/glennq/instrument-recognition/blob/master/data/patch_label.py
    """
    #Transfer time to frame
    annotation = copy.copy(annotate)
    start_frame = librosa.time_to_frames(start, sr=1 / 0.0464,
                                         hop_length=1) + 1
    end_frame = librosa.time_to_frames(end, sr=1 / 0.0464, hop_length=1) - 1
    moving_frame = librosa.time_to_frames(
        time_windows / 1000, sr=1 / 0.0464,
        hop_length=1) - librosa.time_to_frames(0, sr=1 / 0.0464, hop_length=1)
    #Pick annotation
    annotation = annotation.reset_index(drop=True)
    annotation.index += 1
    time_annot = annotation.loc[start_frame:end_frame].drop('time', 1)
    #Using maximum value of average in moving windows as label
    music_ins = time_annot.columns
    label = pd.DataFrame(index=[
        'label',
    ], columns=music_ins)
    for j in range(len(time_annot.columns)):
        label_temp = max([
            sum(list(time_annot.ix[:, j])[i:i + moving_frame + 1]) /
            float(moving_frame + 1)
            for i in range(len(time_annot.ix[:, j]) - moving_frame)
        ])
        #binary output
        if binary and threshold:
            if label_temp >= threshold:
                label_temp = float(1)
            else:
                label_temp = float(0)
        label.ix[:, j] = label_temp
    return label
Example #14
    def __init__(self,
                 model='MTT_musicnn',
                 input_length=3,
                 input_overlap=False):

        # select model
        if 'MTT' in model:
            self.labels = config.MTT_LABELS
        elif 'MSD' in model:
            self.labels = config.MSD_LABELS
        else:
            raise RuntimeError("Bad model name")
        self.num_classes = len(self.labels)

        if 'vgg' in model and input_length != 3:
            raise ValueError(
                'Set input_length=3, the VGG models cannot handle different input lengths.'
            )

        # convert seconds to frames
        self.n_frames = librosa.time_to_frames(input_length,
                                               sr=config.SR,
                                               n_fft=config.FFT_SIZE,
                                               hop_length=config.FFT_HOP) + 1
        if not input_overlap:
            self.overlap = self.n_frames
        else:
            self.overlap = librosa.time_to_frames(input_overlap,
                                                  sr=config.SR,
                                                  n_fft=config.FFT_SIZE,
                                                  hop_length=config.FFT_HOP)

        # tensorflow: define the model
        tf.compat.v1.reset_default_graph()
        with tf.name_scope('model'):
            self.x = tf.compat.v1.placeholder(
                tf.float32, [None, self.n_frames, config.N_MELS])
            self.train = tf.compat.v1.placeholder(tf.bool)
            if 'vgg' in model:
                y, _, _, _, _, _ = models.define_model(self.x, self.train,
                                                       model, self.num_classes)
            else:
                y, _, _, _, _, _, _, _, _ = models.define_model(
                    self.x, self.train, model, self.num_classes)
            self.pred = tf.nn.sigmoid(y)

        config_tf = tf.compat.v1.ConfigProto()
        config_tf.gpu_options.allow_growth = True
        self.sess = tf.compat.v1.Session(config=config_tf)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()
        saver.restore(self.sess, os.path.dirname(__file__) + '/' + model + '/')
def generate_annotation_matrices(annotation, frames):
    '''
    This function will return a one hot encoded matrix of notes being played
    The annotation matrix will start w/ note 25 at index 0 and go up to note 100
    The highest and lowest values that I saw in the annotations seemed to be arounnd 29-96 so give a little leeway
    :return:
    '''
    annotation_matrix = np.zeros((84, frames))
    for note in annotation:
        starting_frame = time_to_frames(note[1])
        duration_frames = time_to_frames(note[2] - note[1])
        note_value = note[0]
        annotation_matrix[note_value - 25][starting_frame:starting_frame +
                                           duration_frames] = 1
    return annotation_matrix.T
Example #16
    def read_ann_beats(self):
        """Reads the annotated beats if available.

        Returns
        -------
        times: np.array
            Times of annotated beats in seconds.
        frames: np.array
            Frame indices of annotated beats.
        """
        times, frames = (None, None)

        # Read annotations if they exist in correct folder
        if os.path.isfile(self.file_struct.ref_file):
            try:
                jam = jams.load(self.file_struct.ref_file)
            except TypeError:
                logging.warning(
                    "Can't read JAMS file %s. Maybe it's not "
                    "compatible with current JAMS version?" %
                    self.file_struct.ref_file)
                return times, frames
            beat_annot = jam.search(namespace="beat.*")

            # If beat annotations exist, get times and frames
            if len(beat_annot) > 0:
                beats_inters, _ = beat_annot[0].to_interval_values()
                times = beats_inters[:, 0]
                frames = librosa.time_to_frames(times, sr=self.sr,
                                                hop_length=self.hop_length)
        return times, frames
Example #17
def getIntervalFromJAMS(path):
  j = jams.load(path)
  res = []
  for i in zip(list(j.annotations[0].data.time), list(j.annotations[0].data.time + j.annotations[0].data.duration), j.annotations[0].data.value):
    v = [[librosa.time_to_frames([i[0].total_seconds(), i[1].total_seconds()]), i[2].encode("ascii")]]
    res += v
  return res
Example #18
    def split_vocal(self, y):
        S_full, phase = librosa.magphase(librosa.stft(y))

        # To avoid being biased by local continuity, we constrain similar frames to be
        # separated by at least 1.2 seconds.
        S_filter = librosa.decompose.nn_filter(
            S_full,
            aggregate=np.median,
            metric='cosine',
            width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))

        S_filter = np.minimum(S_full, S_filter)

        margin_v = 10
        power = 2

        mask_v = librosa.util.softmask(S_full - S_filter,
                                       margin_v * S_filter,
                                       power=power)

        S_foreground = mask_v * S_full

        foreground = griffinlim(S_foreground)

        return foreground
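griffinlim is not defined in this excerpt (it also appears in a later example); it presumably reconstructs a waveform from the magnitude spectrogram. A minimal stand-in, assuming librosa >= 0.7 is available:

import librosa

def griffinlim(S):
    # hypothetical wrapper (not part of the excerpt): Griffin-Lim phase
    # reconstruction from a magnitude spectrogram
    return librosa.griffinlim(S)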
Example #19
 def get_frame(self) -> int:
     if Beat.INDEX_VALUE == 'samples':
         return librosa.samples_to_frames(self.index, hop_length=util.HOP_LENGTH)
     elif Beat.INDEX_VALUE == 'time':
         return librosa.time_to_frames(self.index, sr=util.SAMPLE_RATE, hop_length=util.HOP_LENGTH)
     else:
         raise NotImplementedError("Only samples and time are supported")
Example #20
    def __call__(self, sample):
        y, sr = sample['wav']
        y, sr = librosa.resample(y, sr, 22050), 22050
        ref_times, ref_freqs = sample['gt']

        fft_f = librosa.fft_frequencies(sr, self.n_fft)
        f_interp = interp1d(librosa.time_to_frames(ref_times, sr,
                                                   self.hop_length,
                                                   self.n_fft),
                            ref_freqs,
                            fill_value=0.0,
                            bounds_error=False)
        fft = librosa.stft(y, self.n_fft, self.hop_length)
        n_fft = np.zeros(fft.shape, dtype=fft.dtype)
        for frame in range(fft.shape[1]):
            freq = f_interp(frame)
            for i in range(self.n_harmonics):
                idx = np.argmin(np.abs(fft_f - freq * (i + 1)))
                if np.abs(fft_f[idx] - freq *
                          (i + 1)) < fft_f[idx] * (2**(1 / 24) - 1):
                    n_fft[idx, frame] = fft[idx, frame]
                else:
                    fft[:, frame] = 0

        y = librosa.istft(n_fft, self.hop_length)
        y = y / max(y)
        if self.new_key is None:
            sample['wav'] = y, sr
        else:
            sample[self.new_key] = y, sr
        return sample
Example #21
    def read_ann_beats(self):
        """Reads the annotated beats if available.

        Returns
        -------
        times: np.array
            Times of annotated beats in seconds.
        frames: np.array
            Frame indices of annotated beats.
        """
        times, frames = (None, None)

        # Read annotations if they exist in correct folder
        if os.path.isfile(self.file_struct.ref_file):
            jam = jams.load(self.file_struct.ref_file)
            beat_annot = jam.search(namespace="beat.*")

            # If beat annotations exist, get times and frames
            if len(beat_annot) > 0:
                beats_inters, _ = beat_annot[0].data.to_interval_values()
                times = beats_inters[:, 0]
                frames = librosa.time_to_frames(times,
                                                sr=self.sr,
                                                hop_length=self.hop_length)
        return times, frames
Example #22
def make_sampler(max_samples, duration, pump, seed):

    n_frames = librosa.time_to_frames(duration,
                                      sr=pump['mel'].sr,
                                      hop_length=pump['mel'].hop_length)[0]

    return pump.sampler(max_samples, n_frames, random_state=seed)
Example #23
    def process_config(self, config):
        ''' preprocess config '''
        data_conf = config['data']
        class_vocab = data_conf['task']['classes']['vocab']
        assert len(class_vocab) == data_conf['task']['classes']['num']

        # add reverse_vocab, positive_id
        reverse_vocab = {val: key for key, val in class_vocab.items()}
        data_conf['task']['classes']['reverse_vocab'] = reverse_vocab

        # binary class
        pos_id = config['solver']['metrics']['pos_label']
        data_conf['task']['classes']['positive_id'] = pos_id
        data_conf['task']['classes']['positive'] = reverse_vocab[pos_id]

        # add feature shape, without batch_size
        if data_conf['task']['suffix'] == '.npy':
            input_channels = 3 if data_conf['task']['audio'][
                'add_delta_deltas'] else 1
            nframe = librosa.time_to_frames(
                data_conf['task']['audio']['clip_size'],
                sr=data_conf['task']['audio']['sr'],
                hop_length=data_conf['task']['audio']['winstep'] *
                data_conf['task']['audio']['sr'])
            feature_shape = [
                nframe, data_conf['task']['audio']['feature_size'],
                input_channels
            ]
        else:
            feature_shape = [
                data_conf['task']['audio']['sr'] *
                data_conf['task']['audio']['clip_size']
            ]
        data_conf['task']['audio']['feature_shape'] = feature_shape
        return config
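As a sanity check of the frame count computed above: since hop_length is winstep * sr, the result is essentially clip_size / winstep frames. With illustrative values (not taken from any real config):

import librosa

nframe = librosa.time_to_frames(3, sr=16000, hop_length=int(0.010 * 16000))
print(nframe)  # 300 frames for a 3 s clip at a 10 ms step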
Example #24
    def read_ann_beats(self):
        """Reads the annotated beats if available.

        Returns
        -------
        times: np.array
            Times of annotated beats in seconds.
        frames: np.array
            Frame indices of annotated beats.
        """
        times, frames = (None, None)

        # Read annotations if they exist in correct folder
        if os.path.isfile(self.file_struct.ref_file):
            try:
                jam = jams.load(self.file_struct.ref_file)
            except TypeError:
                logging.warning("Can't read JAMS file %s. Maybe it's not "
                                "compatible with current JAMS version?" %
                                self.file_struct.ref_file)
                return times, frames
            beat_annot = jam.search(namespace="beat.*")

            # If beat annotations exist, get times and frames
            if len(beat_annot) > 0:
                beats_inters, _ = beat_annot[0].data.to_interval_values()
                times = beats_inters[:, 0]
                frames = librosa.time_to_frames(times,
                                                sr=self.sr,
                                                hop_length=self.hop_length)
        return times, frames
def process_one_file(midi_filename, skip=True):
    '''
    Load in midi data, compute features, and write out file

    :parameters:
        - midi_filename : str
            Full path to midi file
        - skip : bool
            Whether to skip creating the file when the npz already exists
    '''
    # npz files go in the 'npz' dir instead of 'mid'
    output_filename = mid_to_npz_path(midi_filename)
    # Skip files already created
    if skip and os.path.exists(output_filename):
        return
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
        midi_audio = alignment_utils.fast_fluidsynth(m, MIDI_FS)
        midi_gram = librosa.cqt(
            midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP,
            fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES)
        midi_beats, midi_tempo = alignment_utils.midi_beat_track(m)
        midi_sync_gram = alignment_utils.post_process_cqt(
            midi_gram, librosa.time_to_frames(
                midi_beats, sr=MIDI_FS, hop_length=MIDI_HOP))
        np.savez_compressed(
            output_filename, sync_gram=midi_sync_gram,
            beats=midi_beats, bpm=midi_tempo)
    except Exception as e:
        print "Error processing {}: {}".format(midi_filename, e)
Example #26
    def get_vocal(self):
        # And compute the spectrogram magnitude and phase
        S_full, phase = librosa.magphase(librosa.stft(self.wave))
        S_filter = librosa.decompose.nn_filter(
            S_full,
            aggregate=numpy.median,
            metric='cosine',
            width=int(librosa.time_to_frames(2, sr=self.sample_rate)))
        S_filter = numpy.minimum(S_full, S_filter)
        margin_i, margin_v = 2, 10
        power = 2

        mask_i = librosa.util.softmask(S_filter,
                                       margin_i * (S_full - S_filter),
                                       power=power)

        mask_v = librosa.util.softmask(S_full - S_filter,
                                       margin_v * S_filter,
                                       power=power)
        S_foreground = mask_v * S_full
        comps, acts = librosa.decompose.decompose(S_foreground,
                                                  n_components=16,
                                                  sort=True)  # decomposition
        if numpy.count_nonzero(comps) < 10000:
            return 0  # no vocal
        else:
            return 1  # vocal
Example #28
def split_vocal_to_wav(filename, fp_foreground, fp_background=None):
    y, sr = librosa.load(filename, sr=16000)

    S_full, phase = librosa.magphase(librosa.stft(y))

    S_filter = librosa.decompose.nn_filter(S_full,
                                           aggregate=np.median,
                                           metric='cosine',
                                           width=int(
                                               librosa.time_to_frames(2,
                                                                      sr=sr)))

    S_filter = np.minimum(S_full, S_filter)

    margin_i, margin_v = 2, 10
    power = 2

    mask_i = librosa.util.softmask(S_filter,
                                   margin_i * (S_full - S_filter),
                                   power=power)

    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)

    S_foreground = mask_v * S_full
    S_background = mask_i * S_full

    foreground = griffinlim(S_foreground)
    fp_foreground += filename.split('/')[-1]
    sf.write(fp_foreground, foreground, sr, 'PCM_16')

    if fp_background is not None:
        background = griffinlim(S_background)
        sf.write(fp_background, background, sr, 'PCM_16')
Example #29
def compute_all_features(file_struct,
                         sonify_beats=False,
                         overwrite=False,
                         out_beats="out_beats.wav"):
    """Computes all the features for a specific audio file and its respective
        human annotations. It creates an audio file with the sonified estimated
        beats if needed.

    Parameters
    ----------
    file_struct: FileStruct
        Object containing all the set of file paths of the input file.
    sonify_beats: bool
        Whether to sonify the beats.
    overwrite: bool
        Whether to overwrite previous features JSON file.
    out_beats: str
        Path to the new file containing the sonified beats.
    """

    # Output file
    out_file = file_struct.features_file

    if os.path.isfile(out_file) and not overwrite:
        return  # Do nothing, file already exist and we are not overwriting it

    # Compute the features for the given audio file
    features = compute_features_for_audio_file(file_struct.audio_file)

    # Save output as audio file
    if sonify_beats:
        logging.info("Sonifying beats...")
        fs = 44100
        audio, sr = librosa.load(file_struct.audio_file, sr=fs)
        msaf.utils.sonify_clicks(audio,
                                 features["beats"],
                                 out_beats,
                                 fs,
                                 offset=0.0)

    # Read annotations if they exist in path/references_dir/file.jams
    if os.path.isfile(file_struct.ref_file):
        jam = jams.load(file_struct.ref_file)
        beat_annot = jam.search(namespace="beat.*")

        # If beat annotations exist, compute also annotated beatsync features
        if len(beat_annot) > 0:
            logging.info("Reading beat annotations from JAMS")
            annot_beats_inters, _ = beat_annot[0].data.to_interval_values()
            annot_beats_times = annot_beats_inters[:, 0]
            annot_beats_idx = librosa.time_to_frames(
                annot_beats_times,
                sr=msaf.Anal.sample_rate,
                hop_length=msaf.Anal.hop_size)
            features["ann_mfcc"], features["ann_hpcp"], \
                features["ann_tonnetz"], features["ann_cqt"] = \
                compute_beat_sync_features(features, annot_beats_idx)

    # Save output as json file
    save_features(out_file, features)
Example #30
 def _analyze_audio(cls, events: List[AudioCorpusEvent], metadata: AudioMetadata):
     # shape: (12, k) where k is measured in frames
     chroma = librosa.feature.chroma_stft(y=metadata.background_data, sr=metadata.sr, hop_length=metadata.hop_length,
                                          n_chroma=12, n_fft=8192)  # TODO: Pass as parameters
     for event in events:
         onset_frame: int = librosa.time_to_frames(event.onset, sr=metadata.sr, hop_length=metadata.hop_length)
         event.set_feature(cls(chroma[:, onset_frame]))
Example #31
    def __init__(self, audiofilepath):
        self.y, self.sr = librosa.load(audiofilepath, sr=global_sr)
        self.yshape = self.y.shape

        self.original = self.y.copy()
        self.newest_y = self.y.copy()
        self.duration = librosa.get_duration(y=self.y, sr=global_sr)
        self.timestamps = np.linspace(0, self.duration, int(global_sr * self.duration))
        self.frames_index = librosa.time_to_frames(self.timestamps, sr=global_sr,
                                                   hop_length=global_hop_len)
        self.frames_num = max(self.frames_index)
        self.action_box = None
        self.audio_frames = []
        self.mels = []
        self.best_reward = -1
        # self.step = 0
        self.mark = 1
        self.locked = []
        self.best_mismatches = None
        self.shft, self.stch = 0, 1
        self.output_list = []
        self.log_for_sox = []
        self.frame_log = []
        self.action_history = [-1 for _ in range(self.frames_num)]
        self.epoch_history = {}
        self.shifted = 0
        self.stretched = 1
        self.pitched = 0
        self.vlines_widths = 0.75
        self.vlines_colors = "Blue"
        self.backward_limit = self.duration * 0.1
        self.forward_limit = self.duration * 0.1
Example #32
def test():
    base_dir = "../data/train/"

    y, sr = librosa.load(base_dir + "00ad36516.flac")

    chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)

    # For display purposes, let's zoom in on a 15-second chunk from the middle of the song
    idx = tuple([slice(None), slice(*list(librosa.time_to_frames([45, 60])))])

    # And for comparison, we'll show the CQT matrix as well.
    C = np.abs(
        librosa.cqt(y=y, sr=sr, bins_per_octave=12 * 3, n_bins=7 * 12 * 3))

    fig, ax = plt.subplots(nrows=2, sharex=True)
    img1 = librosa.display.specshow(librosa.amplitude_to_db(C,
                                                            ref=np.max)[idx],
                                    y_axis='cqt_note',
                                    x_axis='time',
                                    bins_per_octave=12 * 3,
                                    ax=ax[0])
    fig.colorbar(img1, ax=[ax[0]], format="%+2.f dB")
    ax[0].label_outer()

    img2 = librosa.display.specshow(chroma_orig[idx],
                                    y_axis='chroma',
                                    x_axis='time',
                                    ax=ax[1])
    fig.colorbar(img2, ax=[ax[1]])
    ax[1].set(ylabel='Default chroma')

    plt.show()
    def process_config(self, config):
        data_conf = config['data']

        feature_shape = data_conf['task']['audio'].get('feature_shape', None)

        if not feature_shape:
            # add feature shape, without batch_size
            if data_conf['task']['suffix'] == '.npy':
                input_channels = 3 if data_conf['task']['audio'][
                    'add_delta_deltas'] else 1
                nframe = librosa.time_to_frames(
                    data_conf['task']['audio']['clip_size'],
                    sr=data_conf['task']['audio']['sr'],
                    hop_length=data_conf['task']['audio']['winstep'] *
                    data_conf['task']['audio']['sr'])
                feature_shape = [
                    nframe, data_conf['task']['audio']['feature_size'],
                    input_channels
                ]
            else:
                feature_shape = [
                    data_conf['task']['audio']['sr'] *
                    data_conf['task']['audio']['clip_size']
                ]
            data_conf['task']['audio']['feature_shape'] = feature_shape
        logging.info(f"FEATURE SHAPE: {feature_shape}")
        return config
 def updateBackground (self, currenttime):
     framenumber = librosa.time_to_frames([currenttime/1000])
     
     currentmaximumvolume = max(self.transposed [framenumber[0]]) - self.minimumVol
     
     if currentmaximumvolume < self.averageVol:
         percentmaxvolume = int(50*(currentmaximumvolume/self.averageVol))
     else:
         percentmaxvolume = int(50 + (50*((currentmaximumvolume-self.averageVol)/(self.maximumVol-self.averageVol))))
     
     print("Vol    :", currentmaximumvolume)
     print("Ave Vol:", self.averageVol)
     print("Per Vol:", percentmaxvolume, "%")
     
     if percentmaxvolume <= 50:
         rgb = int(255*percentmaxvolume/50)
         rgbtuple = (rgb, 255, 0)
         #self._canvas.configure (background = '#%02x%02x%02x' % (rgb, 255, 0))
         #print("RGB    :", "(" + str(rgb) + ", " + str(255) + ", " + "0)")
     else:
         rgb = int(255*(percentmaxvolume-50)/50)
         rgbtuple = (255, 255-rgb, 0)
         #  self._canvas.configure (background = '#%02x%02x%02x' % (255, 255-rgb, 0))
         #print("RGB    :", "(" + str(255) + ", " + str(255-rgb) + ", " + "0)")
     
     closebeat = self.getClosestBeats (currenttime)
     percentage = self.scaleBetween (self.getPercentCloseToBeat(currenttime, closebeat), 0.6, 1)
     
     #rgbtuple = (255,0,0)
     rgbtuple = (int (rgbtuple[0]*percentage), int (rgbtuple [1] * percentage), int (rgbtuple[2]*percentage))
     
     self._canvas.configure (background = '#%02x%02x%02x' % (rgbtuple))
def load_data_segment(picklefile, artist_list):
    train_data = []
    artist_names = []

    f = pickle.load(open(picklefile, 'rb'))
    artist_to_id = {}
    for u in range(len(artist_list)):
        artist_to_id[artist_list[u]] = u

    for artist_id, tracks in f.items():
        for track_id, svd in tracks.items():
            center_segs = svd[len(svd) // 2 - 10:len(svd) // 2 + 10]
            # center_segs = svd[len(svd)//2 - 5 : len(svd)//2 + 5]
            start_frames = librosa.time_to_frames(center_segs,
                                                  sr=22050,
                                                  hop_length=512,
                                                  n_fft=1024)
            for i in range(len(start_frames)):
                start_frame = start_frames[i]
                if start_frame < 0:
                    start_frame = 0
                # train_data.append((artist_to_id[artist_id], track_id + '.npy', start_frame))
                ### augmentation
                train_data.append(
                    (artist_to_id[artist_id], track_id + '.npy', start_frame))
                # train_data.append((artist_to_id[artist_id], track_id + '.npy', start_frame, 1 ))
                artist_names.append(artist_id)
                artist_names.append(artist_id)

    return train_data, artist_names
Example #36
def annotation_to_mat(ref_intervals, ref_labels, beats, sr=SAMPLE_RATE, hop_length=HOP_SIZE):

    truth_dict = {}
    for label, interval in zip(ref_labels, ref_intervals):
        frames = librosa.time_to_frames(interval, sr=sr, hop_length=hop_length)
        if label in truth_dict:
            truth_dict[label]['times'].append(interval)
            truth_dict[label]['frames'].append(frames)
        else:
            truth_dict[label] = {'times': [interval], 'frames': [frames]}

    for k in truth_dict:
        truth_dict[k]['beats'] = []
        for interval in truth_dict[k]['frames']:
            beat_interval = np.argmin(np.abs([interval[0]-beats, interval[1]-beats]), axis=1)
            truth_dict[k]['beats'].append(beat_interval)

    truth_mat_beat = np.zeros((len(beats)-1, len(beats)-1))
    for k in truth_dict:
        for i_b in truth_dict[k]['beats']:
            for j_b in truth_dict[k]['beats']:
                if np.array_equal(i_b, j_b):
                    truth_mat_beat[i_b[0]:i_b[1], j_b[0]:j_b[1]] = 1
                else:
                    truth_mat_beat[i_b[0]:i_b[1], j_b[0]:j_b[1]] = 0.9
    return truth_mat_beat
Example #37
def compute_all_features(file_struct, sonify_beats=False, overwrite=False,
						 out_beats="out_beats.wav"):
	"""Computes all the features for a specific audio file and its respective
		human annotations. It creates an audio file with the sonified estimated
		beats if needed.

	Parameters
	----------
	file_struct: FileStruct
		Object containing all the set of file paths of the input file.
	sonify_beats: bool
		Whether to sonify the beats.
	overwrite: bool
		Whether to overwrite previous features JSON file.
	out_beats: str
		Path to the new file containing the sonified beats.
	"""

	# Output file
	out_file = file_struct.features_file

	if os.path.isfile(out_file) and not overwrite:
		return	# Do nothing, file already exist and we are not overwriting it

	# Compute the features for the given audio file
	features = compute_features_for_audio_file(file_struct.audio_file)

	# Save output as audio file
	if sonify_beats:
		logging.info("Sonifying beats...")
		fs = 44100
		audio, sr = librosa.load(file_struct.audio_file, sr=fs)
		msaf.utils.sonify_clicks(audio, features["beats"], out_beats, fs,
								 offset=0.0)

	# Read annotations if they exist in path/references_dir/file.jams
	if os.path.isfile(file_struct.ref_file):
		jam = jams2.load(file_struct.ref_file)

		# If beat annotations exist, compute also annotated beatsync features
		if jam.beats != []:
			logging.info("Reading beat annotations from JAMS")
			annot = jam.beats[0]
			annot_beats = []
			for data in annot.data:
				annot_beats.append(data.time.value)
			annot_beats = np.unique(annot_beats)
			annot_beats_idx = librosa.time_to_frames(
				annot_beats, sr=msaf.Anal.sample_rate,
				hop_length=msaf.Anal.hop_size)
			features["ann_mfcc"], features["ann_hpcp"], \
				features["ann_tonnetz"], features["ann_cqt"],\
				features["ann_gmt"] = \
				compute_beat_sync_features(features, annot_beats_idx)

	# Save output as json file
	save_features(out_file, features)
    def bpm(self):   
        """Computes tempo of a signal in Beats Per Minute with its tempo onsets""" 
        self.onsets_strength()                                                             
        n = len(self.envelope) 
        win_length = np.asscalar(time_to_frames(8.0, self.fs, self.H))
        ac_window = hann(win_length) 
        self.envelope = np.pad(self.envelope, int(win_length // 2),mode='linear_ramp', end_values=[0, 0])
        frames = 1 + int((len(self.envelope) - win_length) / 1) 

        f = []                                                                            
        for i in range(win_length):     
            f.append(self.envelope[i:i+frames])
        f = np.array(f)[:,:n]              
        self.windowed_x = f * ac_window[:, np.newaxis]
        self.autocorrelation()

        tempogram = np.mean(self.correlation, axis = 1, keepdims = True)

        bin_frequencies = np.zeros(int(tempogram.shape[0]), dtype=np.float)

        bin_frequencies[0] = np.inf
        bin_frequencies[1:] = 60.0 * self.fs / (self.H * np.arange(1.0, tempogram.shape[0]))

        prior = np.exp(-0.5 * ((np.log2(bin_frequencies) - np.log2(80)) / bin_frequencies[1:].std())**2)
        max_indexes = np.argmax(bin_frequencies < 208)
        min_indexes = np.argmax(bin_frequencies < 80)

        prior[:max_indexes] = 0
        prior[min_indexes:] = 0
        p = prior.nonzero()

        best_period = np.argmax(tempogram[p] * prior[p][:, np.newaxis] * -1, axis=0)
        self.tempo = bin_frequencies[p][best_period]

        period = round(60.0 * (self.fs/self.H) / self.tempo[0])

        window = np.exp(-0.5 * (np.arange(-period, period+1)*32.0/period)**2)
        localscore = convolve(self.envelope/self.envelope.std(ddof=1), window, 'same')
        backlink, cumscore = dp(localscore, period, 100)
        self.ticks = [last_beat(cumscore)]

        while backlink[self.ticks[-1]] >= 0:
            self.ticks.append(backlink[self.ticks[-1]])

        self.ticks = np.array(self.ticks[::-1], dtype=int)

        self.ticks = trim_beats(localscore, self.ticks, False) * self.H
        if not len(self.ticks) >= 2:
            raise ValueError(("Only found one single onset, can't make sure if the beat is correct"))
        interv_value = self.ticks[1] - self.ticks[0] #these are optimal beat locations
        interval = 0
        self.ticks = []
        for i in range(int(self.signal.size/interv_value)):
            self.ticks.append(interval + interv_value)
            interval += interv_value #compute tempo frames locations based on the beat location value
        self.ticks = np.array(self.ticks) / self.fs
        return self.tempo, self.ticks
def compute_features(audio_file, intervals, level):
    """Computes the subseg-sync cqt features from the given audio file, if
    they are not previously computed. Saves the results in the feat_dir folder.

    Parameters
    ----------
    audio_file : str
        Path to the audio file.
    intervals : np.array
        Intervals containing the estimated boundaries.
    level : str
        Level in the hierarchy.

    Returns
    -------
    cqgram : np.array
        Subseg-sync constant-Q power spectrogram.
    intframes : np.array
        The frame indeces.
    """
    # Check if features have already been computed
    if level == "small_scale":
        features_file = os.path.join(features_dir, os.path.basename(audio_file).split('.')[0] +
                                    "_small_scale.mp3.pk")
    else:
        features_file = os.path.join(features_dir, os.path.basename(audio_file) +
                                    ".pk")
    if os.path.isfile(features_file):
        return read_features(features_file)

    y, sr = librosa.load(audio_file, sr=11025)

    # Default hopsize is 512
    hopsize = 512
    cqgram = librosa.logamplitude(librosa.cqt(y, sr=sr, hop_length=hopsize)**2, ref_power=np.max)

    # Track beats
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr,
                                           hop_length=hopsize)

    # Synchronize
    cqgram = librosa.feature.sync(cqgram, beats, aggregate=np.median)

    intframes = None
    if intervals is not None:
        # convert intervals to frames
        intframes = librosa.time_to_frames(intervals, sr=sr, hop_length=hopsize)

        # Match intervals to subseg points
        intframes = librosa.util.match_events(intframes, beats)

    # Save the features
    save_features(cqgram, intframes, beats, features_file)

    return cqgram, intframes
Example #40
    def encode_intervals(self, duration, intervals, values, dtype=np.bool):

        frames = librosa.time_to_frames(intervals,
                                        sr=self.sr,
                                        hop_length=self.hop_length)

        n_total = int(librosa.time_to_frames(duration, sr=self.sr,
                                             hop_length=self.hop_length))

        values = values.astype(dtype)

        target = np.empty((n_total, values.shape[1]), dtype=dtype)

        target.fill(fill_value(dtype))

        for column, interval in zip(values, frames):
            target[interval[0]:interval[1]] += column

        return target
def patch_label(start, end, time_windows, annotate, binary=False, threshold=None):
    """Labeling a patch given annotation
    Args:
        start(float): start time of a patch (in second)
        end(float): end time of a patch (in second)
        time_windows(float): time windows for average (in milliseconds)
        annotation(DataFrame): annotation dataframe for a specific song
        songname(string): song_name(string)
    Returns:
        label(pd.DataFrame):
            column: instrument
            index: label
            eg:
                        S01   S02
                label    1   0.93
    """
    #Transfer time to frame
    annotation = copy.copy(annotate)
    start_frame = librosa.time_to_frames(start, sr=1/0.0464, hop_length=1)+1
    end_frame = librosa.time_to_frames(end, sr=1/0.0464, hop_length=1)-1
    moving_frame = librosa.time_to_frames(time_windows/1000, sr=1/0.0464, hop_length=1)-librosa.time_to_frames(0, sr=1/0.0464, hop_length=1)
    #Pick annotation
    annotation = annotation.reset_index(drop=True)
    annotation.index += 1
    time_annot = annotation.loc[start_frame:end_frame].drop('time', 1)
    #Using maximum value of average in moving windows as label
    music_ins = time_annot.columns
    label = pd.DataFrame(index = ['label',], columns=music_ins)
    for j in range(len(time_annot.columns)):
        label_temp = max([sum(list(time_annot.ix[:, j])[i:i+moving_frame+1])/float(moving_frame+1)
                          for i in range(len(time_annot.ix[:, j])-moving_frame)])
        #binary output
        if binary and threshold:
            if label_temp >= threshold:
                label_temp = float(1)
            else:
                label_temp = float(0)
        label.ix[:, j] = label_temp
    return label
Example #42
    def encode_events(self, duration, events, values, dtype=np.bool):
        '''Encode labeled events as a time-series matrix.

        Parameters
        ----------
        duration : number
            The duration of the track

        events : ndarray, shape=(n,)
            Time index of the events

        values : ndarray, shape=(n, m)
            Values array.  Must have the same first index as `events`.

        dtype : numpy data type

        Returns
        -------
        target : ndarray, shape=(n_frames, n_values)
        '''

        # FIXME: support sparse encoding
        frames = librosa.time_to_frames(events,
                                        sr=self.sr,
                                        hop_length=self.hop_length)

        n_total = int(librosa.time_to_frames(duration, sr=self.sr,
                                             hop_length=self.hop_length))

        target = np.empty((n_total, values.shape[1]), dtype=dtype)

        target.fill(fill_value(dtype))
        values = values.astype(dtype)
        for column, event in zip(values, frames):
            target[event] += column

        return target
    def __test(sr, hop_length, n_fft):

        # Generate frames at times 0s, 1s, 2s
        times = np.arange(3)

        frames = librosa.time_to_frames(times,
                                        sr=sr,
                                        hop_length=hop_length,
                                        n_fft=n_fft)

        if n_fft:
            frames -= n_fft // (2 * hop_length)

        # we need to be within one frame
        assert np.all(np.abs(times - np.asarray(frames) * hop_length / float(sr)) * sr
                      < hop_length)
Example #44
    def _get_beats(self):
        """
        Gets beats using librosa's beat tracker.
        """
        _, beat_frames = librosa.beat.beat_track(y=self.analysis_samples,
                                                 sr=self.analysis_sample_rate,
                                                 trim=False)

        # pad beat times to full duration
        f_max = librosa.time_to_frames(self.duration, sr=self.analysis_sample_rate)
        beat_frames = librosa.util.fix_frames(beat_frames, x_min=0, x_max=f_max)

        # convert frames to times
        beat_times = librosa.frames_to_time(beat_frames, sr=self.analysis_sample_rate)

        # make the list of (start, duration) tuples that TimingList expects
        starts_durs = [(s, t-s) for (s, t) in zip(beat_times, beat_times[1:])]

        return starts_durs
def test_clicks():

    def __test(times, frames, sr, hop_length, click_freq, click_duration, click, length):

        y = librosa.clicks(times=times,
                           frames=frames,
                           sr=sr,
                           hop_length=hop_length,
                           click_freq=click_freq,
                           click_duration=click_duration,
                           click=click,
                           length=length)

        if times is not None:
            nmax = librosa.time_to_samples(times, sr=sr).max()
        else:
            nmax = librosa.frames_to_samples(frames, hop_length=hop_length).max()

        if length is not None:
            assert len(y) == length
        elif click is not None:
            assert len(y) == nmax + len(click)


    test_times = np.linspace(0, 10.0, num=5)

    # Bad cases
    yield raises(librosa.ParameterError)(__test), None, None, 22050, 512, 1000, 0.1, None, None
    yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 1000, 0.1, np.ones((2, 10)), None
    yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 1000, 0.1, None, 0
    yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 0, 0.1, None, None
    yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 1000, 0, None, None

    for sr in [11025, 22050]:
        for hop_length in [512, 1024]:
            test_frames = librosa.time_to_frames(test_times, sr=sr, hop_length=hop_length)

            for click in [None, np.ones(sr // 10)]:

                for length in [None, 5 * sr, 15 * sr]:
                    yield __test, test_times, None, sr, hop_length, 1000, 0.1, click, length
                    yield __test, None, test_frames, sr, hop_length, 1000, 0.1, click, length
Exemple #46
0
def evaluate_phrases(benchmark, detected, window):
    # Return a score based on how well the detected phrases match the benchmark:
    # hits / max_hits - misses / max_misses
    
    #convert to frames
    benchmark = librosa.time_to_frames(benchmark,hop_length=256)
    
    print(benchmark)
    print(detected)
    
    hits = 0
    for i in range(0, len(benchmark)):
        target = benchmark[i]
        h_i = hitIndex(target, detected, window)
        if (h_i != -1):
            hits += 1
            
    max_hits = len(benchmark)  
    max_misses = len(benchmark) + len(detected)
    misses = max_misses - (hits*2)
    
    return (hits/max_hits) - (misses/max_misses)
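# evaluate_phrases() calls a hitIndex() helper that is not included in this
# snippet. A minimal sketch, assuming it returns the index of the first
# detected boundary within `window` frames of the target, or -1 when no
# detection is close enough:
def hitIndex(target, detected, window):
    for i, d in enumerate(detected):
        if abs(d - target) <= window:
            return i
    return -1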
Exemple #47
0
    def transform(self, jam, query=None):

        anns = []
        if query:
            results = jam.search(**query)
        else:
            results = jam.annotations

        # Find annotations that can be coerced to our target namespace
        for ann in results:
            try:
                anns.append(jams.nsconvert.convert(ann, self.namespace))
            except jams.NamespaceError:
                pass

        duration = jam.file_metadata.duration

        # If none, make a fake one
        if not anns:
            anns = [self.empty(duration)]

        # Apply transformations
        results = []
        for ann in anns:

            results.append(self.transform_annotation(ann, duration))
            # If the annotation range is None, it spans the entire track
            if ann.time is None or ann.duration is None:
                valid = [0, duration]
            else:
                valid = [ann.time, ann.time + ann.duration]

            results[-1]['_valid'] = librosa.time_to_frames(valid,
                                                           sr=self.sr,
                                                           hop_length=self.hop_length)

        # Prefix and collect
        return self.merge(results)
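# A minimal usage sketch (assumptions: `transformer` is an instance of the
# class above, and 'track.jams' is a placeholder annotation file). transform()
# returns the merged dictionary of frame-aligned targets, each annotation
# carrying a '_valid' frame range:
import jams

jam = jams.load('track.jams')                               # hypothetical path
data_all = transformer.transform(jam)                       # all coercible annotations
data_chords = transformer.transform(jam, query={'namespace': 'chord'})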
Exemple #48
0
def get_smatrix_diagonal(f, r):
    #param: f = feature array, r = radius/resolution of the matrix sampled from the diagonal, in seconds
    #optimized based on the assumption that only information along the diagonal of the matrix is important
    print('Computing S-Matrix Diagonal')
    
    #convert r in seconds to r in frames
    r_f = librosa.time_to_frames(np.array([r]), sr=22050, hop_length=256)[0]
    
    dim = len(f[0])
    if (r_f > dim):
        r_f = dim  # clamp the radius to the size of the matrix
    matrix = np.zeros([dim, dim])
    i_max = int(dim)
    
    for i in range(0,dim):
        sys.stdout.write("\r" + str(int((i+1)/dim*100)) + '%') 
        sys.stdout.flush()
        for j in range(0,r_f*2):
            i_r = i
            j_r = max(min(j + i-r_f,dim-1),0)
            matrix[i_r][j_r] = feature_distance(f[:,i_r], f[:,j_r])
    sys.stdout.write("\n") 
    return matrix
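# get_smatrix_diagonal() depends on a feature_distance() function that is not
# shown here. A minimal sketch, assuming a plain Euclidean distance between
# two feature column vectors:
import numpy as np

def feature_distance(a, b):
    # Hypothetical helper: Euclidean distance between two feature vectors.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))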
Exemple #49
0
    def read_ann_beats(self):
        """Reads the annotated beats if available.

        Returns
        -------
        times: np.array
            Times of annotated beats in seconds.
        frames: np.array
            Frame indices of annotated beats.
        """
        times, frames = (None, None)

        # Read annotations if they exist in correct folder
        if os.path.isfile(self.file_struct.ref_file):
            jam = jams.load(self.file_struct.ref_file)
            beat_annot = jam.search(namespace="beat.*")

            # If beat annotations exist, get times and frames
            if len(beat_annot) > 0:
                beats_inters, _ = beat_annot[0].data.to_interval_values()
                times = beats_inters[:, 0]
                frames = librosa.time_to_frames(times, sr=self.sr,
                                                hop_length=self.hop_length)
        return times, frames
start, end, labels = [], [], []
with open('dizquefuiporai8.lab') as infile:
    print "Start / End / Label"
    for line in infile:
        print line
        fields = line.split()
        start.append(float(fields[0]))
        end.append(float(fields[1]))
        labels.append(int(fields[2]))

start = np.array(start)
end = np.array(end)
labels = np.array(labels)

# %% Convert the time stamps into frame indices
start_frames = librosa.time_to_frames(start, sr=sr)
end_frames = librosa.time_to_frames(end, sr=sr)

# %% Overlay the section markers with the mel-frequency spectrogram

plt.figure(figsize=(12, 6))
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel',
                         n_xticks=20)

# Overlay with the detected beats
colors = ['r', 'c', 'orange', 'b', 'k', 'g', 'm', 'y']
for i in range(len(labels)):
    lines = np.arange(start_frames[i], end_frames[i])
    plt.vlines(lines, 0, log_S.shape[0],
               colors=colors[labels[i] % len(colors)],
               linestyles='-', linewidth=2, alpha=0.01)
Exemple #51
0
def compute_features(audio_file):
    """Computes the onsets and the MFCC and CQT onset-synchronous features from
    the given audio file path.

    Parameters
    ----------
    audio_file : str
        Path to the audio file

    Returns
    -------
    y : np.array
        Audio samples
    onset_times : np.array
        Onset times in seconds
    mfcc_sync : np.array
        MFCC synchronized to the onsets
    cqt_sync : np.array
        CQT features synchronized to the onsets
    chroma_sync : np.array
        Chroma features synchronized to the onsets
    """
    # Read audio file
    y, sr = librosa.load(audio_file, sr=SRATE)

    # Detect onset
    onsets = librosa.onset.onset_detect(y, sr=SRATE, hop_length=HOP_SIZE)
    onset_times = librosa.frames_to_time(onsets, sr=SRATE,
                                         hop_length=HOP_SIZE)

    # Add first and last onsets (start and end of track)
    dur = librosa.core.get_duration(y=y, sr=SRATE, hop_length=HOP_SIZE)
    if onset_times[0] != 0:
        onset_times = np.concatenate(([0], onset_times))
    if onset_times[-1] != dur:
        onset_times = np.concatenate((onset_times, [dur]))

    # Compute MFCC (timbre features)
    mfcc = librosa.feature.mfcc(y=y, sr=SRATE, hop_length=HOP_SIZE,
                                n_mfcc=N_MFCC)

    # Compute Constant-Q Transform
    cqt = librosa.logamplitude(librosa.cqt(y, sr=SRATE, hop_length=HOP_SIZE,
                                           n_bins=CQT_BINS) ** 2,
                               ref_power=np.max)

    # Compute chromagram
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    chroma = librosa.feature.chroma_cqt(y=y_harmonic, sr=SRATE,
                                        hop_length=HOP_SIZE)

    # Synchronize features to onsets
    mfcc_sync = librosa.feature.sync(
        mfcc, librosa.time_to_frames(onset_times, sr=SRATE, hop_length=HOP_SIZE),
        pad=False)
    cqt_sync = librosa.feature.sync(
        cqt, librosa.time_to_frames(onset_times, sr=SRATE, hop_length=HOP_SIZE),
        pad=False)
    chroma_sync = librosa.feature.sync(
        chroma, librosa.time_to_frames(onset_times, sr=SRATE,
                                       hop_length=HOP_SIZE), pad=False)

    return y, onset_times, mfcc_sync, cqt_sync, chroma_sync
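# A hedged usage sketch for compute_features(). It assumes the module-level
# constants referenced above (SRATE, HOP_SIZE, N_MFCC, CQT_BINS) are defined
# elsewhere in the original module, and 'track.mp3' is a placeholder path:
y, onset_times, mfcc_sync, cqt_sync, chroma_sync = compute_features('track.mp3')
print(onset_times[:5], mfcc_sync.shape, cqt_sync.shape, chroma_sync.shape)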
Exemple #52
0
def features(filename):
    '''Feature-extraction for audio segmentation
    Arguments:
        filename -- str
        path to the input song

    Returns:
        - X -- ndarray
            
            beat-synchronous feature matrix:
            MFCC (mean-aggregated)
            Chroma (median-aggregated)
            Latent timbre repetition
            Latent chroma repetition
            Time index
            Beat index

        - beat_times -- array
            mapping of beat index => timestamp
            includes start and end markers (0, duration)

    '''
    
    

    def compress_data(X, k):
        e_vals, e_vecs = scipy.linalg.eig(X.dot(X.T))
        
        e_vals = np.maximum(0.0, np.real(e_vals))
        e_vecs = np.real(e_vecs)
        
        idx = np.argsort(e_vals)[::-1]
        
        e_vals = e_vals[idx]
        e_vecs = e_vecs[:, idx]
        
        # Truncate to k dimensions
        if k < len(e_vals):
            e_vals = e_vals[:k]
            e_vecs = e_vecs[:, :k]
        
        # Normalize by the leading singular value of X
        Z = np.sqrt(e_vals.max())
        
        if Z > 0:
            e_vecs = e_vecs / Z
        
        return e_vecs.T.dot(X)

    # Harmonic waveform
    def harmonify(y):
        D = librosa.stft(y)
        return librosa.istft(librosa.decompose.hpss(D)[0])

    # HPSS waveforms
    def hpss_wav(y):
        H, P = librosa.decompose.hpss(librosa.stft(y))

        return librosa.istft(H), librosa.istft(P)

    # Beats and tempo
    def get_beats(y):
        odf = librosa.onset.onset_strength(y=y, 
                                            sr=sr, 
                                            n_fft=N_FFT, 
                                            hop_length=HOP_BEATS, 
                                            n_mels=N_MELS, 
                                            fmax=FMAX, 
                                            aggregate=np.median)

        bpm, beats = librosa.beat.beat_track(onset_envelope=odf, sr=sr, hop_length=HOP_BEATS)
        
        return bpm, beats

    # MFCC features
    def get_mfcc(y):
        # Generate a mel-spectrogram
        S = librosa.feature.melspectrogram(y, sr,   n_fft=N_FFT, 
                                                    hop_length=HOP_LENGTH, 
                                                    n_mels=N_MELS, 
                                                    fmax=FMAX).astype(np.float32)
    
        # Put on a log scale
        S = librosa.logamplitude(S, ref_power=S.max())

        return librosa.feature.mfcc(S=S, n_mfcc=N_MFCC)

    # Chroma features
    def chroma(y):
        # Build the wrapper
        CQT      = np.abs(librosa.cqt(y,    sr=SR, 
                                            resolution=NOTE_RES,
                                            hop_length=HOP_LENGTH,
                                            fmin=NOTE_MIN,
                                            n_bins=NOTE_NUM))

        C_to_Chr = librosa.filters.cq_to_chroma(CQT.shape[0], n_chroma=N_CHROMA) 

        return librosa.logamplitude(librosa.util.normalize(C_to_Chr.dot(CQT)))

    # Latent factor repetition features
    def repetition(X, metric='seuclidean'):
        R = librosa.segment.recurrence_matrix(X, 
                                            k=2 * int(np.ceil(np.sqrt(X.shape[1]))), 
                                            width=REP_WIDTH, 
                                            metric=metric,
                                            sym=False).astype(np.float32)

        P = scipy.signal.medfilt2d(librosa.segment.structure_feature(R), [1, REP_FILTER])
        
        # Discard empty rows.  
        # This should give an equivalent SVD, but resolves some numerical instabilities.
        P = P[P.any(axis=1)]

        return compress_data(P, N_REP)


    print '\t[1/6] loading audio'
    # Load the waveform
    y, sr = librosa.load(filename, sr=SR)

    # Compute duration
    duration = float(len(y)) / sr

    print '\t[2/6] Separating harmonic and percussive signals'
    # Separate signals
    y_harm, y_perc = hpss_wav(y)

    
    
    print '\t[3/6] detecting beats'
    # Get the beats
    bpm, beats = get_beats(y_perc)

    # augment the beat boundaries with the starting point
    beats = np.unique(np.concatenate([ [0], beats]))

    B = librosa.frames_to_time(beats, sr=SR, hop_length=HOP_BEATS)

    beat_frames = np.unique(librosa.time_to_frames(B, sr=SR, hop_length=HOP_LENGTH))

    # Stash beat times aligned to the longer hop lengths
    B = librosa.frames_to_time(beat_frames, sr=SR, hop_length=HOP_LENGTH)

    print '\t[4/6] generating MFCC'
    # Get the MFCCs
    M = get_mfcc(y)

    # Beat-synchronize the features
    M = librosa.feature.sync(M, beat_frames, aggregate=np.mean)
    
    print '\t[5/6] generating chroma'
    # Get the chroma from the harmonic component
    C = chroma(y_harm)

    # Beat-synchronize the features
    C = librosa.feature.sync(C, beat_frames, aggregate=np.median)
    
    # Time-stamp features
    N = np.arange(float(len(beat_frames)))
    
    # Beat-synchronous repetition features
    print '\t[6/6] generating structure features'
    R_timbre = repetition(librosa.feature.stack_memory(M))
    R_chroma = repetition(librosa.feature.stack_memory(C))
    
    # Stack it all up
    X = np.vstack([M, C, R_timbre, R_chroma, B, B / duration, N, N / len(beats)])

    # Add on the end-of-track timestamp
    B = np.concatenate([B, [duration]])

    return X, B
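# A hedged usage sketch for features(). It assumes the module-level constants
# referenced above (SR, N_FFT, HOP_LENGTH, HOP_BEATS, N_MELS, FMAX, N_MFCC,
# N_CHROMA, NOTE_MIN, NOTE_NUM, NOTE_RES, REP_WIDTH, REP_FILTER, N_REP) are
# defined elsewhere in the original module, and 'song.mp3' is a placeholder:
X, beat_times = features('song.mp3')
print(X.shape, len(beat_times))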
import librosa

import librosa.display

#############################################
# Load an example with vocals.
y, sr = librosa.load('audio/Cheese_N_Pot-C_-_16_-_The_Raps_Well_Clean_Album_Version.mp3', duration=120)


# And compute the spectrogram magnitude and phase
S_full, phase = librosa.magphase(librosa.stft(y))


#######################################
# Plot a 5-second slice of the spectrum
idx = slice(*librosa.time_to_frames([30, 35], sr=sr))
plt.figure(figsize=(12, 4))
librosa.display.specshow(librosa.amplitude_to_db(S_full[:, idx], ref=np.max),
                         y_axis='log', x_axis='time', sr=sr)
plt.colorbar()
plt.tight_layout()

###########################################################
# The wiggly lines above are due to the vocal component.
# Our goal is to separate them from the accompanying
# instrumentation.
#

# We'll compare frames using cosine similarity, and aggregate similar frames
# by taking their (per-frequency) median value.
#
# Debug?
DEBUG_PLOT = False

# Set some params
FS = 44100 # Enforce 44.1 kHz sample rate
N_FFT = 2048
HOP_LENGTH = N_FFT/2 # 50% overlap
N_MFCC = 13
N_MEL = 128

DB_LOW = -250.0 # silence in dB

T_CONTEXT = 3 # seconds of context for our features
N_FRAME_CONTEXT = librosa.time_to_frames(
    T_CONTEXT,
    sr=FS, hop_length=HOP_LENGTH, n_fft=N_FFT
    )[0]+1
# 64 frames on either side, for context

BOUNDARY_KERNEL = signal.gaussian(N_FRAME_CONTEXT, std=32) # For smoothing our y
#BOUNDARY_KERNEL = np.ones(N_FRAME_CONTEXT)

DTYPE = 'float32'

# FOR USE ON AMAZON EC2 AFTER COPYING FROM S3
#DATADIR = os.path.abspath(os.path.join('/mnt','audio'))
#SALAMIDIR = os.path.abspath(os.path.join('/mnt','salami', 'salami-data-public'))

# FOR USE ON WINDOWS MACHINE
DATADIR = os.path.abspath(r'F:\salami-audio')
SALAMIDIR = os.path.abspath(r'F:\salami-data-public')
Exemple #55
0
    def predict(self, filename=None, y=None, sr=None, outputs=None):
        '''Chord prediction

        Parameters
        ----------
        filename : str
            Path to the audio file to analyze

        y, sr : np.ndarray, number>0

            Audio signal in memory to analyze

        outputs : dict `{str: np.ndarray}`

            Pre-computed model outputs, as given by ``ChordModel.outputs``.

        .. note:: At least one of `filename`, `y, sr`, or `outputs`
            must be provided.

        Returns
        -------
        jams.Annotation, namespace='chord'

            The chord estimate for the given signal.

        Examples
        --------
        >>> import crema
        >>> import librosa
        >>> model = crema.models.chord.ChordModel()
        >>> chord_est = model.predict(filename=librosa.util.example_audio_file())
        >>> chord_est
        <Annotation(namespace='chord',
                    time=0,
                    duration=61.4,
                    annotation_metadata=<AnnotationMetadata(...)>,
                    data=<45 observations>,
                    sandbox=<Sandbox(...)>)>
        >>> chord_est.to_dataframe().head(5)
               time  duration  value  confidence
        0  0.000000  0.092880  E:maj    0.336977
        1  0.092880  0.464399    E:7    0.324255
        2  0.557279  1.021678  E:min    0.448759
        3  1.578957  2.693515  E:maj    0.501462
        4  4.272472  1.486077  E:min    0.287264
        '''
        if outputs is None:
            outputs = self.outputs(filename=filename, y=y, sr=sr)

        output_key = self.model.output_names[0]
        pump_op = self.pump[output_key]

        ann = super(ChordModel, self).predict(y=y, sr=sr, filename=filename,
                                              outputs=outputs)

        bass_pred = outputs['chord_bass']

        # Handle inversion estimation
        for obs in ann.pop_data():
            start, end = time_to_frames([obs.time, obs.time + obs.duration],
                                        sr=pump_op.sr,
                                        hop_length=pump_op.hop_length)

            value = obs.value
            if obs.value not in ('N', 'X'):
                mean_bass = gmean(bass_pred[start:end+1])

                bass_pc = np.argmax(mean_bass)
                root_pc, pitches, _ = mir_eval.chord.encode(obs.value)

                bass_rel = 0
                if bass_pc < 12:
                    bass_rel = np.mod(bass_pc - root_pc, 12)

                if bass_rel and pitches[bass_rel]:
                    value = '{}/{}'.format(value,
                                           SEMITONE_TO_SCALE_DEGREE[bass_rel])

            ann.append(time=obs.time,
                       duration=obs.duration,
                       value=value,
                       confidence=obs.confidence)

        return ann
def quantize_track(music, sr):
    quantization = np.arange(0, len(music)/float(sr), .1)
    quantization = [int(b) for b in librosa.time_to_frames(quantization)]
    return quantization
def serialize_song(
        sid,
        path,
        datadir=DATADIR,
        salamidir=SALAMIDIR,
        outputdir=OUTPUTDIR,
        prefix='data'
        ):
    """
    serialize_song()
    Serializes a single song's data on disk, given its SID and audio path.

    Arguments:
        sid       : the song's SALAMI SID (int)
        path      : path to the song's audio file, relative to datadir (string)
        datadir   : where the audio files are stored
        salamidir : i.e. the salami-data-public dir from a cloned SALAMI repo
        outputdir : for serialized data on disk
        prefix    : prefix for serialized data file on disk

    Outputs:
        X_path  : path to the serialized X data on disk (string)
        X_shape : shape of data serialized in X_path
        y_path  : path to the serialized y data on disk (string)
        y_shape : shape of data serialized in y_path
    """

    X, y = None, None
    X_path, X_shape, y_path, y_shape = None, None, None, None

    X_shape = [0, 1, N_MEL, N_FRAME_CONTEXT]
    y_shape = [0, 1]

    print "SID: {0},\tfile: {1}".format(sid, path)

    y_path = os.path.abspath(
        os.path.join(outputdir, prefix + str(sid) + '_y')
        )
    X_path = os.path.abspath(
        os.path.join(outputdir, prefix + str(sid) + '_X')
        )

    # Get the annotated segment times (sec)
    times = ev.id2segtimes(
        sid,
        ann_type="uppercase",
        salamipath=salamidir
        )
    times_frames = librosa.time_to_frames(
        times,
        sr=FS,
        hop_length=HOP_LENGTH,
        n_fft=N_FFT
        )

    # Get signal
    sig, fs = librosa.load(
        os.path.join(datadir, path),
        FS
        )

    # Get feature frames
    sig_feat = librosa.feature.melspectrogram(
        y=sig,
        sr=fs,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MEL,
        fmax=1600
        )
    sig_feat = 20.0*np.log10(np.clip( sig_feat, a_min=1e-12, a_max=None)) # convert to dB
    sig_feat = sig_feat - np.max(sig_feat) # Normalize to 0dB
    sig_feat[sig_feat==-np.inf] = DB_LOW # screen out inf

    # Keep track of the number of frames for this song
    n_frames = sig_feat.shape[1]

    y_shape[0] = n_frames # set the first dimension of our final output y data
    X_shape[0] = n_frames # set the first dimension of our final output X data

    # Pad the frames, so we can have frames centered at the very start and
    # end of the song.
    sig_feat = np.hstack((
        np.ones((N_MEL, N_FRAME_CONTEXT/2)) * DB_LOW,
        sig_feat,
        np.ones((N_MEL, N_FRAME_CONTEXT/2)) * DB_LOW
        ))

    # Generate the boundary indicator

    y = np.memmap(
        y_path,
        dtype=DTYPE,
        mode='w+',
        shape=tuple(y_shape)
        )
    y[:] = np.zeros((n_frames,1))[:] # start with zeros
    y[np.minimum(times_frames,n_frames-1),0] = 1.0

    if(DEBUG_PLOT):
        plt.figure(figsize=(10,  3))
        plt.plot(
            y,
            label="Annotations"
            )

    # Smooth y with the gaussian kernel
    y[:,0] = np.convolve( y[:,0], BOUNDARY_KERNEL, 'same')
    y[:,0] = np.minimum(y[:,0],1.0) # nothing above 1

    if(DEBUG_PLOT):
        plt.plot(
            y,
            label="Smoothed"
            )
        plt.xlabel("Frame number")
        plt.ylabel("Segment boundary strength")
        plt.legend()
        # plt.colorbar()
        plt.savefig('./seg.pdf', bbox_inches='tight')
        # plt.show()

    # Generate the training data
    X = np.memmap(
            X_path,
            dtype=DTYPE,
            mode='w+',
            shape=tuple(X_shape)
            )

    for i_frame in xrange(n_frames):
        X[i_frame,0] = sig_feat[:,i_frame:i_frame+N_FRAME_CONTEXT]

    # debug plot
    if(DEBUG_PLOT):
        plt.figure()
        plt.subplot(211)
        plt.imshow(X[X.shape[0]/2,0])
        plt.colorbar()
        plt.subplot(212)
        plt.plot(y)
        plt.show()

    # Flush our binary data to file
    X.flush()
    y.flush()

    return X_path, X_shape, y_path, y_shape
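# A hedged usage sketch for serialize_song(). The SID and relative audio path
# below are hypothetical; the function writes two memory-mapped arrays under
# OUTPUTDIR and returns their paths and shapes:
X_path, X_shape, y_path, y_shape = serialize_song(2, '2/audio.mp3')
print(X_path, X_shape)
print(y_path, y_shape)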
Exemple #58
0
import librosa
import librosa.display


#######################################################################
# We'll use a track that has harmonic, melodic, and percussive elements
y, sr = librosa.load('audio/Karissa_Hobbs_-_09_-_Lets_Go_Fishin.mp3')


#######################################
# First, let's plot the original chroma
chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)

# For display purposes, let's zoom in on a 15-second chunk from the middle of the song
idx = tuple([slice(None), slice(*list(librosa.time_to_frames([45, 60])))])

# And for comparison, we'll show the CQT matrix as well.
C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=12*3, n_bins=7*12*3))


plt.figure(figsize=(12, 4))
plt.subplot(2, 1, 1)
librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max)[idx],
                         y_axis='cqt_note', bins_per_octave=12*3)
plt.colorbar()
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_orig[idx], y_axis='chroma')
plt.colorbar()
plt.ylabel('Original')
plt.tight_layout()
Exemple #59
0
def analyze_features(input_file, features=None, analysis=None, PARAMETERS=None):
    '''Mid-level feature analysis'''

    with open(input_file, 'r') as f:
        lowlevel = pickle.load(f)

    if analysis is None:
        analysis = {}
    
    if features is None:
        features = set(get_feature_names())


    # Beats might occur after the last hop
    # We'll clip anything that's too big
    beat_frames = librosa.time_to_frames(lowlevel['beat_times'],
                                            sr=lowlevel['PARAMETERS']['load']['sr'],
                                            hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])

    beat_frames = np.clip(beat_frames, 0, lowlevel['mfcc'].shape[1]-1)

    # Pad on a phantom 0 here
    beat_frames = np.unique(np.concatenate([[0], beat_frames]))

    analysis['beat_times'] = librosa.frames_to_time(beat_frames, 
                                                    sr=lowlevel['PARAMETERS']['load']['sr'],
                                                    hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])

    # Compute beat-sync features
    if 'beat_sync' in features:
        (analysis['beat_sync_mfcc'], 
         analysis['beat_sync_mel_spectrogram'], 
         analysis['beat_sync_cqt'], 
         analysis['beat_sync_chroma']) = get_sync_features(lowlevel, beat_frames)
                                                
                                                
    
    onset_frames = librosa.time_to_frames(lowlevel['onsets'],
                                          sr=lowlevel['PARAMETERS']['load']['sr'],
                                          hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])

    onset_frames = np.clip(onset_frames, 0, lowlevel['mfcc'].shape[1]-1)
    onset_frames = np.unique(np.concatenate([[0], onset_frames]))

    analysis['onset_times'] = librosa.frames_to_time(onset_frames, 
                                                    sr=lowlevel['PARAMETERS']['load']['sr'],
                                                    hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])
    # Compute onset-sync features
    if 'onset_sync' in features:

        (analysis['onset_sync_mfcc'], 
         analysis['onset_sync_mel_spectrogram'], 
         analysis['onset_sync_cqt'], 
         analysis['onset_sync_chroma']) = get_sync_features(lowlevel, onset_frames)


    if 'repetition_mfcc' in features:
        analysis['repetition_mfcc'] = get_repetition_features(analysis['beat_sync_mfcc'], 
                                                              PARAMETERS['repetition']['mfcc']['n_history'],
                                                              PARAMETERS['repetition']['mfcc']['metric'],
                                                              PARAMETERS['repetition']['mfcc']['width'],
                                                              PARAMETERS['repetition']['mfcc']['kernel_size'],
                                                              PARAMETERS['repetition']['mfcc']['n_factors'])

    if 'repetition_chroma' in features:
        analysis['repetition_chroma'] = get_repetition_features(analysis['beat_sync_chroma'], 
                                                              PARAMETERS['repetition']['chroma']['n_history'],
                                                              PARAMETERS['repetition']['chroma']['metric'],
                                                              PARAMETERS['repetition']['chroma']['width'],
                                                              PARAMETERS['repetition']['chroma']['kernel_size'],
                                                              PARAMETERS['repetition']['chroma']['n_factors'])
    if 'beat_neighbors' in features:
        analysis['mfcc_neighbors_beat']   = get_neighbors(analysis['beat_sync_mfcc'], 
                                                          PARAMETERS['beat_neighbors']['k'],
                                                          PARAMETERS['repetition']['mfcc']['width'],
                                                          PARAMETERS['repetition']['mfcc']['metric'])

        analysis['chroma_neighbors_beat'] = get_neighbors(analysis['beat_sync_chroma'], 
                                                          PARAMETERS['beat_neighbors']['k'],
                                                          PARAMETERS['repetition']['chroma']['width'],
                                                          PARAMETERS['repetition']['chroma']['metric'])

    if 'segments' in features:
        # Get the min and max number of segments
        k_min, k_max = get_segment_range(lowlevel['duration'], 
                                         PARAMETERS['segments']['min_seg'], 
                                         PARAMETERS['segments']['max_seg'])

        # Build the feature stack
        X_segment = get_segment_features(analysis, lowlevel, PARAMETERS['segments']['transformation'])

        # Get the segment boundaries for each k in the range
        segment_boundaries, analysis['segments_best'] = get_segments(X_segment, k_min, k_max)

        # Convert back to boundary times
        analysis['segment_time_tree']  = []
        analysis['segment_beat_tree'] = []

        # Pad the beat times so that we include all points of aggregation
        beat_times = np.unique(np.concatenate([analysis['beat_times'], [lowlevel['duration']]]))

        for level, bounds in enumerate(segment_boundaries):
            analysis['segment_beat_tree'].append(bounds)
            analysis['segment_time_tree'].append(beat_times[bounds])

        # Just to make it easy, copy over the best segmentation
        analysis['segment_times'] = analysis['segment_time_tree'][analysis['segments_best']]

    if 'vq' in features:
        # Load the transformer
        whitener, encoder, args     = encoder_model(PARAMETERS['encoder']['transformation'], 
                                                    PARAMETERS['encoder']['n_quantizers'])

        lmdeltas                    = delta_features(lowlevel)
        analysis['frame_vq']        = encode_features(lmdeltas, whitener, encoder)
        analysis['vq_parameters']   = args
        dense_code                  = analysis['frame_vq'].toarray().astype(np.float32)
        analysis['onset_sync_vq']   = librosa.feature.sync(dense_code, onset_frames).astype(np.float32)
        analysis['beat_sync_vq']    = librosa.feature.sync(dense_code, beat_frames).astype(np.float32)
        analysis['track_vq']        = np.mean(dense_code, axis=1).astype(np.float32)

        
    # Construct a dense representation for summarization purposes


    PREV = analysis.get('PREVIOUS', {})

    if 'computed_features' in analysis:
        PREV['computed_features'] = analysis['computed_features']

    analysis['computed_features'] = features

    if 'PARAMETERS' in analysis:
        analysis['PREVIOUS'] = {'PARAMETERS':   analysis['PARAMETERS'],
                                'ENVIRONMENT':  analysis['ENVIRONMENT'],
                                'PREVIOUS':     PREV}

    # We're done with harmonics now
    analysis['PARAMETERS']  = PARAMETERS
    analysis['ENVIRONMENT'] = ENVIRONMENT

    return analysis
w_p_ratio = 4
#WHERE DO THESE THRESHOLD VALUES COME FROM?
beat_threshold = 15
period_threshold = 20
peak_window = 0.13

#BENCHMARK
with open('assets/phrase_intervals.json') as data_file:    
    data = json.load(data_file)
    
bench = []
for p in data[file_id]:
    if p <= sample_duration:
        bench.append(p)
        
bench = librosa.time_to_frames(bench,hop_length=256)

#TEST CONSTANTS
if (w>r):
    sys.exit('Window Resolution Mismatch')
     
#LOAD WAVEFORM
audio_path = 'assets/'+file_id+'.wav'
y, sr = (librosa.load(audio_path,  sr=None,  duration=sample_duration))

w_f = librosa.time_to_frames([w],hop_length=256)[0]
f  = extract_features(y)

#Get Beats
y_harmonic, y_percussive = librosa.effects.hpss(y)
tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr)