Example #1
0
    def process(self):
        """Main process.

        Returns
        -------
        est_idxs : np.array(N) or list
            Estimated times for the segment boundaries in frame indices.
            List if hierarchical segmentation.
        est_labels : np.array(N-1) or list
            Estimated labels for the segments.
            List if hierarchical segmentation.
        F : tuple
            The ``(pcp, mfcc)`` feature matrices, each transposed, exactly
            as they were handed to the segmentation routine.

        Raises
        ------
        AssertionError
            If the PCP and MFCC features do not share the same frame times.
        """
        # This algorithm only accepts one specific kind of features:
        # a combination of PCP + MFCC. Let's get them:
        pcp_obj = Features.select_features("pcp", self.file_struct,
                                           self.annot_beats, self.framesync)
        mfcc_obj = Features.select_features("mfcc", self.file_struct,
                                            self.annot_beats, self.framesync)

        # Both feature types must be aligned on the same frames. This is an
        # explicit raise rather than an `assert` statement so the check is
        # still performed when Python runs with -O; the exception type is the
        # one the assert used to raise.
        frame_times = pcp_obj.frame_times
        if not np.array_equal(frame_times, mfcc_obj.frame_times):
            raise AssertionError(
                "PCP and MFCC features have different frame times")

        # The segmentation routine takes PCP and MFCC, both transposed
        F = (pcp_obj.features.T, mfcc_obj.features.T)

        # Do actual segmentation
        est_idxs, est_labels = main.do_segmentation(F, frame_times,
                                                    self.config,
                                                    self.in_bound_idxs)

        return est_idxs, est_labels, F
Example #2
0
    def process(self):
        """Main process.

        Returns
        -------
        est_idxs : np.array(N) or list
            Estimated times for the segment boundaries in frame indices.
            List if hierarchical segmentation.
        est_labels : np.array(N-1) or list
            Estimated labels for the segments.
            List if hierarchical segmentation.
        F : tuple
            Pair ``(pcp, mfcc)`` holding the transposed feature matrices
            that were passed to the segmentation routine.

        Raises
        ------
        AssertionError
            If the frame times of the PCP and MFCC features differ.
        """
        # This algorithm only accepts one specific kind of features:
        # a combination of PCP + MFCC. Let's get them:
        pcp_obj = Features.select_features(
            "pcp", self.file_struct, self.annot_beats, self.framesync)
        mfcc_obj = Features.select_features(
            "mfcc", self.file_struct, self.annot_beats, self.framesync)

        # Get frame times and make sure they're the same in both features.
        # Raised explicitly (instead of using `assert`) so the validation is
        # not stripped under `python -O`; the exception type is unchanged.
        frame_times = pcp_obj.frame_times
        if not np.array_equal(frame_times, mfcc_obj.frame_times):
            raise AssertionError(
                "PCP and MFCC features have different frame times")

        # The segmentation routine takes PCP and MFCC, both transposed
        F = (pcp_obj.features.T, mfcc_obj.features.T)

        # Do actual segmentation
        est_idxs, est_labels = main.do_segmentation(
            F, frame_times, self.config, self.in_bound_idxs)

        return est_idxs, est_labels, F
Example #3
0
def import_data(file_struct, rootpath, output_path, annot_beats):
    """Load the data of one track, computing and caching it when needed.

    The result is cached as a pickle under ``<output_path>/features``; when
    that file already exists it is loaded instead of being recomputed.

    Arguments:
        file_struct -- object exposing an ``audio_file`` path; it is also
            forwarded to ``features`` and ``Features.select_features``
        rootpath -- forwarded to ``get_annotation`` together with the audio
            file path
        output_path -- directory in which the ``features`` cache folder is
            created
        annot_beats -- forwarded to the feature extraction; also formatted
            with ``%d`` into the cache file name

    Returns:
        dict with the keys 'features', 'beats', 'filename', 'segment_times',
        'segment_labels' and 'segments', or None when either the features or
        the aligned segmentation could not be obtained (nothing is cached in
        that case).
    """
    features_dir = os.path.join(output_path, "features")
    msaf.utils.ensure_dir(output_path)
    msaf.utils.ensure_dir(features_dir)

    # Build the cache path with os.path.join so it always points inside the
    # directory that was just created above.
    track_name = os.path.splitext(
        os.path.basename(file_struct.audio_file))[0]
    data_file = os.path.join(
        features_dir,
        '%s_annotbeatsE%d.pickle' % (track_name, annot_beats))

    if os.path.exists(data_file):
        # NOTE(security): pickle.load can execute arbitrary code. Only use
        # cache directories whose contents are trusted.
        # Pickles must be read in binary mode.
        with open(data_file, 'rb') as f:
            Data = pickle.load(f)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('%s cached!' % (file_struct.audio_file,))
        return Data

    # The track duration returned by features() is not needed here.
    X, _ = features(file_struct, annot_beats)
    pcp_obj = Features.select_features("pcp",
                                       file_struct,
                                       annot_beats,
                                       framesync=False)
    # Keep only as many frame times as there are feature frames
    B = pcp_obj.frame_times[:pcp_obj.features.shape[0]]

    if X is None:
        return None

    Y, T, L = align_segmentation(
        get_annotation(file_struct.audio_file, rootpath), B,
        file_struct.audio_file)

    if Y is None:
        return None

    Data = {
        'features': X,
        'beats': B,
        'filename': file_struct.audio_file,
        'segment_times': T,
        'segment_labels': L,
        'segments': Y
    }
    print('%s processed!' % (file_struct.audio_file,))

    # Pickles must be written in binary mode.
    with open(data_file, 'wb') as f:
        pickle.dump(Data, f)

    return Data
Example #4
0
def import_data(file_struct, rootpath, output_path, annot_beats):
    """Return the data dictionary of one track, using an on-disk cache.

    A pickle file inside ``<output_path>/features`` is used as cache: if it
    exists it is loaded, otherwise the data is computed and written to it.

    Arguments:
        file_struct -- object exposing an ``audio_file`` path; it is also
            forwarded to ``features`` and ``Features.select_features``
        rootpath -- forwarded to ``get_annotation`` together with the audio
            file path
        output_path -- directory in which the ``features`` cache folder is
            created
        annot_beats -- forwarded to the feature extraction; also formatted
            with ``%d`` into the cache file name

    Returns:
        dict with the keys 'features', 'beats', 'filename', 'segment_times',
        'segment_labels' and 'segments'; None when the features or the
        aligned segmentation are None (in which case nothing is cached).
    """
    cache_dir = os.path.join(output_path, "features")
    msaf.utils.ensure_dir(output_path)
    msaf.utils.ensure_dir(cache_dir)

    # os.path.join keeps the cache file inside the directory created above.
    base_name = os.path.splitext(
        os.path.basename(file_struct.audio_file))[0]
    data_file = os.path.join(
        cache_dir, '%s_annotbeatsE%d.pickle' % (base_name, annot_beats))

    if os.path.exists(data_file):
        # NOTE(security): unpickling can run arbitrary code, so the cache
        # directory must only contain trusted files.
        # Binary mode is required to read pickles reliably.
        with open(data_file, 'rb') as f:
            Data = pickle.load(f)
        # print(...) with one argument is valid on both Python 2 and 3.
        print('%s cached!' % (file_struct.audio_file,))
    else:
        # Only the feature matrix is used; the duration is discarded.
        X, _ = features(file_struct, annot_beats)
        pcp_obj = Features.select_features("pcp", file_struct, annot_beats,
                                           framesync=False)
        # Truncate the frame times to the number of feature frames
        B = pcp_obj.frame_times[:pcp_obj.features.shape[0]]

        if X is None:
            return None

        Y, T, L = align_segmentation(
            get_annotation(file_struct.audio_file, rootpath), B,
            file_struct.audio_file)

        if Y is None:
            return None

        Data = {'features': X,
                'beats': B,
                'filename': file_struct.audio_file,
                'segment_times': T,
                'segment_labels': L,
                'segments': Y}
        print('%s processed!' % (file_struct.audio_file,))

        # Binary mode is required to write pickles reliably.
        with open(data_file, 'wb') as f:
            pickle.dump(Data, f)

    return Data
Example #5
0
def features(file_struct, annot_beats=False, framesync=False):
    '''Feature-extraction for audio segmentation

    Arguments:
        file_struct -- msaf.io.FileStruct
            paths to the input files in the Segmentation dataset
        annot_beats -- forwarded to Features.select_features
        framesync -- forwarded to Features.select_features

    Returns:
        - X -- ndarray, or None when a repetition feature could not be
          computed (its lag matrix had no non-empty rows)

            feature matrix, stacked row-wise in this order:
            MFCC
            Chroma (shifted, column-normalized, log-scaled)
            Latent timbre repetition
            Latent chroma repetition
            Time index (frame times, and frame times / duration)
            Beat index (frame number, and frame number / number of frames)

        - dur -- float
            duration of the track in seconds

    The stored PCP features are not modified by this function.
    '''
    def compress_data(X, k):
        '''Project X onto at most k leading eigenvectors of X * X^T.

        Returns None when X has no rows.
        '''
        Xtemp = X.dot(X.T)
        if len(Xtemp) == 0:
            return None
        e_vals, e_vecs = np.linalg.eig(Xtemp)

        # Keep the real parts only and clip negative eigenvalues to zero
        e_vals = np.maximum(0.0, np.real(e_vals))
        e_vecs = np.real(e_vecs)

        # Sort by decreasing eigenvalue
        idx = np.argsort(e_vals)[::-1]

        e_vals = e_vals[idx]
        e_vecs = e_vecs[:, idx]

        # Truncate to k dimensions
        if k < len(e_vals):
            e_vals = e_vals[:k]
            e_vecs = e_vecs[:, :k]

        # Normalize by the leading singular value of X
        Z = np.sqrt(e_vals.max())

        if Z > 0:
            e_vecs = e_vecs / Z

        return e_vecs.T.dot(X)

    # Latent factor repetition features
    def repetition(X, metric='euclidean'):
        '''Compressed, median-filtered lag representation of the recurrence
        matrix of X (None when every row of it is empty).'''
        R = librosa.segment.recurrence_matrix(
            X, k=2 * int(np.ceil(np.sqrt(X.shape[1]))),
            width=REP_WIDTH, metric=metric, sym=False).astype(np.float32)

        P = scipy.signal.medfilt2d(librosa.segment.recurrence_to_lag(R),
                                   [1, REP_FILTER])

        # Discard empty rows.
        # This should give an equivalent SVD, but resolves some numerical
        # instabilities.
        P = P[P.any(axis=1)]

        return compress_data(P, N_REP)

    #########
    # Load the PCP (chroma) and MFCC features of the track
    pcp_obj = Features.select_features("pcp", file_struct, annot_beats,
                                       framesync)
    mfcc_obj = Features.select_features("mfcc", file_struct, annot_beats,
                                        framesync)
    chroma = pcp_obj.features
    mfcc = mfcc_obj.features
    beats = pcp_obj.frame_times
    dur = pcp_obj.dur
    n_frames = chroma.shape[0]

    ##########
    # Keep only as many frame times as there are feature frames
    B = beats[:n_frames]

    #########
    M = mfcc.T

    #########
    # Get the beat-sync chroma.
    # The shift is computed out of place: `chroma.T` is a view, so an
    # in-place `+=` would silently modify the features held by pcp_obj.
    C = chroma.T + (chroma.min() + 0.1)
    C = C / C.max(axis=0)
    C = 80 * np.log10(C)  # Normalize from -80 to 0

    # Time-stamp features
    N = np.arange(float(n_frames))

    #########
    # Structure (repetition) features

    # TODO:  This might fail if audio file (or number of beats) is too small
    R_timbre = repetition(librosa.feature.stack_memory(M))
    R_chroma = repetition(librosa.feature.stack_memory(C))
    if R_timbre is None or R_chroma is None:
        return None, dur

    # These arrays are freshly created by compress_data, so in-place
    # normalization does not affect any other object.
    R_timbre += R_timbre.min()
    R_timbre /= R_timbre.max()
    R_chroma += R_chroma.min()
    R_chroma /= R_chroma.max()

    # Stack it all up
    X = np.vstack([M, C, R_timbre, R_chroma, B, B / dur, N,
                   N / float(n_frames)])

    return X, dur
Example #6
0
def features(file_struct, annot_beats=False, framesync=False):
    '''Feature-extraction for audio segmentation

    Arguments:
        file_struct -- msaf.io.FileStruct
            paths to the input files in the Segmentation dataset
        annot_beats -- forwarded to Features.select_features
        framesync -- forwarded to Features.select_features

    Returns:
        - X -- ndarray, or None when a repetition feature could not be
          computed (its structure feature had no non-empty rows)

            feature matrix, stacked row-wise in this order:
            MFCC
            Chroma (shifted, column-normalized, log-scaled)
            Latent timbre repetition
            Latent chroma repetition
            Time index (frame times, and frame times / duration)
            Beat index (frame number, and frame number / number of frames)

        - dur -- float
            duration of the track in seconds

    The stored PCP features are not modified by this function.
    '''
    def compress_data(X, k):
        '''Project X onto at most k leading eigenvectors of X * X^T.

        Returns None when X has no rows.
        '''
        Xtemp = X.dot(X.T)
        if len(Xtemp) == 0:
            return None
        e_vals, e_vecs = np.linalg.eig(Xtemp)

        # Keep the real parts only and clip negative eigenvalues to zero
        e_vals = np.maximum(0.0, np.real(e_vals))
        e_vecs = np.real(e_vecs)

        # Sort by decreasing eigenvalue
        idx = np.argsort(e_vals)[::-1]

        e_vals = e_vals[idx]
        e_vecs = e_vecs[:, idx]

        # Truncate to k dimensions
        if k < len(e_vals):
            e_vals = e_vals[:k]
            e_vecs = e_vecs[:, :k]

        # Normalize by the leading singular value of X
        Z = np.sqrt(e_vals.max())

        if Z > 0:
            e_vecs = e_vecs / Z

        return e_vecs.T.dot(X)

    # Latent factor repetition features
    def repetition(X, metric='euclidean'):
        '''Compressed, median-filtered structure feature of the recurrence
        matrix of X (None when every row of it is empty).'''
        R = librosa.segment.recurrence_matrix(
            X,
            k=2 * int(np.ceil(np.sqrt(X.shape[1]))),
            width=REP_WIDTH,
            metric=metric,
            sym=False).astype(np.float32)

        P = scipy.signal.medfilt2d(librosa.segment.structure_feature(R),
                                   [1, REP_FILTER])

        # Discard empty rows.
        # This should give an equivalent SVD, but resolves some numerical
        # instabilities.
        P = P[P.any(axis=1)]

        return compress_data(P, N_REP)

    #########
    # Load the PCP (chroma) and MFCC features of the track
    pcp_obj = Features.select_features("pcp", file_struct, annot_beats,
                                       framesync)
    mfcc_obj = Features.select_features("mfcc", file_struct, annot_beats,
                                        framesync)
    chroma = pcp_obj.features
    mfcc = mfcc_obj.features
    beats = pcp_obj.frame_times
    dur = pcp_obj.dur
    num_frames = chroma.shape[0]

    ##########
    # Keep only as many frame times as there are feature frames
    B = beats[:num_frames]

    #########
    M = mfcc.T

    #########
    # Get the beat-sync chroma.
    # `chroma.T` is a view of the stored features, so the shift is done out
    # of place; an in-place `+=` would alter the data held by pcp_obj.
    C = chroma.T + (chroma.min() + 0.1)
    C = C / C.max(axis=0)
    C = 80 * np.log10(C)  # Normalize from -80 to 0

    # Time-stamp features
    N = np.arange(float(num_frames))

    #########
    # Structure (repetition) features

    # TODO:  This might fail if audio file (or number of beats) is too small
    R_timbre = repetition(librosa.feature.stack_memory(M))
    R_chroma = repetition(librosa.feature.stack_memory(C))
    if R_timbre is None or R_chroma is None:
        return None, dur

    # compress_data returns newly allocated arrays, so normalizing them in
    # place has no effect outside this function.
    R_timbre += R_timbre.min()
    R_timbre /= R_timbre.max()
    R_chroma += R_chroma.min()
    R_chroma /= R_chroma.max()

    # Stack it all up
    X = np.vstack(
        [M, C, R_timbre, R_chroma, B, B / dur, N, N / float(num_frames)])

    return X, dur