Example no. 1
    def __test(y, top_db, ref, trim_duration):
        yt, idx = librosa.effects.trim(y, top_db=top_db,
                                       ref=ref)

        # Test for index position
        fidx = [slice(None)] * y.ndim
        fidx[-1] = slice(*idx.tolist())
        assert np.allclose(yt, y[tuple(fidx)])

        # Verify logamp
        rms = librosa.feature.rms(y=librosa.to_mono(yt), center=False)
        logamp = librosa.power_to_db(rms**2, ref=ref, top_db=None)
        assert np.all(logamp > - top_db)

        # Verify logamp
        rms_all = librosa.feature.rms(y=librosa.to_mono(y)).squeeze()
        logamp_all = librosa.power_to_db(rms_all**2, ref=ref,
                                         top_db=None)

        start = int(librosa.samples_to_frames(idx[0]))
        stop = int(librosa.samples_to_frames(idx[1]))
        assert np.all(logamp_all[:start] <= - top_db)
        assert np.all(logamp_all[stop:] <= - top_db)

        # Verify duration
        duration = librosa.get_duration(y=yt)
        assert np.allclose(duration, trim_duration, atol=1e-1), duration
Example no. 2
    def __test(y, top_db, ref, trim_duration):
        yt, idx = librosa.effects.trim(y, top_db=top_db,
                                       ref=ref)

        # Test for index position
        fidx = [slice(None)] * y.ndim
        fidx[-1] = slice(*idx.tolist())
        assert np.allclose(yt, y[tuple(fidx)])

        # Verify logamp
        rms = librosa.feature.rms(y=librosa.to_mono(yt), center=False)
        logamp = librosa.power_to_db(rms**2, ref=ref, top_db=None)
        assert np.all(logamp > - top_db)

        # Verify logamp
        rms_all = librosa.feature.rms(y=librosa.to_mono(y)).squeeze()
        logamp_all = librosa.power_to_db(rms_all**2, ref=ref,
                                         top_db=None)

        start = int(librosa.samples_to_frames(idx[0]))
        stop = int(librosa.samples_to_frames(idx[1]))
        assert np.all(logamp_all[:start] <= - top_db)
        assert np.all(logamp_all[stop:] <= - top_db)

        # Verify duration
        duration = librosa.get_duration(y=yt)
        assert np.allclose(duration, trim_duration, atol=1e-1), duration
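For context, a minimal sketch of the relationship these tests exercise: librosa.effects.trim returns the trimmed signal plus a [start, end) sample interval, and samples_to_frames maps that interval onto the RMS frame grid. The sine-plus-silence signal below is invented for illustration.

    import numpy as np
    import librosa

    sr = 22050
    tone = 0.5 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
    y = np.concatenate([np.zeros(sr // 2), tone, np.zeros(sr // 2)])  # silence padding

    yt, idx = librosa.effects.trim(y, top_db=60)
    assert np.allclose(yt, y[idx[0]:idx[1]])  # idx is the [start, end) sample interval

    # the per-frame dB checks above index RMS frames via the same conversion
    start_frame = librosa.samples_to_frames(idx[0])
    stop_frame = librosa.samples_to_frames(idx[1])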
Example no. 3
    def _slice_audio_by_interval(y: np.ndarray,
                                 sr: float,
                                 hop_length: int = 512,
                                 segmentation_interval_s: float = 1.0,
                                 **_kwargs) -> Tuple[np.ndarray, np.ndarray]:
        interval_samples: int = librosa.time_to_samples(
            segmentation_interval_s, sr=sr)
        total_samples: int = y.size  # y is monophonic
        num_segments: int = int(np.ceil(total_samples / interval_samples))
        onset_samples: np.ndarray = interval_samples * np.arange(num_segments)
        onset_frames: np.ndarray = librosa.samples_to_frames(
            onset_samples, hop_length=hop_length)

        duration_samples: np.ndarray = interval_samples * np.ones_like(
            onset_frames)

        # adjust duration of last fragment to end of file; if `total_samples`
        # is not divisible by `interval_samples`, the last slice is shorter
        remainder = total_samples % interval_samples
        if remainder > 0:
            duration_samples[-1] = remainder

        duration_frames: np.ndarray = librosa.samples_to_frames(
            duration_samples, hop_length=hop_length)

        return onset_frames, duration_frames
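A quick usage sketch of the helper above (the 3.5 s mono buffer and the parameter values are invented, and the helper is assumed to be callable as a plain function):

    import numpy as np

    sr = 22050
    y = np.zeros(int(3.5 * sr))  # 3.5 s of audio, mono

    onsets, durations = _slice_audio_by_interval(y, sr, hop_length=512,
                                                 segmentation_interval_s=1.0)
    # four slices: three full 1 s intervals plus a 0.5 s remainder,
    # with onsets and durations both expressed in frames
    print(onsets, durations)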
Example no. 4
def gen_hihat(all_data, fs, fps, cand):
    # fps = librosa.samples_to_frames(fs, hop_length=hop_len, n_fft=win_len)  # overridden below
    fps = 100
    print(cand)
    proc = BeatTrackingProcessor(look_aside=0.2, fps=fps)
    act = RNNBeatProcessor()(all_data)
    beat_times = proc(act)

    song_len = librosa.samples_to_time(data.shape, sr=fs)[0]
    hihat = np.zeros(all_data.shape)
    idx = np.where(beat_times <= song_len)[0]
    new_beat_times = np.zeros(idx.shape)
    new_beat_times[idx] = beat_times[idx]
    beat_samples = librosa.time_to_samples(new_beat_times, sr=fs)
    start = librosa.frames_to_samples(cand[0], hop_length=hop_len, n_fft=win_len)
    end = librosa.frames_to_samples(cand[-1], hop_length=hop_len, n_fft=win_len)
    cand_len = end - start

    i = 3
    is_hihat = np.zeros(beat_samples.shape)
    while i < len(beat_samples):
        is_hihat[i] = 1
        i = i + 4
    for i, s in enumerate(beat_samples):
        if is_hihat[i] == 1:
            if s + cand_len > hihat.shape[0]:
                break
            hihat[s:s + cand_len] = data[start:end]

    return hihat, new_beat_times, beat_samples
Example no. 5
 def get_frame(self) -> int:
     if Beat.INDEX_VALUE == 'samples':
         return librosa.samples_to_frames(self.index, hop_length=util.HOP_LENGTH)
     elif Beat.INDEX_VALUE == 'time':
         return librosa.time_to_frames(self.index, sr=util.SAMPLE_RATE, hop_length=util.HOP_LENGTH)
     else:
         raise NotImplementedError("Only samples and time are supported")
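The two branches above are consistent: converting a beat position through samples or through seconds lands on the same frame. A sketch with assumed constants standing in for util.HOP_LENGTH and util.SAMPLE_RATE:

    import librosa

    HOP_LENGTH, SAMPLE_RATE = 512, 22050
    idx_samples = 44100                    # a beat position stored in samples
    idx_time = idx_samples / SAMPLE_RATE   # the same position in seconds

    f1 = librosa.samples_to_frames(idx_samples, hop_length=HOP_LENGTH)
    f2 = librosa.time_to_frames(idx_time, sr=SAMPLE_RATE, hop_length=HOP_LENGTH)
    assert f1 == f2  # both give frame 86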
Example no. 6
def trackBeatsPer16thNote(x,
                          bpm,
                          sr=22050,
                          hop_length=512,
                          offset_16th_notes=0):
    """
    clickで書き出すと16分音符毎にクリック音が鳴らせるようにビートトラッキングする
    16分音符毎にインデックスが割り当てられている
    offset_16th_notes(最初の16分音符の数)でアウフタクトの除去が可能
    """
    tempo, beat_samples = librosa.beat.beat_track(y=x,
                                                  sr=sr,
                                                  hop_length=hop_length,
                                                  start_bpm=bpm,
                                                  units='samples')
    beat_frames_per_16th_note = []
    for i in range(len(beat_samples) - 1):
        interval_per_16th_units = librosa.samples_to_frames(
            np.linspace(beat_samples[i], beat_samples[i + 1], 5),
            hop_length=hop_length)
        print(interval_per_16th_units[0:4])
        beat_frames_per_16th_note = np.hstack(
            (beat_frames_per_16th_note, interval_per_16th_units[0:4]))
    if offset_16th_notes > 0:
        beat_frames_per_16th_note = beat_frames_per_16th_note[
            offset_16th_notes:]
    return beat_frames_per_16th_note.astype(int)
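The subdivision step in isolation: np.linspace splits each beat-to-beat interval (in samples) into four 16th notes, and samples_to_frames floors them onto the frame grid. The beat positions below are invented:

    import numpy as np
    import librosa

    hop_length = 512
    beat_samples = np.array([0, 22050])  # two consecutive beats
    grid = np.linspace(beat_samples[0], beat_samples[1], 5)
    frames = librosa.samples_to_frames(grid, hop_length=hop_length)
    print(frames[0:4])  # the four 16th-note onsets: [ 0 10 21 32]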
Example no. 7
 def apply_trim_offset(self, frame):
     return (
         librosa.samples_to_frames(
             librosa.frames_to_samples(frame) + self.trim_offset
         )
         if self.trim_offset
         else frame
     )
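Worth noting about the round trip above: frames -> samples -> frames floors away any sub-hop part of the offset. A sketch with the default hop_length of 512 and an invented offset:

    import librosa

    frame = 10
    trim_offset = 300  # samples, illustrative
    shifted = librosa.samples_to_frames(
        librosa.frames_to_samples(frame) + trim_offset)
    print(shifted)  # still 10: (5120 + 300) // 512 == 10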
Example no. 8
 def __test(x, y, hop_length, n_fft):
     y_test = librosa.samples_to_frames(x,
                                        hop_length=hop_length,
                                        n_fft=n_fft)
     assert np.allclose(y, y_test)
     y = np.asanyarray(y)
     assert y.shape == y_test.shape
     assert y.ndim == y_test.ndim
Example no. 10
def convert_sample_to_nframes(y_sample_start, y_sample_end, **stft_args):
    n_fft = stft_args.get('n_fft', 2048)
    hop_length = stft_args.get('hop_length', n_fft // 4)
    return lr.samples_to_frames(np.array([y_sample_start, y_sample_end]),
                                hop_length=hop_length, n_fft=n_fft)
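When n_fft is given, samples_to_frames subtracts the window half-width before dividing, so early samples can map to negative frame indices. Illustrative values:

    import librosa

    # the offset is n_fft // 2 == 1024 samples
    print(librosa.samples_to_frames(0, hop_length=512, n_fft=2048))      # -2
    print(librosa.samples_to_frames(44100, hop_length=512, n_fft=2048))  # 84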
Example no. 11
def get_downbeats(y, tempo, beat_frames, sr):
    measures = len(beat_frames) // BEATS
    beat_frames = librosa.samples_to_frames(beat_frames)
    onset_env = librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median)
    beat_strengths = onset_env[beat_frames]
    measure_beat_strengths = beat_strengths[:measures * BEATS].reshape(
        -1, BEATS)
    beat_pos_strength = np.sum(measure_beat_strengths, axis=0)
    downbeat_pos = np.argmax(beat_pos_strength)
    full_measure_beats = beat_frames[:measures * BEATS].reshape(-1, BEATS)
    downbeat_frames = full_measure_beats[:, downbeat_pos]
    return librosa.frames_to_samples(downbeat_frames)
Example no. 12
def test_samples_to_frames(samples, hop_length, n_fft):

    frames = librosa.samples_to_frames(samples,
                                       hop_length=hop_length,
                                       n_fft=n_fft)
    samples = np.asanyarray(samples)
    assert frames.shape == samples.shape
    assert frames.ndim == samples.ndim
    if n_fft is None:
        assert np.allclose(samples, frames * hop_length)
    else:
        assert np.allclose((samples - n_fft // 2) // hop_length, frames)
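Making the assertions concrete with one worked value (chosen for illustration):

    import librosa

    # with n_fft, frames == (samples - n_fft // 2) // hop_length
    assert librosa.samples_to_frames(5120, hop_length=512, n_fft=2048) == 8
    # without n_fft the same sample maps to frame 5120 // 512 == 10
    assert librosa.samples_to_frames(5120, hop_length=512) == 10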
Example no. 13
def read_data():
    os.chdir(DATA_PATH)
    os.chdir('Gitarre monophon/Samples/Distortion')
    train_data = []
    train_labels = []
    for file_name in os.listdir(os.getcwd()):
        if file_name.endswith(".wav"):
            print(file_name)
            # Labeling the sample with one hot encoding
            label_no = int(file_name[13])  # Effect setting is the label
            label = np.zeros([3])
            label[label_no - 1] = 1
            train_labels.append(label)

            # Loading the audio
            y, sr = librosa.load(file_name, sr=44100)
            # Onset Detection
            y = np.insert(y, 0, np.zeros(1023))
            y = librosa.util.normalize(y)

            onset_frame = librosa.onset.onset_detect(y=y,
                                                     sr=sr,
                                                     units='frames',
                                                     pre_max=20000,
                                                     post_max=20000,
                                                     pre_avg=20000,
                                                     post_avg=20000,
                                                     delta=0,
                                                     wait=1000)
            offset_frame = librosa.samples_to_frames(samples=y.shape[0])
            onset_sample = librosa.core.frames_to_samples(onset_frame[0])
            offset_sample = librosa.core.frames_to_samples(offset_frame)
            y_cut = y[onset_sample:offset_sample]

            mfcc = librosa.feature.mfcc(y=y_cut, sr=sr, n_mfcc=2)
            mfcc_delta = librosa.feature.delta(mfcc)
            m_features = np.concatenate((mfcc, mfcc_delta))
            v_features = []
            for feat in m_features:
                lin_coeff, lin_residual, _, _, _ = np.polyfit(np.arange(
                    len(feat)),
                                                              feat,
                                                              1,
                                                              full=True)
                v_features.extend(lin_coeff)
                # v_features.append(lin_residual)
            train_data.append(np.hstack(v_features))
    train_data = np.array(train_data)
    train_labels = np.array(train_labels)
    return train_data, train_labels
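The onset/offset bookkeeping above reduces to this pattern: the signal length in samples is floored to a frame index, then mapped back to samples so the cut lands on a frame boundary. Sketch with an invented signal:

    import numpy as np
    import librosa

    y = np.random.randn(44100).astype(np.float32)  # 1 s of noise, illustrative
    offset_frame = librosa.samples_to_frames(samples=y.shape[0])
    offset_sample = librosa.frames_to_samples(offset_frame)
    y_cut = y[:offset_sample]  # drops the trailing partial frame
    print(offset_frame, offset_sample)  # 86 44032 at the default hop of 512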
Example no. 14
 def forward(self, signals, lengths):
     mel_features = self.mfcc(signals)
     if self.remove_zeroth_coef:
         mel_features = mel_features[:, 1:, :]
     device = lengths.device
     lengths_frames = librosa.samples_to_frames(lengths.cpu().numpy(),
                                                hop_length=self.hop_length,
                                                n_fft=self.n_fft)
     lengths_frames = torch.Tensor(lengths_frames).to(device).int()
     if self.use_deltas:
         delta = self.deltas(mel_features)
         delta2 = self.deltas(delta)
         mel_features = torch.cat((mel_features, delta, delta2), dim=-2)
     if self.normalize_features:
         mel_features = self.norm(mel_features)
     return mel_features, lengths_frames
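The same conversion works on batched lengths, which is what the forward pass above relies on (the hop_length/n_fft values here are assumed):

    import numpy as np
    import librosa

    hop_length, n_fft = 256, 1024
    lengths = np.array([16000, 8000])  # per-sequence lengths in samples
    lengths_frames = librosa.samples_to_frames(lengths,
                                               hop_length=hop_length,
                                               n_fft=n_fft)
    print(lengths_frames)  # [60 29]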
Example no. 15
    def _compute_slice_durations(y: np.ndarray,
                                 sr: float,
                                 hop_length: float,
                                 onsets: np.ndarray,
                                 min_size_s: Optional[float] = None,
                                 max_size_s: Optional[float] = None,
                                 off_threshold_db: Optional[float] = None,
                                 discard_by_mean: bool = True,
                                 **_kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """ y: mono signal [shape: (n,)]
            onsets: onset frames """
        rms_frames_db = 20 * np.log10(
            np.abs(librosa.feature.rms(y=y, hop_length=hop_length)) +
            librosa.util.tiny(y)).reshape(-1)
        eof = librosa.samples_to_frames(y.size, hop_length=hop_length)
        durations = np.diff(np.block([onsets, eof]))

        if max_size_s is not None:
            max_size_frames = librosa.time_to_frames(max_size_s,
                                                     sr=sr,
                                                     hop_length=hop_length)
            durations[durations > max_size_frames] = max_size_frames

        if off_threshold_db is not None:
            for i in range(onsets.size):
                segment_rms = rms_frames_db[onsets[i]:onsets[i] + durations[i]]
                first_silent_frame = np.argmax(segment_rms < off_threshold_db)
                # Only discard part of segment if mean of entire part to be discarded is below threshold
                if discard_by_mean:
                    if np.mean(segment_rms[first_silent_frame:]
                               ) < off_threshold_db:
                        durations[i] = first_silent_frame
                # Discard part of segment starting from frame below threshold
                else:
                    # `np.argmax(a < v)` will by default return 0 if it doesn't find any matches:
                    # therefore the check that the condition indeed is fulfilled.
                    durations[i] = first_silent_frame if segment_rms[first_silent_frame] < off_threshold_db else \
                        durations[i]

        if min_size_s is not None:
            min_size_frames = librosa.time_to_frames(min_size_s,
                                                     sr=sr,
                                                     hop_length=hop_length)
            valid_frames_mask = durations > min_size_frames
            onsets = onsets[valid_frames_mask]
            durations = durations[valid_frames_mask]

        return onsets, durations
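The duration bookkeeping above in isolation: the end-of-file frame closes the last segment, and np.diff turns onset frames into per-segment durations. The onsets below are invented:

    import numpy as np
    import librosa

    hop_length = 512
    onsets = np.array([3, 40])  # onset frames, illustrative
    eof = librosa.samples_to_frames(44100, hop_length=hop_length)
    durations = np.diff(np.block([onsets, eof]))
    print(eof, durations)  # 86 [37 46]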
Example no. 16
    def retrieve_components(self, selection_order=None):
        if selection_order is None:
            return self.spectrogram

        if len(selection_order) > 0:
            max_val = max(selection_order)
            if max_val >= self.get_number_components():
                raise ValueError("{} out of bounds for {} components".format(
                    max_val, self.get_number_components()))

        mask = torch.zeros_like(self.spectrogram)
        unmask = torch.ones_like(self.spectrogram)

        # following the order of segments in [Mishra 2017] Figure 4
        temp_length = mask.shape[1] // len(self.temporal_segments)
        freq_length = mask.shape[0] // self.n_frequency_segments

        left_over = mask.shape[1] - temp_length * len(self.temporal_segments)
        if left_over > 0:
            warnings.warn(
                "Adding last {} frames to last segment".format(left_over))

        def compute_f_start(f):
            return f * freq_length

        def compute_f_end(f):
            return compute_f_start(f) + freq_length

        for so in selection_order:
            t = so // self.n_frequency_segments  # index of temporal_segment
            # print("t", t)
            f = so % self.n_frequency_segments

            [t_start,
             t_end] = librosa.samples_to_frames(self.temporal_segments[t],
                                                hop_length=self.hop_length)
            if t == len(self.temporal_segments) - 1:
                t_end = mask.shape[1]
            # print("t_start {}, t_end{}".format(t_start, t_end))
            f_start = compute_f_start(f)
            f_end = compute_f_end(f)
            mask[f_start:f_end, t_start:t_end] = 1.
            unmask[f_start:f_end, t_start:t_end] = 0.

        return self.spectrogram * mask + self.baseline * unmask
Example no. 17
def length_convert(length: float,
                   sr: int,
                   units_def: LengthUnit,
                   units_target: LengthUnit,
                   hop_length: int = 512) -> float:
    """Convert length from one unit to another.

    Parameters
    ----------
    length : float
        Length expressed in `units_def` units
    sr : int
    units_def : LengthUnit
        Units that are passed
    units_target : LengthUnit
        Units that are expected
    hop_length : int, optional
        512 by default, mandatory for frames conversion

    Returns
    -------
    float
    """
    if units_def == LengthUnit.samples:
        if units_target == LengthUnit.frames:
            return lr.samples_to_frames(length, hop_length=hop_length)  # type:ignore
        if units_target == LengthUnit.ms:
            return lr.samples_to_time(length, sr=sr)  # type:ignore
        return length
    if units_def == LengthUnit.ms:
        if units_target == LengthUnit.samples:
            return lr.time_to_samples(length, sr=sr)  # type:ignore
        if units_target == LengthUnit.frames:
            return lr.time_to_frames(length, sr=sr, hop_length=hop_length)  # type:ignore
        return length
    if units_def == LengthUnit.frames:
        if units_target == LengthUnit.samples:
            return lr.frames_to_samples(length, hop_length=hop_length)  # type:ignore
        if units_target == LengthUnit.ms:
            return lr.frames_to_time(length, sr=sr, hop_length=hop_length)  # type:ignore
        return length
    raise TypeError(f'not a LengthUnit: {units_def, units_target}')
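A usage sketch for the converter above (assuming LengthUnit is an enum with samples, ms, and frames members, as the branches imply):

    n_frames = length_convert(22050, sr=22050,
                              units_def=LengthUnit.samples,
                              units_target=LengthUnit.frames,
                              hop_length=512)
    print(n_frames)  # 43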
Example no. 18
def process_damp_data(artist_tracks_file):
    sys.path.append('../')
    import damp_config

    damp_data_dir = damp_config.vocal_audio_dir
    musdb_data_dir = damp_config.bg_audio_dir
    # musdb_data_dir = 'background_tracks'

    if not os.path.exists('damp_mashup_output'):
        os.makedirs('damp_mashup_output')

    train_dict = pickle.load(open(artist_tracks_file, 'rb'))

    vocal_paths = []
    for artist_id, track_list in train_dict.items():
        for track_id in track_list:
            vocal_track_path = os.path.join(damp_data_dir, track_id + '.m4a')

            mashability_result = find_mashup_pairs(vocal_track_path,
                                                   musdb_data_dir)

            for start_sample, (bg_track,
                               (bg_start_sample, bg_key,
                                bg_ismajor)) in mashability_result.items():
                print(start_sample, bg_track, bg_start_sample)
                mixed_output = mash(vocal_track_path, start_sample, bg_track,
                                    bg_start_sample, 3.0)
                start_frame = librosa.samples_to_frames(
                    start_sample,
                    hop_length=damp_config.hop_length,
                    n_fft=damp_config.n_fft)
                # librosa.output.write_wav(os.path.join(config.mix_audio_dir, Path(vocal_path).stem + '_' + str(start_frame) +'.wav'), mixed_output, sr=AUDIO_PARAMS['sr'])
                print(track_id, start_sample / 44100, start_frame)
                soundfile.write(os.path.join(
                    config.mix_audio_dir,
                    track_id + '_' + str(start_frame) + '.wav'),
                                mixed_output,
                                AUDIO_PARAMS['sr'],
                                format='WAV')
Example no. 19
def main_autoedit(args, **kwargs):
    """main_autoedit

    Complete autoedit flow

    .. todo::
    - loop over chunks of input, batch is large single chunk
    - handle returned chunk data, integrate over time
    - chunk parallel processing (entire graph) vs. chunk serial processing (entire graph)
    - nodes with memory and nodes without

    - graph class
    - populate 'func' in graph w/ cached/non-cached funcs
    - step file / stream input: deal with chunking and collecting, refuse to work on files > maxlen (configurable)
    - openl3
    """
    # convert args to dict
    # kwargs = args_to_dict(args)

    # convert arguments to locals, TODO: config file for autoedit param dict
    # sr_comp = kwargs['sr_comp'] # 22050
    # numsegs = kwargs['numsegs'] # 10
    # duration = kwargs['duration'] # 10
    # verbose = kwargs['verbose']
    # seglen_min = time_to_frames(kwargs['seglen_min'])
    # seglen_max = time_to_frames(kwargs['seglen_max'])

    args = autoedit_args_check(args)

    seglen_min = time_to_frames(args.seglen_min)
    seglen_max = time_to_frames(args.seglen_max)
    timebase = "frames"
    spacer = '\n    '

    # caching
    # compute_music_extractor_essentia_cached = memory.cache(compute_music_extractor_essentia)

    # computation graph g
    g = OrderedDict()

    # populate graph with functions
    g['func'] = {}
    for func in [
            compute_beats_librosa,
            compute_chroma_librosa,
            compute_onsets_librosa,
            compute_segments_essentia,
            compute_segments_librosa,
            data_load_essentia,
            data_load_librosa,
    ]:
        g['func'][func] = memory.cache(func)

    for func in [
            compute_event_merge_combined,
            track_assemble_from_segments,
            track_assemble_from_segments_sequential_scale,
    ]:
        g['func'][func] = func

    # layer 1: file data
    g['l1_files'] = OrderedDict()
    for filename in args.filenames:
        # replace with basename
        filename_short = filename.split('/')[-1]
        if args.verbose:
            print(('main_autoedit{1}filename_short: {0}'.format(
                filename_short, spacer)))
        # files[filename_short] = compute_tempo_beats(filename)
        # load data
        # y, sr = data_load_essentia_cached(filename)
        # compute beatiness on data
        g['l1_files'][filename_short] = {}
        tmp_ = g['func'][data_load_essentia](filename, sr=args.sr_comp)
        # tmp_ = g['func'][data_load_librosa](filename, sr=args.sr_comp)
        g['l1_files'][filename_short]['data'] = tmp_[0]
        g['l1_files'][filename_short]['numsamples'] = len(tmp_[0])
        g['l1_files'][filename_short]['numframes'] = samples_to_frames(
            len(tmp_[0]))
        g['l1_files'][filename_short]['sr'] = tmp_[1]
        if args.verbose:
            print(
                'main_autoedit{5}loaded {0} with shape {1}, numsamples {2}, numframes {3}, sr {4}'
                .format(filename_short,
                        g['l1_files'][filename_short]['data'].shape,
                        g['l1_files'][filename_short]['numsamples'],
                        g['l1_files'][filename_short]['numframes'],
                        g['l1_files'][filename_short]['sr'], spacer))

    # layer 2: compute chromagram
    g['l2_chromagram'] = {}
    for file_ in g['l1_files']:
        # file_key = '{0}-{1}'.format(file_, 'chromagram')
        g['l2_chromagram'][file_] = {}
        g['l2_chromagram'][file_]['data'] = g['func'][compute_chroma_librosa](
            g['l1_files'][file_]['data'], args.sr_comp)['chromagram']

    # layer 3: compute segments based on chromagram
    g['l3_segments'] = OrderedDict()
    for file_ in g['l2_chromagram']:
        # file_key = '{0}-{1}'.format(file_, 'segments')
        bounds_frames = g['func'][compute_segments_essentia](
            g['l2_chromagram'][file_]['data'], args.sr_comp,
            args.numsegs)['bounds_frames']
        # print(('    file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames))))
        g['l3_segments'][file_] = {}
        g['l3_segments'][file_]['seg_sbic'] = np.clip(bounds_frames, 0, [
            g['l1_files'][filename_short]['numframes']
            for filename_short in g['l1_files']
        ][0] - 1)

        bounds_frames = g['func'][compute_segments_librosa](
            g['l2_chromagram'][file_]['data'], args.sr_comp,
            args.numsegs)['bounds_frames']
        # print(('    file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames))))
        g['l3_segments'][file_]['seg_clust_1'] = bounds_frames

        bounds_frames = g['func'][compute_segments_librosa](
            g['l2_chromagram'][file_]['data'], args.sr_comp,
            args.numsegs + 5)['bounds_frames']
        # print(('    file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames))))
        g['l3_segments'][file_]['seg_clust_2'] = bounds_frames

    # layer 4: compute onsets
    g['l4_onsets'] = OrderedDict()
    for file_ in g['l1_files']:
        onsets = g['func'][compute_onsets_librosa](
            g['l1_files'][file_]['data'], args.sr_comp)
        g['l4_onsets'][file_] = onsets

    # layer 5: compute beats based on onsets
    g['l5_beats'] = OrderedDict()
    for file_ in g['l4_onsets']:
        g['l5_beats'][file_] = {}
        for start_bpm in [60, 90, 120]:
            beats = g['func'][compute_beats_librosa](
                g['l4_onsets'][file_]['onsets_env'],
                g['l4_onsets'][file_]['onsets_frames'], start_bpm,
                args.sr_comp)
            # print('beats type = {0}'.format(type(beats['beats'])))
            # beats['beats'] = beats['beats'][np.logical_not(np.isnan(beats['beats']))]
            # beats = beats[~np.isnan(beats)]
            # print('    file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames)))
            g['l5_beats'][file_]['beats_{0}'.format(
                start_bpm)] = beats['beats']
            g['l5_beats'][file_]['beats_{0}_16'.format(
                start_bpm)] = beats['beats'][::16]

    # layer 6: compute final segments from merging segments with beats
    g['l6_merge'] = OrderedDict()
    g['l6_merge']['files'] = []
    for file_ in g['l1_files']:
        # get basedir from filename (`filename` still holds the last entry from the layer-1 loop)
        dirname = os.path.dirname(filename)
        # return realpath absolute path
        # dirname = os.path.dirname(os.path.realpath(filename))
        if dirname == '':
            dirname = '.'

        if args.verbose:
            print(f'main_autoedit dirname {dirname}')
            print(
                f'main_autoedit{spacer}l6_merge file_ {file_}, dirname {dirname}, filename {filename}'
            )
        beats_keys = ['beats_60', 'beats_90', 'beats_120'
                      ] + ['beats_60_16', 'beats_90_16', 'beats_120_16']
        # beats = [g['l5_beats'][file_][beat_type] for beat_type in beats_keys for file_ in g['l1_files']]
        beats = [g['l5_beats'][file_][beat_type] for beat_type in beats_keys]
        # segs = [g['l3_segments'][file_][seg_type_] for seg_type_ in ['seg_sbic', 'seg_clust_1', 'seg_clust_2'] for file_ in g['l1_files']]
        segs = [
            g['l3_segments'][file_][seg_type_]
            for seg_type_ in ['seg_sbic', 'seg_clust_1', 'seg_clust_2']
        ]
        numframes = g['l1_files'][file_]['numframes']
        # compute
        if args.verbose:
            print(
                f'main_autoedit{spacer}l6_merge dirname {dirname}, filename {filename}'
            )
        files = g['func'][compute_event_merge_combined](
            filename_48=dirname + '/' + file_,
            beats=beats,
            segs=segs,
            numframes=numframes,
            numsegs=args.numsegs,
            verbose=args.verbose,
            sr_comp=args.sr_comp,
            rootdir=args.rootdir,
        )

        g['l6_merge']['files'].extend(files['files'])
        if args.verbose:
            print('main_autoedit{2}l6_merge {0}, {1}'.format(
                file_, g['l6_merge']['files'], spacer))

    # layer 7: compute assembled song from segments and duration
    g['l7_assemble'] = OrderedDict()
    # compute duration
    g['l6_merge']['duration'] = args.duration
    # output filename
    g['l6_merge']['filename_export'] = args.filename_export
    # crossfade argument
    g['l6_merge']['assemble_crossfade'] = args.assemble_crossfade
    # rootdir argument
    g['l6_merge']['rootdir'] = args.rootdir
    g['l6_merge']['verbose'] = args.verbose

    if args.assemble_mode == 'random':
        g['l7_assemble']['outfile'] = g['func'][track_assemble_from_segments](
            **(g['l6_merge']))
    elif args.assemble_mode == 'sequential':
        g['l7_assemble']['outfile'] = g['func'][
            track_assemble_from_segments_sequential_scale](**(g['l6_merge']))

    filename_export_wav = g['l7_assemble']['outfile']['filename_export_wav']
    filename_export_txt = g['l7_assemble']['outfile']['filename_export_txt']
    export_duration = g['l7_assemble']['outfile']['final_duration']
    export_segs = g['l7_assemble']['outfile']['seg_s']
    export_numsegs = len(g['l7_assemble']['outfile']['seg_s'])

    if 'pkl' in args.outputs:
        filename_export_graph = os.path.join(
            args.rootdir, f'{args.filename_export}_graph.pkl')
        if args.verbose:
            print(
                f'main_autoedit{spacer}exporting graph to {filename_export_graph}'
            )
        joblib.dump(g, filename_export_graph)

    # # plot dictionary g as graph
    # autoedit_graph_from_dict(g=g, plot=False)

    ret = {
        'data': {
            'output_files': [
                {
                    'format': 'wav',
                    'filename': os.path.basename(filename_export_wav)
                },
                {
                    'format': 'txt',
                    'filename': os.path.basename(filename_export_txt)
                },
            ],
            'output_length':
            export_duration,
            'output_numsegs':
            export_numsegs,
        }
    }

    if 'pkl' in args.outputs:
        ret['data']['output_files'].append({
            'format': 'pkl',
            'filename': filename_export_graph
        })

    # # yeah nice, should be obsolete
    # ret.update(g['l7_assemble']['outfile'])

    filename_result = os.path.join(
        args.rootdir,
        os.path.basename(args.filename_export) + ".json")

    # this saves the array in .json format
    json.dump(
        ret,
        codecs.open(filename_result, 'w', encoding='utf-8'),
        # separators=(',', ':'),
        # sort_keys=True,
        # indent=4,
        # cls=NumpyEncoder,
    )

    if 'task' in kwargs:
        kwargs['task'].set_done(
            result_location=os.path.basename(args.filename_export) + ".json")

    return ret
Example no. 20
    def find_loop_pairs(self):
        runtime_start = time.time()

        S = librosa.core.stft(y=self.audio)
        S_power = np.abs(S) ** 2
        S_weighed = librosa.core.perceptual_weighting(
            S=S_power, frequencies=librosa.fft_frequencies(sr=self.rate)
        )
        mel_spectrogram = librosa.feature.melspectrogram(S=S_weighed, sr=self.rate, n_mels=128, fmax=8000)
        chroma = librosa.feature.chroma_stft(S=S_power)
        power_db = librosa.power_to_db(S_weighed, ref=np.median)

        onset_env = librosa.onset.onset_strength(S=mel_spectrogram)

        pulse = librosa.beat.plp(onset_envelope=onset_env)
        beats_plp = np.flatnonzero(librosa.util.localmax(pulse))
        bpm, beats = librosa.beat.beat_track(onset_envelope=onset_env)

        beats = np.union1d(beats, beats_plp)
        beats = np.sort(beats)

        logging.info("Detected {} beats at {:.0f} bpm".format(beats.size, bpm))

        min_duration = int(chroma.shape[-1] * self.min_duration_multiplier)

        runtime_end = time.time()
        prep_time = runtime_end - runtime_start
        logging.info("Finished initial audio processing in {:.3}s".format(prep_time))

        candidate_pairs = []

        deviation = np.linalg.norm(chroma[..., beats] * 0.085, axis=0)

        for idx, loop_end in enumerate(beats):
            for loop_start in beats:
                if loop_end - loop_start < min_duration:
                    break
                dist = np.linalg.norm(chroma[..., loop_end] - chroma[..., loop_start])
                if dist <= deviation[idx]:
                    db_diff = self.db_diff(
                        power_db[..., loop_end], power_db[..., loop_start]
                    )
                    if db_diff <= 1.5:
                        candidate_pairs.append(
                            {
                                "loop_start": loop_start,
                                "loop_end": loop_end,
                                "dB_diff": db_diff,
                                "dist": (dist / deviation[idx])
                            }
                        )

        logging.info(f"Found {len(candidate_pairs)} possible loop points")

        if not candidate_pairs:
            return candidate_pairs

        beats_per_second = bpm / 60
        num_test_beats = 12
        seconds_to_test = num_test_beats / beats_per_second
        test_offset = librosa.samples_to_frames(int(seconds_to_test * self.rate))

        # adjust offset for very short tracks to 25% of its length
        if test_offset > chroma.shape[-1]:
            test_offset = chroma.shape[-1] // 4

        candidate_pairs = self._dB_prune(candidate_pairs)

        weights = _geometric_weights(test_offset, start=test_offset // num_test_beats)
        pair_score_list = [
            self._pair_score(
                pair["loop_start"],
                pair["loop_end"],
                chroma,
                test_duration=test_offset,
                weights=weights,
            )
            for pair in candidate_pairs
        ]

        # Add cosine similarity as score
        for pair, score in zip(candidate_pairs, pair_score_list):
            pair["score"] = score

        candidate_pairs = self._score_prune(candidate_pairs)

        # re-sort based on new score
        candidate_pairs = sorted(candidate_pairs, reverse=True, key=lambda x: x["score"])

        # prefer longer loops for highly similar sequences
        if len(candidate_pairs) > 1:
            self._prioritize_duration(candidate_pairs)

        if self.trim_offset:
            for pair in candidate_pairs:
                pair["loop_start"] = self.apply_trim_offset(
                    pair["loop_start"]
                )
                pair["loop_end"] = self.apply_trim_offset(
                    pair["loop_end"]
                )

        for pair in candidate_pairs:
            logging.info(
                "Found from {} to {}, dB_diff:{}, similarity:{}".format(
                    pair["loop_start"],
                    pair["loop_end"],
                    pair["dB_diff"],
                    pair["score"],
                )
            )

        if not candidate_pairs:
            raise LoopNotFoundError(f'No loop points found for {self.filename} with current parameters.')
        else:
            return candidate_pairs
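The test_offset computation above is equivalent to going through time_to_frames directly; a sketch with invented numbers:

    import librosa

    rate, seconds_to_test = 44100, 6.0
    via_samples = librosa.samples_to_frames(int(seconds_to_test * rate))
    via_time = librosa.time_to_frames(seconds_to_test, sr=rate)
    assert via_samples == via_time  # both 516 at the default hop of 512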
Example no. 21
 def __test(x, y, hop_length, n_fft):
     y_test = librosa.samples_to_frames(x,
                                        hop_length=hop_length,
                                        n_fft=n_fft)
     assert np.allclose(y, y_test)
Example no. 22
def main_automix(args):
    """main_automix

    Perform complete automix flow with the following schema:
    
    1. input list of audio files / text file containing list of audio files
    2. loop over files
    2.1. compute bag of measures for each file: beatiness, extractor essentia, features paa

    2.2. sort files by selected feature args.sort_feature
    2.3. assemble output wav from concatenating input files pydub

    2.4. TODO: optional: local measures
    2.4. TODO: optional: complexity / information measures smp/sequence
    """
    # convert args to dict
    kwargs = args_to_dict(args)

    print('main_automix: kwargs {0}'.format(pformat(kwargs)))

    # flow graph g
    g = OrderedDict()

    # cached functions
    g['func'] = {}
    for func in [
            compute_beats_librosa,
            compute_chroma_librosa,
            compute_features_paa,
            compute_music_extractor_essentia,
            compute_onsets_librosa,
            compute_segments_essentia,
            compute_segments_librosa,
            compute_tempo_beats_essentia,
            data_load_essentia,
    ]:
        g['func'][func] = memory.cache(func)

    # uncached functions
    for func in [
            compute_event_merge_combined,
            track_assemble_from_segments,
    ]:
        g['func'][func] = func


    # input type: text file, list of files
    if len(kwargs['filenames']) == 1 and kwargs['filenames'][0].endswith('.txt'):
        filenames = [_.rstrip() for _ in open(kwargs['filenames'][0], 'r').readlines()]
        # print('filenames {0}'.format(pformat(filenames)))
        print('filenames {0}'.format(filenames))
    else:
        filenames = kwargs['filenames']

    # layer 1: file/chunk data
    g['l1_files'] = OrderedDict()
    for i, filename in enumerate(filenames):
        # print('filename {0}: {1}'.format(i, filename))
        
        filename_short = filename.split('/')[-1]
        print(('file: {0}'.format(filename_short)))
        # load data
        # y, sr = g['func'][data_load_essentia](filename)
        g['l1_files'][filename_short] = {}
        tmp_ = g['func'][data_load_essentia](filename)
        g['l1_files'][filename_short]['path'] = filename
        g['l1_files'][filename_short]['data'] = tmp_[0]
        g['l1_files'][filename_short]['numframes'] = samples_to_frames(len(tmp_[0]))
        g['l1_files'][filename_short]['sr'] = tmp_[1]

    # layer 2: beatiness, compute beatiness on data
    # g['l2_beatiness'] = {}
    for file_ in g['l1_files']:
        # file_key = '{0}-{1}'.format(file_, 'beatiness')
        # g['l2_beatiness'][file_] = {}
        tmp_ = g['func'][compute_tempo_beats_essentia](g['l1_files'][file_]['data'])
        # g['l2_beatiness'][file_] = tmp_
        # g['l1_files'][file_]['beatiness'] = tmp_
        g['l1_files'][file_].update(dict([('beatiness' + _, tmp_[_]) for _ in tmp_]))
    
    # layer 3: extractor
    # g['l3_extractor'] = {}
    for file_ in g['l1_files']:
        print('l3_extractor on {0}'.format(file_))
        # file_key = '{0}-{1}'.format(file_, 'extractor')
        # g['l2_extractor'][file_] = {}
        tmp_ = g['func'][compute_music_extractor_essentia](g['l1_files'][file_]['path'])
        # g['l3_extractor'][file_] = tmp_
        # g['l1_files'][file_]['extractor'] = tmp_
        g['l1_files'][file_].update(dict([('extractor_' + _, tmp_[_]) for _ in tmp_]))
    
    # layer 4: paa features
    # g['l4_paa_features'] = {}
    for file_ in g['l1_files']:
        # file_key = '{0}-{1}'.format(file_, 'extractor')
        # g['l4_paa_features'][file_] = {}
        tmp_ = g['func'][compute_features_paa](g['l1_files'][file_]['path'])
        # g['l4_paa_features'][file_]['features_st'] = dict(zip(tmp_[1], tmp_[0]))
        # g['l4_paa_features'][file_]['features_mt'] = dict(zip(tmp_[1], tmp_[2]))
        g['l1_files'][file_].update(dict(zip(['features_st_' + _ for _ in tmp_[1]], [_.mean() for _ in tmp_[0]])))
        g['l1_files'][file_].update(dict(zip(['features_mt_' + _ for _ in tmp_[1]], [_.mean() for _ in tmp_[2]])))
        # g['l1_files'][file_]['features_mt'] = dict(zip(tmp_[1], tmp_[2]))

    # layer 5: 

    pickle.dump(g, open('g.pkl', 'wb'))
    
    # print('files {0}'.format(pformat(files)))
    # plot dictionary g as graph
    autoedit_graph_from_dict(g=g, plot=False)

    l1_files_df = pd.DataFrame.from_dict(g['l1_files']).T

    # sort_key = 'features_mt_energy_entropy_mean'
    # sort_key = 'features_mt_energy_mean'
    # sort_key = 'features_mt_spectral_centroid_mean'
    # sort_key = 'features_mt_spectral_entropy_mean'
    # sort_key = 'features_mt_spectral_flux_mean'
    # sort_key = 'features_mt_spectral_rolloff_mean'
    # sort_key = 'features_mt_spectral_spread_mean'
    # sort_key = 'features_mt_zcr_mean'
    sort_key = kwargs['sorter']
    
    print('Sorting l1_files by {0}'.format(l1_files_df.sort_values(sort_key, ascending=False).path.to_string()))
    l1_files_df.sort_values(sort_key, ascending=False).path.to_csv('automix-assembled-{0}-{1}.{2}'.format(3, sort_key, 'csv'))

    if args.write:
        track_assemble_from_segments_sequential(files=list(l1_files_df.sort_values(sort_key, ascending=False).path),
                                                output_filename='automix-assembled-{0}-{1}.{2}'.format(3, sort_key, 'wav'),
                                                duration=None)
Example no. 24
test.extend(drum[int(0.5 * drum.shape[0]):])
test.extend(drum)
test.extend(beattt)
test.extend(sapce)
test.extend(sapce)
test.extend(drum)
test.extend(sapce)
test.extend(beattt)
test.extend(sapce)
test.extend(drum)
test.extend(drum[int(0.5 * drum.shape[0]):])
sss = np.zeros(data.shape)
sss[:np.array(test).shape[0]] = np.array(test)
#sd.play(sss*5+data*5, fs)
'''gen drum'''
# fps = librosa.samples_to_frames(fs, hop_length=hop_len, n_fft=win_len)  # overridden below
fps = 100
print(fps)
proc = BeatTrackingProcessor(look_aside=0.2, fps=fps)
act = RNNBeatProcessor()(all_data)
beat_times = proc(act)

song_len = librosa.samples_to_time(data.shape, sr=fs)[0]
beat = np.zeros(all_data.shape)
idx = np.where(beat_times <= song_len)[0]
new_beat_times = np.zeros(idx.shape)
new_beat_times[idx] = beat_times[idx]

beat_samples = librosa.time_to_samples(new_beat_times, sr=fs)
cand_len = len(drum)
Example no. 25
def read_data(path_folder):
    """Reads sample data from files and extracts features"""
    os.chdir(DATA_PATH)
    sample_paths = [
        'Gitarre monophon/Samples/NoFX', 'Gitarre polyphon/Samples/NoFX'
    ]
    train_data = []
    train_labels = []
    for path in sample_paths:
        sample_path = os.path.join(path_folder, path)
        os.chdir(sample_path)
        for file_name in os.listdir(os.getcwd()):
            if file_name.endswith(".wav"):
                print(file_name)
                os.chdir(Path('../../Labels'))
                # Label names are: Edge, Gain, Tone
                label_file = file_name[:-4] + '.pickle'
                # label = [0.0, 0.0, 0.0]
                with open(label_file, 'rb') as handle:
                    label = pickle.load(handle)
                    print(label)
                    if path_folder == 'DlyRandomSamples':  # Fix limited delay plugin range
                        label[0] = label[0] * 4.0
                        label[1] = label[1] * 10.0

                os.chdir('../Samples/NoFX')
                train_labels.append(label)
                # Loading the audio
                y, sr = librosa.load(file_name, sr=44100)
                # Onset Detection
                y = np.insert(y, 0, np.zeros(1023))
                y = librosa.util.normalize(y)

                onset_frame = librosa.onset.onset_detect(y=y,
                                                         sr=sr,
                                                         units='frames',
                                                         pre_max=20000,
                                                         post_max=20000,
                                                         pre_avg=20000,
                                                         post_avg=20000,
                                                         delta=0,
                                                         wait=1000)
                offset_frame = librosa.samples_to_frames(samples=y.shape[0])
                onset_sample = librosa.core.frames_to_samples(onset_frame[0])
                offset_sample = librosa.core.frames_to_samples(offset_frame)
                y_cut = y[onset_sample:offset_sample]

                v_features = []
                if path_folder == 'DistRandomSamples':
                    v_features = get_dist_feat(y_cut=y_cut, sr=sr)
                elif path_folder == 'TremRandomSamples':
                    v_features = get_trem_feat(y_cut=y_cut, sr=sr)
                elif path_folder == 'DlyRandomSamples':
                    v_features = get_dly_feat(y_cut=y_cut, sr=sr, y=y)
                else:
                    print('Sample folder for feature extraction not found')

                train_data.append(np.hstack(v_features))
        os.chdir(DATA_PATH)

    train_data = np.array(train_data)
    print(train_data.shape)
    scaler = preprocessing.StandardScaler()
    train_data = scaler.fit_transform(train_data)

    train_labels = np.array(train_labels)
    os.chdir(DATA_PATH)
    return train_data, train_labels
Example no. 26
def modify_classical(level,
                     param_dict,
                     start,
                     dur=4,
                     sig_dur=4,
                     segment=False):
    print "Classical modification begun.."

    # snap to segment or start marker
    if segment:
        # use pre-computed segment boundaries
        start_bounds = param_dict['bounds'][:, 0]
        nearest_bound = start_bounds[np.where(start_bounds >= start)][0]
    else:
        # simply use start marker
        nearest_bound = start

    # level 1 - tempo change -- volume envelope needs fixing!!
    if level == 1:
        offset = 0.8
        # in frames, conversion to samples required
        tempo_curve = param_dict['tempo']
        nearest_bound_in_frame = librosa.samples_to_frames([nearest_bound])[0]
        tempo_factor = tempo_curve[nearest_bound_in_frame]

        # change dur to account for tempo factor
        dur = int(np.ceil(dur * (tempo_factor + offset)))

        clip = gs.audio_buffer[nearest_bound:nearest_bound + (dur * gs.sr)]

        shrink = librosa.effects.time_stretch(clip, offset + tempo_factor)

        # normalizing CRAP.
        librosa.output.write_wav("clip.wav", clip, gs.sr)
        as_clip = pydub.AudioSegment.from_wav("clip.wav")
        as_amp = as_clip.dBFS
        librosa.output.write_wav("shrink.wav", shrink, gs.sr)
        shrink = match_target_amplitude(
            pydub.AudioSegment.from_wav("shrink.wav"), as_amp)
        shrink.export("new_shrink.wav", format="wav")
        shrink, sr = librosa.load("new_shrink.wav")

        compensate_factor = 1.2

        remainder = np.concatenate(
            (shrink, gs.audio_buffer[nearest_bound + (dur * gs.sr):]))

        gs.audio_buffer[nearest_bound:nearest_bound +
                        len(remainder)] = remainder

        gs.audio_buffer[-1 * (len(clip) - len(shrink)):] = 0

        # taper_buffer_edges(nearest_bound, nearest_bound + len(shrink), 1.0)

        # if stretch instead of shrink
        # gs.audio_buffer = np.concatenate((gs.audio_buffer, np.zeros(len(stretch) - len(clip))))
        # gs.audio_buffer[nearest_bound:] = np.concatenate( (window(stretch), window(gs.audio_buffer[nearest_bound + (dur*gs.sr):])) )

    # level 0 - echo with delay
    elif level == 0:
        offset = int(0.75 * gs.sr)
        clip = gs.audio_buffer[nearest_bound:nearest_bound + (dur * gs.sr)]

        echo_amp_curve = param_dict['echo']
        if echo_amp_curve is not None:
            echo_amp = echo_amp_curve[nearest_bound]
        else:
            echo_amp = 0.8

        delay_curve = param_dict['delay']
        nearest_bound_in_frame = librosa.samples_to_frames([nearest_bound])[0]
        delay_in_secs = delay_curve[nearest_bound_in_frame]
        delay_in_samps = int(delay_in_secs * gs.sr)
        delay_in_samps += offset

        # gs.audio_buffer[nearest_bound + delay_in_samps: nearest_bound + delay_in_samps + (dur*gs.sr)] += ((0.8*echo_amp) * window(clip))
        gs.audio_buffer[nearest_bound + delay_in_samps:nearest_bound +
                        delay_in_samps + (dur * gs.sr)] += ((echo_amp) *
                                                            window(clip))

    # level 2 - alert sample
    else:
        # issue sampled alert
        alert = param_dict['alert']
        if len(alert) > sig_dur * gs.sr:
            alert = alert[:sig_dur * gs.sr]

        remainder = np.concatenate(
            (square_window(alert),
             gs.audio_buffer[nearest_bound + len(alert):]))
        gs.audio_buffer[nearest_bound:nearest_bound +
                        len(remainder)] = remainder
        taper_buffer_edges(nearest_bound,
                           nearest_bound + len(alert),
                           1.0,
                           low_end=0.0)

    print "Classical modification completed.."
    return True
Example no. 27
def extract_features(sample, training=True):
    """Extracts features from sample"""
    X_list = []
    y_list = []

    print(sample.label)
    print(sample.file_name)
    snd = parselmouth.Sound(os.path.join(sample.path, sample.file_name))
    # Onset Detection
    sample.sig = np.insert(sample.sig, 0, np.zeros(1024))
    sample.sig = librosa.util.normalize(sample.sig)
    onset_frame = librosa.onset.onset_detect(y=sample.sig,
                                             sr=sample.fs,
                                             units='frames',
                                             backtrack=False,
                                             pre_max=20000,
                                             post_max=20000,
                                             pre_avg=20000,
                                             post_avg=20000,
                                             delta=0.0,
                                             wait=1000)
    offset_frame = int(
        round(0.75 * librosa.samples_to_frames(samples=sample.sig.shape[0])))
    if offset_frame - 32 <= onset_frame[0]:  # or not training
        offset_frame = librosa.samples_to_frames(samples=sample.sig.shape[0])
        if training:
            print(
                'Training Sample shorter than 32 Frames {}; Sample Length: {}'.
                format(sample.file_name, offset_frame - onset_frame[0]))
        if offset_frame - 32 <= onset_frame[0]:
            onset_frame[0] = 0

    onset_sample = librosa.core.frames_to_samples(onset_frame[0])
    offset_sample = librosa.core.frames_to_samples(offset_frame)
    # plots.waveform(sample, onset_sample)

    smp_cut = sample.sig[onset_sample:offset_sample]

    # Randomly shorten sample from 1/4 to 3/4 note length at 120 BPM
    # smp_cut = smp_cut[:int(np.random.uniform(0.66, 1, [1, 1])[0, 0]*len(smp_cut))]
    # Add noise to sample at max -48dBFS
    # smp_cut += np.random.uniform(-2**-9, 2**-9, [len(smp_cut)])

    # Time series features from librosa
    mfcc = librosa.feature.mfcc(y=smp_cut, sr=sample.fs)
    # plots.spectrogram(smp_cut)
    mfcc_pos = (mfcc - np.min(mfcc))
    mfcc_norm = mfcc_pos / np.max(mfcc_pos) - np.mean(mfcc_pos)
    mfcc_delta = librosa.feature.delta(mfcc_norm)
    spec_contr = librosa.feature.spectral_contrast(y=smp_cut, sr=sample.fs)
    # plots.spec_contrast(spec_contr)

    phase_res = phase_fmax(smp_cut)
    # plots.phase_reg_line_deviation(phase_res)

    zero_cr = librosa.feature.zero_crossing_rate(y=smp_cut)
    zero_cr_delta = librosa.feature.delta(zero_cr)
    rms = librosa.feature.rms(y=smp_cut)
    rms *= 1 / rms.max()
    rms_delta = librosa.feature.delta(rms)

    # Time series features from praat
    pitch = snd.to_pitch().to_array()
    pitch_curve, voice_prob = zip(*pitch[0][:])
    pitch_curve = np.array(pitch_curve)
    voice_prob = np.array(voice_prob)
    pitch_onset = int(
        (onset_sample / sample.sig.shape[0]) * pitch_curve.shape[0])
    pitch_curve = pitch_curve[pitch_onset:]
    voice_prob = voice_prob[pitch_onset:]
    # plots.pitch_voiced_curve(pitch_curve, voice_prob)

    pitch_curve = np.reshape(pitch_curve, [1, pitch_curve.shape[0]])
    # plots.pitch(pitch_curve)
    voice_prob = np.reshape(voice_prob, [1, voice_prob.shape[0]])

    harmonicity = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)
    m_features = np.concatenate((mfcc_norm, mfcc_delta, spec_contr, zero_cr,
                                 zero_cr_delta, rms, rms_delta))
    v_features = functionals(m_features)

    # phase_res and pitch curve have different lengths from m_features, so functionals
    # need to be analysed individually

    v_features = np.append(v_features, functionals(phase_res))
    v_features = np.append(v_features, functionals(pitch_curve))
    v_features = np.append(v_features, functionals(voice_prob))
    v_features = np.append(v_features, hnr)

    X_list.append(v_features)
    y_list.append(sample.label)

    return X_list, y_list