Example #1
def load_references_and_mix(mixture_folder, spk_directories, mix_file):
    references = {}
    mixture, sr = utils.load_audio(os.path.join(mixture_folder, 'mix', mix_file))
    for spk_directory in spk_directories:
        reference = os.path.join(mixture_folder, spk_directory, mix_file)
        references[spk_directory.split('/')[-1]] = utils.load_audio(reference)[0][0]
    return mixture, references, sr
Example #2
    def __getitem__(self, index):
        # select the target based on the dataset index
        target_track_path = self.tracks[index]['path']
        if self.random_chunks:
            target_min_duration = self.tracks[index]['min_duration']
            target_start = random.uniform(
                0, target_min_duration - self.seq_duration
            )
        else:
            target_start = 0

        # optionally select a random interferer track
        if self.random_interferer_mix:
            random_idx = random.choice(range(len(self.tracks)))
            intfr_track_path = self.tracks[random_idx]['path']
            if self.random_chunks:
                intfr_min_duration = self.tracks[random_idx]['min_duration']
                intfr_start = random.uniform(
                    0, intfr_min_duration - self.seq_duration
                )
            else:
                intfr_start = 0
        else:
            intfr_track_path = target_track_path
            intfr_start = target_start

        # get sources from interferer track
        sources = list(intfr_track_path.glob('*' + self.ext))

        # load sources
        x = 0
        for source_path in sources:
            # skip target file and load it later
            if source_path == intfr_track_path / self.target_file:
                continue

            try:
                audio = load_audio(
                    source_path, start=intfr_start, dur=self.seq_duration
                )
            except RuntimeError:
                index = index - 1 if index > 0 else index + 1
                return self.__getitem__(index)
            x += self.source_augmentations(audio)

        # load the selected track target
        if Path(target_track_path / self.target_file).exists():
            y = load_audio(
                target_track_path / self.target_file,
                start=target_start,
                dur=self.seq_duration
            )
            y = self.source_augmentations(y)
            x += y

        # Use silence if target does not exist
        else:
            y = torch.zeros(audio.shape)

        return x, y
Example #3
    def __getitem__(self, index):
        input_path, output_path = self.tuple_paths[index]

        if (self.seq_duration != 0.0):
            if self.random_chunks:
                input_info = load_info(input_path)
                output_info = load_info(output_path)
                duration = min(input_info['duration'], output_info['duration'])
                start = random.uniform(0, duration - self.seq_duration)
            else:
                start = 0
            #print("DATA", start)
            X_audio = load_audio(input_path,
                                 start=start,
                                 dur=self.seq_duration)
            Y_audio = load_audio(output_path,
                                 start=start,
                                 dur=self.seq_duration)
        else:
            input_info = load_info(input_path)
            output_info = load_info(output_path)
            start = 0
            duration = min(input_info['duration'], output_info['duration'])
            X_audio = load_audio(input_path, start=start, dur=duration)
            Y_audio = load_audio(output_path, start=start, dur=duration)
        # return torch tensors
        return X_audio, Y_audio
Example #4
def align_audio(payload: PayLoad):
    try:
        load_audio(payload.bucket_id, payload.sub_dir, payload.file_name)
    except ClientError:
        raise HTTPException(status_code=404, detail="Item not found")
    prepare_text(payload.text)
    sync_map = force_align()
    clean_dir()

    response = Response(alignment=sync_map, file_name=payload.file_name)
    return response
Example #5
def generate_samples(samples_path, label_path, batch_count):
    '''
    Generator function that loads wav files, converts them to spectrograms, and looks up the matching labels.
    batch_count is the number of samples to return in each generator batch.
    '''
    sample_count = 0
    X_train = None
    Y_train = None
    while(True):
        file = random.choice(os.listdir(samples_path))
        wav_path = os.path.join(samples_path, file)
        signal, sr = utils.load_audio(wav_path, mono=True)
        melgram = utils.make_melgram(signal, sr)
        file_name = file.split('.')[0]
        label = np.load(os.path.join(label_path,file_name+".npy"))
        if(X_train is None):
            #if first in sequence
            X_train = np.zeros((batch_count, melgram.shape[1], melgram.shape[2], melgram.shape[3]))
            Y_train = np.zeros((batch_count, label.shape[0]))
            X_train[0] = melgram[0]
            Y_train[0] = label
        else:
            X_train[sample_count] = melgram[0]
            Y_train[sample_count] = label
        
        if sample_count == batch_count-1:
            sample_count = 0
            yield X_train, Y_train
        else:
            sample_count += 1
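A minimal usage sketch for the generator above (the directory paths, batch size, and the compiled Keras model are placeholder assumptions, not part of the original code): because the loop yields batches indefinitely, the number of steps per epoch has to be supplied explicitly when training.

# Hedged usage sketch; paths, batch size, and the Keras model are assumed placeholders.
train_gen = generate_samples('data/wavs', 'data/labels', batch_count=32)
X_batch, Y_batch = next(train_gen)   # one batch of mel-spectrograms and label vectors
print(X_batch.shape, Y_batch.shape)
# With a compiled Keras model the generator could be consumed as, for example:
# model.fit(train_gen, steps_per_epoch=100, epochs=10)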
Example #6
def separate_signals(method: str, model_filename: pathlib.Path, data_conf: dict, midi_note_nums: numpy.ndarray, spectrogram: numpy.ndarray, facwt: pyfacwt.FACWT, facwt_params: dict, decimation_factor: int, save_waveform: bool=False, gpu: int=-1, overwrite: bool=False) -> numpy.ndarray:
    """Separate signals

    Args:
        method (str): Method
        model_filename (pathlib.Path): Model filename
        data_conf (dict): Input data configuration
        midi_note_nums (numpy.ndarray): Midi note numbers of pitches
        spectrogram (numpy.ndarray): Observed complex spectrogram
        facwt (pyfacwt.FACWT): Fast approximate CWT instance
        facwt_params (dict): FACWT parameters
        decimation_factor (int): Decimation factor
        save_waveform (bool, optional): If True, dump separated signals. Defaults to False.
        gpu (int, optional): Gpu number. Defaults to -1.
        overwrite (bool, optional): If True, overwrite separated signals. Defaults to False.

    Returns:
        numpy.ndarray: Separated signals
    """    
    # load model
    model = joblib.load(model_filename)
    # separate preparation
    valid_k_list = list(filter(lambda k: midi_note_nums[k] in data_conf["gt"].keys(), list(range(model.n_bases))))
    logger.info("valid k list: {} {}".format(valid_k_list, data_conf["gt"].keys()))
    if gpu >= 0:
        xp = cupy
        model.to_gpu()
    else:
        xp = numpy
    X = None
    facwt.verbose = 0  # suppress output
    #
    separated_signals = {}
    for k in tqdm(valid_k_list, leave=True, desc='    {0: >10s}'.format('Valid basis ')):
        outfname = model_filename.parent / f"pitch{midi_note_nums[k]:03d}.wav"
        if not outfname.exists() or overwrite:
            if X is None:
                X = model.reconstruct()
                X[:] = xp.maximum(model.eps, X)
                X = X.astype('f')
            X_k = xp.maximum(model.eps, model.reconstruct(k_list=[k])).astype('f')
            weight = (X_k / X).astype('f')

            if xp == cupy:
                weight = cupy.asnumpy(weight)

            # Interpolate masks at decimated frames
            if decimation_factor > 1:
                interpfun = interp1d(numpy.arange(0, weight.shape[1]) * decimation_factor, weight, kind="linear", axis=1, bounds_error=False, fill_value="extrapolate")
                weight = interpfun(numpy.arange(0, spectrogram.shape[1])).astype('f')
                # Squash weights into [0,1]
                weight[weight < 0] = 0
                weight[weight > 1] = 1
            # masking
            Y_k = spectrogram * weight
            # Convert into time-domain signal
            separated_signals[midi_note_nums[k]] = spectrogram2signal(outfname, list(Y_k), facwt, save=save_waveform)
        else:
            separated_signals[midi_note_nums[k]] = load_audio(outfname)[0]
    return separated_signals
Example #7
    def __getitem__(self, idx):
        phn_file = self.files[idx]
        wav_file = self.files[idx][:-3] + "WAV.wav"

        labels = get_phn(phn_file, self.tokenizer)
        labels = self.pad(labels,
                          pad_value=self.tokenizer.convert_token("[NULL]"))
        audio = load_audio(wav_file)
        audio = self.pad(audio)

        random_index = random.randint(0, len(audio) - self.audio_length)
        labels = labels[random_index:random_index + self.audio_length]
        audio = audio[random_index:random_index + self.audio_length]
        audio = self.normalize_audio(audio)

        if random.choice([0, 1]) == 1:
            labels, audio = self.add_silence(labels, audio)

        audio = self.add_noise(audio)

        assert audio.shape[0] == self.audio_length
        assert labels.shape[0] == self.audio_length

        return torch.FloatTensor(audio).unsqueeze(0), torch.LongTensor(labels)
Example #8
    def __getitem__(self, index):
        # for validation, get deterministic behavior
        # by using the index as seed
        if self.split == 'valid':
            random.seed(index)

        # For each source draw a random sound and mix them together
        audio_sources = []
        for source in self.source_folders:
            # select a random track for each source
            source_path = random.choice(self.source_tracks[source])
            if self.random_chunks:
                duration = load_info(source_path)['duration']
                start = random.uniform(0, duration - self.seq_duration)
            else:
                start = 0

            audio = load_audio(source_path, start=start, dur=self.seq_duration)
            audio = self.source_augmentations(audio)
            audio_sources.append(audio)
        stems = torch.stack(audio_sources)
        # # apply linear mix over source index=0
        x = stems.sum(0)
        # target is always the last element in the list
        y = stems[-1]
        return x, y
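Since __getitem__ above returns a (mixture, target) pair of tensors, an instance of the dataset can be fed straight into a standard PyTorch DataLoader. The sketch below is illustrative only and assumes dataset is an already-constructed instance of this class.

# Hedged usage sketch; `dataset` is assumed to be an instance of the class defining __getitem__ above.
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4)
for x, y in loader:
    # x: batch of random linear mixtures, y: batch of target stems
    break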
Example #9
    def __getitem__(self, index):
        track_path = self.tracks[index]['path']
        min_duration = self.tracks[index]['min_duration']
        sources = list(track_path.glob('*' + self.ext))

        if self.random_chunks:
            start = random.uniform(0, min_duration - self.seq_duration)
        else:
            start = 0

        # load sources
        audio_sources = []
        for source_path in sources:
            try:
                audio = load_audio(source_path,
                                   start=start,
                                   dur=self.seq_duration)
            except RuntimeError:
                index = index - 1 if index > 0 else index + 1
                return self.__getitem__(index)
            audio = self.source_augmentations(audio)
            audio_sources.append(audio)

        stems = torch.stack(audio_sources, dim=0)
        # # apply linear mix over source index=0
        x = stems.sum(0)
        # target is always the last element in the list
        if track_path / self.target_file in sources:
            y = stems[sources.index(track_path / self.target_file)]
        else:
            y = torch.zeros(x.shape)

        return x, y
Example #10
    def add_noise(self, audio):
        noise_audio = load_audio(random.choice(self.noise_files))
        noise_audio = self.random_loudness(noise_audio)

        random_index = random.randint(0, len(noise_audio) - self.audio_length)

        return audio + noise_audio[random_index:random_index +
                                   self.audio_length]
Example #11
def read_f0(ref_dir):
    paths = sorted(glob.glob(os.path.join(ref_dir, '*.wav')))
    f0_lst = []
    for path in paths:
        wav, sr = utils.load_audio(path)
        f0 = utils.get_f0(wav, sr, fmin=60, fmax=400)
        f0_lst.append(f0)
    return f0_lst
Example #12
    def load_audio_files(self, wav_file):
        sources = []
        channel_indices = np.arange(self.channels_in_mix)
        np.random.shuffle(channel_indices)
        channel_indices = channel_indices[:self.num_channels]

        for speaker in self.speaker_folders:
            speaker_path = os.path.join(self.folder, speaker, wav_file)
            mix_path = os.path.join(self.folder, 'mix', wav_file)

            mix, _ = utils.load_audio(mix_path)
            source, _ = utils.load_audio(speaker_path)

            mix = mix[channel_indices]
            source = source[channel_indices]
            sources.append(source)

        return mix, sources, np.eye(self.num_speakers)
Example #13
def job(fpath):
    wav_path = os.path.join(args.data_path, 'wavs',
                            fpath.replace('npy', 'wav'))

    wav, sr = utils.load_audio(wav_path)
    mel = utils.get_mel_spectrogram(wav, sr)
    # ga = prepro_guided_attention(len(text), len(mel), g=args.g)
    f0 = utils.get_f0(wav, sr, fmin=60, fmax=400, spec_len=mel.shape[0])
    np.save(os.path.join(args.data_path, args.mel_dir, fpath), mel)
    np.save(os.path.join(args.data_path, args.f0_dir, fpath), f0)
    return None
Example #14
def load_run_experiment_and_save(filename):
    clf = SVC(C=1, gamma=0.001, kernel='rbf', random_state=0)
    audio = load_audio(filename)

    evolution = refit_from_best(clf, audio)

    exp_filename = 'data/experiments/' + filename.split('/')[-1]
    exp_filename = exp_filename.replace('.wav', '.yaml')
    save_yaml(exp_filename, evolution)

    return evolution
Example #15
def prepare_evaluate(conf: dict):
    '''Prepare for evaluation

    Args:
        conf (dict): Configuration of a mixture
    
    Returns:
        tuple[list,numpy.ndarray,numpy.ndarray]: Groundtruth pitches, input SDRs, and groundtruth signals (# of pitches x signal length)
    '''
    gt_list = conf["gt"]
    gt_pitches = sorted([int(p) for p in gt_list.keys()])
    refs = [load_audio(gt_list[p]) for p in gt_pitches]
    refs = numpy.stack(refs, axis=0)  # n_pitches x sig_len
    # load mixed
    mixed = load_audio(conf["mix"])
    # compute input sisdr
    sdrs = compute_bsseval_v2(
        refs,
        numpy.tile(mixed[None, :refs.shape[1]] / refs.shape[0],
                   (refs.shape[0], 1)))
    return gt_pitches, sdrs, refs
Example #16
def load_test_data(model_path, instrument, fx, param_id):
    test_df = pd.read_csv(os.path.join(model_path, 'test_data.csv'))
    n_total_clips = test_df.shape[0]
    input_target_pairs = [0] * n_total_clips

    if fx is meta.FXCHAIN:
        dry_path = meta.params_path[meta.FXCHAIN][instrument][meta.NO_FX]
        wet_path = meta.params_path[meta.FXCHAIN][instrument][param_id]
    else:
        dry_path = meta.params_path[instrument][meta.NO_FX]
        wet_path = meta.params_path[instrument][fx]

    for idx, row in test_df.iterrows():
        audio_in = load_audio(os.path.join(dry_path, row['input_file']), idx,
                              n_total_clips, meta.NO_FX)
        audio_target = load_audio(os.path.join(wet_path, row['target_file']),
                                  idx, n_total_clips, fx)

        input_target_pairs[idx] = (audio_in, audio_target)

    return input_target_pairs
Example #17
    def __getitem__(self, index):
        # first, get target track
        track_path = self.tracks[index]['path']
        min_duration = self.tracks[index]['min_duration']
        if self.random_chunks:
            # determine start seek by target duration
            start = random.uniform(0, min_duration - self.seq_duration)
        else:
            start = 0

        # assemble the mixture of target and interferers
        audio_sources = []
        # load target
        target_audio = load_audio(track_path / self.target_file,
                                  start=start,
                                  dur=self.seq_duration)
        target_audio = self.source_augmentations(target_audio)
        audio_sources.append(target_audio)
        # load interferers
        for source in self.interferer_files:
            # optionally select a random track for each source
            if self.random_track_mix:
                random_idx = random.choice(range(len(self.tracks)))
                track_path = self.tracks[random_idx]['path']
                if self.random_chunks:
                    min_duration = self.tracks[random_idx]['min_duration']
                    start = random.uniform(0, min_duration - self.seq_duration)

            audio = load_audio(track_path / source,
                               start=start,
                               dur=self.seq_duration)
            audio = self.source_augmentations(audio)
            audio_sources.append(audio)

        stems = torch.stack(audio_sources)
        # # apply linear mix over source index=0
        x = stems.sum(0)
        # target is always the first element in the list
        y = stems[0]
        return x, y
Example #18
def main(_):

    if model_type == "IMAGE":
        features, labels = utils.load_images(args.input, args.batch_size)
    elif model_type == "AUDIO":
        features, labels = utils.load_audio(args.input, args.batch_size)
    elif model_type == "SPECT":
        features, labels = utils.load_spect(args.input, args.batch_size)
    print(features)
    ripeness_classifier = tf.estimator.Estimator(model_fn=model,
                                                 model_dir=args.dir)
    # Set up logging for predictions
    tensors_to_log = {"probabilites": "Predictions/softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=args.log_steps)
    #Train the model
    if args.mode == "TRAIN":
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=features,
            y=labels,
            batch_size=args.batch_size,
            num_epochs=None,
            shuffle=True)

        ripeness_classifier.train(input_fn=input_fn,
                                  steps=args.steps,
                                  hooks=[logging_hook])

        print("Training completed")
    elif args.mode == "EVAL":
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=features,
            y=labels,
            batch_size=args.batch_size,
            num_epochs=1,
            shuffle=False)
        eval_results = ripeness_classifier.evaluate(input_fn=input_fn)
        print("Eval results:")
        print(eval_results)
    elif args.mode == "PREDICT":
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=features,
            batch_size=args.batch_size,
            num_epochs=1,
            shuffle=False)
        pred = list(ripeness_classifier.predict(input_fn=input_fn))
        print("Prediction results:")
        print(pred)
Example #19
def load_data():
    features = []
    for file in os.listdir(data_set):
        if file.endswith(".wav") and (
                "Al" in file or 'Ar' in file or 'Pr' in file
                or 'Pl' in file) and 'COPD' in file and 'AKGC417L' in file:
            class_label = utils.class_name(file)
            data_file = os.path.join(data_set, file)
            audio, sample_rate = utils.load_audio(data_file)
            raw_data = utils.extract_features(audio, sample_rate)
            features = utils.append_features(features, class_label, raw_data)

    featuresdf = pd.DataFrame(features, columns=['feature', 'class_label'])
    x = np.array(featuresdf.feature.tolist())
    y = np.array(featuresdf.class_label.tolist())
    le = LabelEncoder()
    yy = to_categorical(le.fit_transform(y))
    return x, yy
Example #20
def init_image_dataset():
    for genre in classes.values():
        # Create output directory
        if not os.path.exists(out_path + genre):
            os.mkdir(out_path + genre)

        # Get all audio files
        files = os.listdir(in_path + genre)

        for f in files:
            # Define paths
            audio_path = in_path + genre + "/" + f
            spec_path = out_path + genre + "/" + f

            # Load audio and create spectrogram
            audio, fs = load_audio(audio_path)
            audio2spectrogram(audio, fs, spec_path)
            print("Saved:", spec_path)
Example #21
def init_analytics_dataset():
    with open('dataset.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        header = "chroma_freqs spectral_centroid spectral_bandwidth spectral_rolloff zero_crossing_rate"
        for i in range(1, 21):
            header += " mfcc" + str(i)
        header += " genre"
        writer.writerow(header.split())

        # Writing Data
        for genre in classes.values():
            # Get all audio files
            files = os.listdir(in_path + genre)

            for f in files:
                audio, fs = load_audio(in_path + genre + "/" + f)
                features = extract_features(audio, fs)
                features.append(genre)
                writer.writerow(features)
                print("Features extracted:", f)
Example #22
    def neural_predicate(self, network, path, in_training=True, versions=3):
        data = load_audio(str(path)[1:-1])

        if in_training:
            sig_t, sr, _ = self.t_transforms.apply(data, None)
        else:
            sig_t, sr, _ = self.v_transforms.apply(data, None)

        # print(path)

        length = torch.tensor(sig_t.size(0))
        sr = torch.tensor(sr)
        data = [d.unsqueeze(0) for d in [sig_t, length, sr]]
        try:
            out_raw = network.net(data)
        except RuntimeError:
            print(path)
            print(data)
            raise

        return out_raw.squeeze(0)
Example #23
def wav2spectrogram(filename: str,
                    max_length: int = None,
                    sr: int = 16000,
                    start_pos: float = 0.0,
                    **kwargs):
    """Convert wavefile to complex CWT spectrogram

    Args:
        filename (str): Wav filename
        max_length (int, optional): Maximum signal length [s]. Defaults to None.
        sr (int, optional): Sampling rate [Hz]. Defaults to 16000.
        start_pos (float, optional): Analysis start position of waveform [s]. Defaults to 0.0.
        **kwargs: Parameters for pyfacwt.FACWT

    Returns:
        tuple: FACWT instance, FACWT parameter dict, and complex CWT spectrogram
    """
    wavdata = load_audio(filename, sr=sr, mono=True)
    if start_pos > 0.0:
        wavdata = wavdata[int(start_pos * sr):]
    if max_length is not None:
        wavdata = wavdata[:int(max_length * sr)]
    # setup FACWT
    facwt_params = dict(lowFreq=kwargs.get("lowFreq", 27.5),
                        highFreq=kwargs.get("highFreq", sr / 2),
                        fs=sr,
                        resol=kwargs.get("resol", 3),
                        width=kwargs.get("width", 2.0),
                        sd=kwargs.get("sd",
                                      numpy.log(2.0) / 60.0),
                        alpha=kwargs.get("alpha", 1.0),
                        multirate=kwargs.get("multirate", False),
                        minWidth=kwargs.get("minWidth", 2),
                        waveletType=kwargs.get("waveletType", "log_normal"),
                        verbose=kwargs.get("verbose", 1))
    facwt = FACWT(wavdata.shape[0], **facwt_params)

    # forward computation
    spectrogram = facwt.forward(wavdata)
    return facwt, facwt_params, spectrogram
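A short, hedged usage sketch of wav2spectrogram (the input file name and argument values are placeholders): the function returns the FACWT instance and its parameters alongside the spectrogram, so downstream code such as separate_signals in Example #6 can reuse the same transform.

# Hedged usage sketch; 'input.wav' and the argument values are placeholders.
facwt, facwt_params, spectrogram = wav2spectrogram('input.wav', sr=16000, max_length=10)
print(facwt_params['lowFreq'], facwt_params['highFreq'])  # parameters actually used by the transform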
Example #24
    def _make_example(self, wav_name, text):
        wav_file = os.path.join(self.wav_dir, wav_name + '.wav')
        wav = load_audio(wav_file)
        mel, mag = get_spectrogram(wav)
        return {'text': text, 'mel': mel, 'mag': mag}
Example #25
def main():
    sr = 44100
    hop_length = 512
    y_in = utils.load_audio("Awake.wav", sr)
    y_out = utils.load_audio("Light.wav", sr)
    cross_interval(y_in, y_out, sr, hop_length, 10)
Example #26
    def load_audio_files(self, jam_file):
        mix, sr = utils.load_audio(jam_file[:-4] + 'wav')
        mix = mix[0]

        jam = jams.load(jam_file)
        data = jam.annotations[0]['data']['value']
        classes = self.source_labels

        sources = []
        one_hots = []
        group = []
        used_classes = []
        keep_columns = []

        for d in data:
            if d['role'] == 'foreground':
                source_path = d['saved_source_file']
                source_path = os.path.join(self.folder,
                                           source_path.split('/')[-1])
                sources.append(utils.load_audio(source_path)[0][0])
                one_hot = np.zeros(len(classes))
                one_hot[self.source_indices[d['label']]] = 1
                used_classes.append(d['label'])
                one_hots.append(one_hot)

                if d['label'] in self.group_sources or d[
                        'label'] in self.ignore_sources:
                    group.append(sources[-1])
                    sources.pop()
                    one_hots.pop()
                    used_classes.pop()
                else:
                    keep_columns.append(self.source_indices[d['label']])

        if len(self.group_sources) > 0:
            sources.append(sum(group))
            one_hot = np.zeros(len(classes))
            one_hot[self.source_indices['group']] = 1
            used_classes.append('group')
            one_hots.append(one_hot)
            keep_columns.append(self.source_indices['group'])

        if self.num_extra_sources > 0:
            num_sources = len(sources)
            shuffled = random.sample(classes, len(classes))
            for class_name in shuffled:
                if class_name not in used_classes:
                    if len(sources) >= num_sources + self.num_extra_sources:
                        break
                    one_hot = np.zeros(len(classes))
                    one_hot[classes.index(class_name)] = 1
                    one_hots.append(one_hot)
                    sources.append(np.zeros(sources[-1].shape))
                    used_classes.append(class_name)

        length_cutoff = int(mix.shape[0] * self.length)
        mix = mix[:length_cutoff]
        sources = [source[:length_cutoff] for source in sources]
        if self.reorder_sources:
            source_order = [
                used_classes.index(c) for c in self.source_labels
                if c in used_classes
            ]
            sources = [sources[i] for i in source_order]
            one_hots = [one_hots[i] for i in source_order]
        if self.group_sources:
            one_hots = np.stack(one_hots)[:, sorted(keep_columns)]
        else:
            one_hots = np.stack(one_hots)
        return mix, sources, one_hots
Example #27
def main(unused_argv=None):
  tf.logging.set_verbosity(FLAGS.log)

  if FLAGS.checkpoint_path:
    checkpoint_path = utils.shell_path(FLAGS.checkpoint_path)
  else:
    expdir = utils.shell_path(FLAGS.expdir)
    tf.logging.info("Will load latest checkpoint from %s.", expdir)
    while not tf.gfile.Exists(expdir):
      tf.logging.fatal("\tExperiment save dir '%s' does not exist!", expdir)
      sys.exit(1)

    try:
      checkpoint_path = tf.train.latest_checkpoint(expdir)
    except tf.errors.NotFoundError:
      tf.logging.fatal("There was a problem determining the latest checkpoint.")
      sys.exit(1)

  if not tf.train.checkpoint_exists(checkpoint_path):
    tf.logging.fatal("Invalid checkpoint path: %s", checkpoint_path)
    sys.exit(1)

  tf.logging.info("Will restore from checkpoint: %s", checkpoint_path)

  source_path = utils.shell_path(FLAGS.source_path)
  tf.logging.info("Will load Wavs from %s." % source_path)

  save_path = utils.shell_path(FLAGS.save_path)
  tf.logging.info("Will save embeddings to %s." % save_path)
  if not tf.gfile.Exists(save_path):
    tf.logging.info("Creating save directory...")
    tf.gfile.MakeDirs(save_path)

  sample_length = FLAGS.sample_length
  batch_size = FLAGS.batch_size

  def is_wav(f):
    return f.lower().endswith(".wav")

  wavfiles = sorted([
      os.path.join(source_path, fname)
      for fname in tf.gfile.ListDirectory(source_path) if is_wav(fname)
  ])

  for start_file in xrange(0, len(wavfiles), batch_size):
    batch_number = (start_file / batch_size) + 1
    tf.logging.info("On file number %s (batch %d).", start_file, batch_number)
    end_file = start_file + batch_size
    wavefiles_batch = wavfiles[start_file:end_file]

    # Ensure that the batch has batch_size elements.
    batch_filler = batch_size - len(wavefiles_batch)
    wavefiles_batch.extend(batch_filler * [wavefiles_batch[-1]])
    wav_data = np.array(
        [utils.load_audio(f, sample_length) for f in wavefiles_batch])
    try:
      tf.reset_default_graph()
      # Load up the model for encoding and find the encoding
      encoding = encode(wav_data, checkpoint_path, sample_length=sample_length)
      if encoding.ndim == 2:
        encoding = np.expand_dims(encoding, 0)

      tf.logging.info("Encoding:")
      tf.logging.info(encoding.shape)
      tf.logging.info("Sample length: %d" % sample_length)

      for num, (wavfile, enc) in enumerate(zip(wavefiles_batch, encoding)):
        filename = "%s_embeddings.npy" % wavfile.split("/")[-1].strip(".wav")
        with tf.gfile.Open(os.path.join(save_path, filename), "w") as f:
          np.save(f, enc)

        if num + batch_filler + 1 == batch_size:
          break
    except Exception as e:
      tf.logging.info("Unexpected error happened: %s.", e)
      raise
Example #28
    def __getitem__(self, index):
        # first, get target track
        track_path = self.tracks[index]['path']
        min_duration = self.tracks[index]['min_duration']

        if self.random_chunks:
            # determine start seek by target duration
            start = random.uniform(0, min_duration - self.seq_duration)
        else:
            start = 0

        # assemble the mixture of target and interferers
        audio_sources = []
        midi_sources = []
        start_ends = []
        # load target
        # random choose target

        self.source_files = random.sample(self.source_files,
                                          len(self.source_files))

        for index, source in enumerate(self.source_files):

            if self.random_chunks:
                # determine start seek by target duration
                start = random.uniform(0, min_duration - self.seq_duration)
            else:
                start = 0

            audio = load_audio(track_path / source,
                               start=start,
                               dur=self.seq_duration)
            audio = torch.unsqueeze(self.source_augmentations(audio), 0)
            print('audio.shape', audio.shape)
            audio_sources.append(audio)

            start_ends.append((start, start + self.seq_duration))
            midi_path = os.path.join(str(track_path),
                                     source.split('.')[0] + '.txt')
            midi_sources.append(midi_path)

        stems = torch.stack(audio_sources)
        # # apply linear mix over source index=0
        x = stems.sum(0)
        # target is always the last element in the list
        y = stems[-1]

        # time series to stft
        x = self.stft.forward(x)
        x = self.spec.forward(x)

        y = self.stft.forward(y)
        y = self.spec.forward(y)

        # Hard Mask to Soft Mask

        mask_accom = midi_to_mask(
            x.permute(1, 0, 2)[0].numpy(), midi_sources[0], start_ends[0])
        mask_target = midi_to_mask(
            x.permute(1, 0, 2)[0].numpy(), midi_sources[-1], start_ends[-1])
        mask_target = mask_target / (mask_target + mask_accom)

        x_filtered = mask_target * x.permute(1, 0, 2)[0].numpy()

        # Expand dimensions for the model
        x_filtered = torch.tensor(np.expand_dims(x_filtered, 1))

        return x, y, x_filtered
Example #29
def load_and_preprocess(file_path, sr, bits):
    x = load_audio(file_path.numpy(), sr)
    x = encode_mulaw(x, bits)
    return x
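The file_path.numpy() call above implies this function is meant to run eagerly inside a tf.data pipeline. The sketch below shows one way to wire that up with tf.py_function; the glob pattern, sample rate, bit depth, and output dtype are assumptions, not taken from the original code.

# Hedged sketch; the glob pattern, sr, bits, and Tout dtype are placeholders.
import tensorflow as tf

files = tf.data.Dataset.list_files('wavs/*.wav')
dataset = files.map(
    lambda path: tf.py_function(
        func=lambda p: load_and_preprocess(p, 22050, 9),
        inp=[path],
        Tout=tf.float32))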
Example #30
for i in range(generation_step):
    preds = model.predict(np.expand_dims(pred_seed, 0))  # prediction with the model
    sampled = sample(preds[0][-1])  # multinomial sampling
    # To prevent dead silence.
    if sampled == prev_sample:
        equal_cnt += 1
    else:
        equal_cnt = 0
    prev_sample = sampled
    sampled_onehot = np.zeros([1, 1, input_dim])
    sampled_onehot[0][0][sampled] = 1  # make the sample into onehot
    generated_sample = np.append(generated_sample, sampled_onehot, axis=1)  # append generated sample
    pred_seed = generated_sample[0][i + 1:i + 1 + sample_len]  # make new seed as generation input
    if equal_cnt > 1000:
        impulse_audio = load_audio(impulse[impulse_idx])
        impulse_audio = mu_quantize(impulse_audio, input_dim)
        impulse_audio = impulse_audio[:1000]
        impulse_audio = q_to_one_hot(impulse_audio, input_dim)
        for j in range(1000):
            pred_seed[sample_len - 1000 + j] = impulse_audio[j]
        print('Inject impulse.')
        if impulse_idx == len(impulse) - 1:
            impulse_idx = 0
        else:
            impulse_idx += 1
        equal_cnt = 0

    print('generated %ith sample ==> %i (equal_cnt = %i)' % ((i + 1), sampled, equal_cnt), end='\r')

# Save generated samples as a file