Example No. 1
def _maybe_convert_wav(data_dir, original_data, converted_data):
    source_dir = os.path.join(data_dir, original_data)
    target_dir = os.path.join(data_dir, converted_data)

    # Conditionally convert sph files to wav files
    if os.path.exists(target_dir):
        print("skipping maybe_convert_wav")
        return

    # Create target_dir
    os.makedirs(target_dir)

    # Loop over sph files in source_dir and convert each to 16-bit PCM wav
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.sph"):
            for channel in ['1', '2']:
                sph_file = os.path.join(root, filename)
                wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + ".wav"
                wav_file = os.path.join(target_dir, wav_filename)
                temp_wav_filename = os.path.splitext(os.path.basename(sph_file))[0] + "-" + channel + "-temp.wav"
                temp_wav_file = os.path.join(target_dir, temp_wav_filename)
                print("converting {} to {}".format(sph_file, temp_wav_file))
                subprocess.check_call(["sph2pipe", "-c", channel, "-p", "-f", "rif", sph_file, temp_wav_file])
                print("upsampling {} to {}".format(temp_wav_file, wav_file))
                audioData, frameRate = librosa.load(temp_wav_file, sr=16000, mono=True)
                soundfile.write(wav_file, audioData, frameRate, "PCM_16")
                os.remove(temp_wav_file)
Example No. 2
def save(filename_audio, filename_jam, jam, strict=True, **kwargs):
    '''Save a muda jam to disk

    Parameters
    ----------
    filename_audio: str
        The path to store the audio file

    filename_jam: str
        The path to store the jams object

    strict: bool
        Strict safety checking for jams output

    kwargs
        Additional parameters to `soundfile.write`

    '''

    y = jam.sandbox.muda._audio['y']
    sr = jam.sandbox.muda._audio['sr']

    # First, dump the audio file
    psf.write(filename_audio, y, sr, **kwargs)

    # Then dump the jam
    jam.save(filename_jam, strict=strict)
Example No. 3
    def saveTo(self, file):
        with ZipFile(file, 'w') as zip:
            song_file = configparser.ConfigParser()
            song_file['DEFAULT'] = {'volume': self.volume,
                                    'bpm': self.bpm,
                                    'beat_per_bar': self.beat_per_bar,
                                    'width': self.width,
                                    'height': self.height}
            for clip in self.clips:
                clip_file = {'name': clip.name,
                             'volume': str(clip.volume),
                             'frame_offset': str(clip.frame_offset),
                             'beat_offset': str(clip.beat_offset),
                             'beat_diviser': str(clip.beat_diviser),
                             'audio_file': basename(
                                 clip.audio_file)}
                if clip_file['audio_file'] is None:
                    clip_file['audio_file'] = 'no-sound'
                song_file["%s/%s" % (clip.x, clip.y)] = clip_file

            buffer = StringIO()
            song_file.write(buffer)
            zip.writestr('metadata.ini', buffer.getvalue())

            for member in self.data:
                buffer = BytesIO()
                sf.write(buffer, self.data[member],
                         self.samplerate[member],
                         subtype=sf.default_subtype('WAV'),
                         format='WAV')
                zip.writestr(member, buffer.getvalue())

        self.file_name = file
Example No. 4
 def Save(cls, filename, data):
     d = numpy.transpose(data.GetChannels())
     soundfile.write(data=d,
                     file="%s.%s" % (filename, cls.ending),
                     samplerate=int(round(data.GetSamplingRate())),
                     subtype=cls.encoding,
                     format=cls.format)
Example No. 5
def test_write_int_data_to_float_file(file_inmemory):
    """This is a very uncommon use case."""
    sf.write(file_inmemory, data_mono, 44100, format='WAV', subtype='FLOAT')
    file_inmemory.seek(0)
    read, fs = sf.read(file_inmemory, always_2d=False, dtype='float32')
    assert np.all(read == data_mono)
    assert fs == 44100
Example No. 6
def play_message(in_msg_fn):
    """
    This method opens a decrypted in_msg and converts the data to an audio
    stream. Then, it simply reads in the frames of the audio file and writes
    the data to an output stream. In other words, it plays the message for you.
    """
    try:
        in_msg = open(in_msg_fn, 'rb')
        data = pickle.load(in_msg)
        in_msg.close()
        print('Data pickled')
    except IOError:
        print("ERROR: Failed to open message file.")
        return
    sf.write(DECR_OUTPUT_FILENAME, data, samplerate=RATE)
    ##########################################################################
    # For now, I just want to make sure the WAV file is written successfully.#
    # Until then, this playback stuff will be on the backlog.#################
    ##########################################################################
    # wf = wave.open(DECR_OUTPUT_FILENAME, 'rb')
    # p = pyaudio.PyAudio()
    # stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
    #                 channels=wf.getnchannels(),
    #                 rate=wf.getframerate(),
    #                 output=True)
    # data = wf.readframes(CHUNK)
    # while data != '':
    #     stream.write(data)
    #     data = wf.readframes(CHUNK)
    # stream.stop_stream()
    # stream.close()
    # p.terminate()
    return DECR_OUTPUT_FILENAME
Example No. 7
def __rubberband(y, sr, **kwargs):
    '''Execute rubberband

    Parameters
    ----------
    y : np.ndarray [shape=(n,) or (n, c)]
        Audio time series, either single or multichannel

    sr : int > 0
        sampling rate of y

    **kwargs
        keyword arguments to rubberband

    Returns
    -------
    y_mod : np.ndarray [shape=(n,) or (n, c)]
        `y` after rubberband transformation

    '''

    assert sr > 0

    # Get the input and output tempfile
    fd, infile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    fd, outfile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)

    # dump the audio
    sf.write(infile, y, sr)

    try:
        # Execute rubberband
        arguments = ['rubberband', '-q']

        for key, value in six.iteritems(kwargs):
            arguments.append(str(key))
            arguments.append(str(value))

        arguments.extend([infile, outfile])

        subprocess.check_call(arguments)

        # Load the processed audio.
        y_out, _ = sf.read(outfile, always_2d=True)

        # make sure that output dimensions matches input
        if y.ndim == 1:
            y_out = np.squeeze(y_out)

    finally:
        # Remove temp files
        os.unlink(infile)
        os.unlink(outfile)
        pass

    return y_out
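
Because the kwargs are forwarded verbatim as command-line flags, a hypothetical in-module call (not part of the original code, and assuming the rubberband CLI is installed and on the PATH) could look like:

y_fast = __rubberband(y, sr, **{'--tempo': 2.0})  # e.g. double the tempo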
Example No. 8
    def play(self):
        log.debug('Play %r', self)

        # FIXME change adhoc play to universal
        fragment_filename = '/tmp/fragment.wav'
        sub.check_call(['rm', '-rf', fragment_filename])
        sf.write(fragment_filename, self.samples, self.samplerate)

        sub.check_call(['play', fragment_filename])
Example No. 9
def compute_combination(args):
    snr, signal, noise, target_rate, new_name, storage_name = args
    noisy_signal = signal*snrdb2ratio(snr, signal, noise)+noise
    noisy_signal = noisy_signal/peak(noisy_signal)
    soundfile.write(storage_name, noisy_signal, target_rate)
    shutil.copyfile(storage_name, new_name)
    #soundfile = al.Sndfile(new_name, 'w', al.Format('flac'), 1, target_rate)
    #soundfile.write_frames(noisy_signal)
    #soundfile.sync()
    print("Wrote", new_name)
Example No. 10
    def _save_estimates(self, user_estimates, track, estimates_dir):
        track_estimate_dir = op.join(
            estimates_dir, track.subset, track.filename
        )
        if not os.path.exists(track_estimate_dir):
            os.makedirs(track_estimate_dir)

        # write out tracks to disk
        for target, estimate in list(user_estimates.items()):
            target_path = op.join(track_estimate_dir, target + '.wav')
            sf.write(target_path, estimate, track.rate)
        pass
Example No. 11
    def onExportClip(self):
        if self.last_clip and self.last_clip.audio_file:
            audio_file = self.last_clip.audio_file
            file_name, a = self.getSaveFileName(
                'Export Clip : %s' % self.last_clip.name, 'WAVE (*.wav)')

            if file_name:
                file_name = verify_ext(file_name, 'wav')
                sf.write(file_name, self.song.data[audio_file],
                         self.song.samplerate[audio_file],
                         subtype=sf.default_subtype('WAV'),
                         format='WAV')
Example No. 12
def main():
    logdir, ckpt = os.path.split(args.checkpoint)
    arch = tf.gfile.Glob(os.path.join(logdir, 'architecture*.json'))[0]  # should only be 1 file
    with open(arch) as fp:
        arch = json.load(fp)

    normalizer = Tanhize(
        xmax=np.fromfile('./etc/xmax.npf'),
        xmin=np.fromfile('./etc/xmin.npf'),
    )

    features = read_whole_features(args.file_pattern.format(args.src))

    x = normalizer.forward_process(features['sp'])
    x = nh_to_nchw(x)
    y_s = features['speaker']
    y_t_id = tf.placeholder(dtype=tf.int64, shape=[1,])
    y_t = y_t_id * tf.ones(shape=[tf.shape(x)[0],], dtype=tf.int64)

    machine = MODEL(arch)
    z = machine.encode(x)
    x_t = machine.decode(z, y_t)  # NOTE: the API yields NHWC format
    x_t = tf.squeeze(x_t)
    x_t = normalizer.backward_process(x_t)

    # For sanity check (validation)
    x_s = machine.decode(z, y_s)
    x_s = tf.squeeze(x_s)
    x_s = normalizer.backward_process(x_s)

    f0_s = features['f0']
    f0_t = convert_f0(f0_s, args.src, args.trg)

    output_dir = get_default_output(args.output_dir)

    saver = tf.train.Saver()
    sv = tf.train.Supervisor(logdir=output_dir)
    with sv.managed_session() as sess:
        load(saver, sess, logdir, ckpt=ckpt)
        while True:
            try:
                feat, f0, sp = sess.run(
                    [features, f0_t, x_t],
                    feed_dict={y_t_id: np.asarray([SPEAKERS.index(args.trg)])}
                )
                feat.update({'sp': sp, 'f0': f0})
                y = pw2wav(feat)
                oFilename = make_output_wav_name(output_dir, feat['filename'])
                sf.write(oFilename, y, FS)
            except:
                break
Example No. 13
def write_audio_file(filepath, v_signal, fs, norm=0.98):
    '''
    norm: If None, no normalisation is applied. If it is a float number,
          it is the target value (absolute) for the normalisation.
    '''
    
    # Normalisation:
    if norm is not None:
        v_signal = norm * v_signal / np.max(np.abs(v_signal)) # default
        
    # Write:    
    sf.write(filepath, v_signal, fs)
    
    return
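
A hypothetical call (the test tone below is generated for illustration and is not part of the original code):

import numpy as np

fs = 16000
t = np.arange(fs) / fs                      # one second of sample times
v_signal = np.sin(2 * np.pi * 440 * t)      # 440 Hz test tone
write_audio_file('tone.wav', v_signal, fs)                  # peak scaled to 0.98 (default)
write_audio_file('tone_raw.wav', v_signal, fs, norm=None)   # written as-is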
Example No. 14
def test_write_float_data_to_pcm_file(file_inmemory):
    float_to_clipped_int16 = [
        (-1.0 - 2**-15, -2**15    ),
        (-1.0         , -2**15    ),
        (-1.0 + 2**-15, -2**15 + 1),
        ( 0.0         ,  0        ),
        ( 1.0 - 2**-14,  2**15 - 2),
        ( 1.0 - 2**-15,  2**15 - 1),
        ( 1.0         ,  2**15 - 1),
    ]
    written, expected = zip(*float_to_clipped_int16)
    sf.write(file_inmemory, written, 44100, format='WAV', subtype='PCM_16')
    file_inmemory.seek(0)
    read, fs = sf.read(file_inmemory, dtype='int16')
    assert np.all(read == expected)
    assert fs == 44100
def main():
    args = get_args()
    if args.lin: cFreq = makeLinearCFs(args.band, args.space, args.low, args.high)
    else:        cFreq = makeErbCFs(args.band, args.space, args.low, args.high)

    compTone = genComplex(cFreq, args.rate, args.time)
    ampTone  = ampModulate(compTone, args.mod, args.rate)

    #  -1 : balance to not go above '1'.
    # > 0 : balance to the specified value.
    if args.rms <= 0.0:
        ampTone *= ( 1 / np.max( np.abs(ampTone) ) )
    else:
        ampTone *= (args.rms / rms(ampTone))

    sf.write(args.save, ampTone, args.rate)
Example No. 16
def sliceAudio(iFilename, names, times, verbose_en):
	# open audio
	data, fs = sf.read(iFilename)

	# append the end of the file (in seconds) as the last slice boundary
	times.append(len(data) / fs)

	# slice between consecutive boundaries
	for i in range(len(times) - 1):
		startPoint = int(times[i] * fs)
		endPoint = int(times[i + 1] * fs)

		# write slice audio file
		sf.write(names[i] + '.wav', data[startPoint:endPoint], fs)

		if verbose_en:
			print(names[i] + '.wav')
Example No. 17
    def output(self, filename, format=None):
        """
        Write the samples out to the given filename.

        Parameters
        ----------
        filename : str
            The path to write the audio on disk.
            This can be any format supported by `pysoundfile`, including
            `WAV`, `FLAC`, or `OGG` (but not `mp3`).

        format : str
            If provided, explicitly set the output encoding format.
            See `soundfile.available_formats`.
        """
        sf.write(filename, self.raw_samples.T, int(self.sample_rate), format=format)
Example No. 18
def save(f, s, fs, subtype=None):
    '''
    Write waveform `s` with sample rate `fs` to the file `f`
    (`soundfile.write` returns None)
    '''
    from soundfile import write
    return write(f, s, fs, subtype=subtype)
Example No. 19
def export(input, input_file, output_path, samplerate):
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    basepath = os.path.join(
        output_path, os.path.splitext(os.path.basename(input_file))[0]
    )

    # Write out all components
    for i in range(input.shape[0]):
        sf.write(
            basepath + "_cpnt-" + str(i) + ".wav",
            input[i],
            samplerate
        )

    out_sum = np.sum(input, axis=0)
    sf.write(basepath + '_reconstruction.wav', out_sum, samplerate)
Example No. 20
File: util.py  Project: yz-/ut
def stereo_to_mono_and_extreme_silence_cropping(source, target, subtype=None, print_progress=False):
    if os.path.isdir(source) and os.path.isdir(target):
        from glob import iglob
        if source[-1] != '/':
            source += '/'
        for i, filepath in enumerate(iglob(source + '*.wav')):
            filename = os.path.basename(filepath)
            if print_progress:
                printProgress("{}: {}".format(i, filename))
            stereo_to_mono_and_extreme_silence_cropping(
                filepath,
                os.path.join(target, filename)
            )
    else:
        wf, sr = wf_and_sr(source)
        wf = ensure_mono(wf)
        wf = crop_head_and_tail_silence(wf)
        sf.write(data=wf, file=target, samplerate=sr, subtype=subtype)
Example No. 21
 def unify_signals(self, obj):
     for signal_name in self.signal_names:
         if signal_name in obj._save_names:
             continue
         signal = getattr(obj, signal_name)
         if signal != []:
             name = hashlib.md5(signal).hexdigest()
             if name in self.signals:
                 setattr(obj, signal_name, self.signals[name])
                 # signal = self.signals[name]
             else:
                 self.signals.update({name:signal})
                 #signal = self.signals[name]
                 setattr(obj, signal_name, self.signals[name])
                 sf.write(os.path.join(self.temp_path, name+'.wav'),
                          self.signals[name],
                          samplerate = obj.sample_rate)
             obj._save_names[signal_name] = name
Example No. 22
def swingify(file_path, outfile, factor, sr=None, format=None):
    y, sr = librosa.load(file_path, mono=False, sr=sr)
    print(y.shape)
    anal_samples = librosa.to_mono(y)
    raw_samples = np.atleast_2d(y)
    # force stereo
    if raw_samples.shape[0] < 2:
        print('doubling mono signal to be stereo')
        raw_samples = np.vstack([raw_samples, raw_samples])

    beats = get_beats(anal_samples, sr, 512)

    output = synthesize(raw_samples, beats, factor)

    output = output * 0.7
    print(sr)
    sf.write(outfile, output.T, int(sr), format=format)
    # librosa.output.write_wav(outfile, output, sr, norm=True)
    return beats
Example No. 23
	def insertIntoDb(file, identifier, info):
		onlineDbId = info['source'].lower()
		id = '{}_{}'.format(onlineDbId, identifier)
		if id in rirDb:
			return False

		info['id'] = id
		info['filename'] = os.path.join(ImportDir, id + '.wav')
		rirDb[id] = info

		# copy file (or write as wav file)
		if isinstance(file, str):
			shutil.copyfile(file, info['filename'])
		else:
			assert len(file) == 2
			x, fs = file
			sf.write(info['filename'], x, fs)

		return True
Example No. 24
def main(input_directory, data_directory, group, speaker, chapter):
    wav_file_count = 0
    save_path = os.path.join(data_directory, group, speaker, chapter)
    idtag = speaker+'-'+chapter
    transcript_filename = idtag + '.trans.txt'
    os.makedirs(save_path, exist_ok=True)
    outfile = open(os.path.join(save_path, transcript_filename), 'w')
    save_path = os.path.join(data_directory, group, speaker, chapter)
    for file in os.listdir(input_directory):
        if file.endswith(".wav"):
            data, samplerate = sf.read(os.path.join(input_directory,file))
            sf.write('testwavout.wav',data,samplerate)
            # save the file to its new place
            ident = idtag + '-' + '{:04d}'.format(wav_file_count)
            new_filename = ident+'.wav'
            print(ident)
            os.replace('testwavout.wav',os.path.join(save_path,new_filename))
            wav_file_count += 1
            outfile.write(ident+' \n')
    outfile.close()
Example No. 25
    def synthesize_direct(self, tract_params, glottis_params, duration_s,
                          framerate_hz, wavfile=None):
        extra_frames = 1000
        n_frames = int(duration_s * framerate_hz)
        assert len(tract_params) / self.n_vocaltract_params == n_frames
        assert len(glottis_params) / self.n_glottis_params == n_frames

        # Prep output
        c_int_ptr = ctypes.c_int * 1  # int*
        c_audio_ptr = ctypes.c_double * int(
            duration_s * self.audio_samplerate + extra_frames)
        audio = c_audio_ptr(0)
        n_audio_samples = c_int_ptr(0)
        c_tubeareas_ptr = ctypes.c_double * int(
            n_frames * self.n_tube_sections)
        tubeareas = c_tubeareas_ptr(0)
        c_tractsequence_ptr = ctypes.c_double * int(
            n_frames * self.n_vocaltract_params)
        tract_params_ptr = c_tractsequence_ptr(*tract_params)
        c_glottissequence_ptr = ctypes.c_double * int(
            n_frames * self.n_glottis_params)
        glottis_params_ptr = c_glottissequence_ptr(*glottis_params)

        # Call VTL
        self.lib.vtlInitialize(self.speaker)
        self.lib.vtlSynthBlock(tract_params_ptr,
                               glottis_params_ptr,
                               tubeareas,
                               ctypes.c_int(n_frames),
                               ctypes.c_double(framerate_hz),
                               audio,
                               n_audio_samples)

        # Process output
        out_audio = np.asarray(audio, dtype=np.float64)
        out_audio = np.int16(out_audio / np.max(np.abs(out_audio)) * 32767)
        self.lib.vtlClose()

        if wavfile is None:
            return out_audio, self.audio_samplerate
        sf.write(wavfile, out_audio, self.audio_samplerate)
Example No. 26
    def saveTo(self, file):
        with ZipFile(file, 'w') as zip:
            song_file = configparser.ConfigParser()
            port_list = list(self.outputsPorts)
            song_file['DEFAULT'] = {'volume': self.volume,
                                    'bpm': self.bpm,
                                    'beat_per_bar': self.beat_per_bar,
                                    'width': self.width,
                                    'height': self.height,
                                    'outputs': json.dumps(port_list),
                                    'scenes': json.dumps(self.scenes)}
            if self.initial_scene is not None:
                song_file['DEFAULT']['initial_scene'] = self.initial_scene
            for clip in self.clips:
                clip_file = {'name': clip.name,
                             'volume': str(clip.volume),
                             'frame_offset': str(clip.frame_offset),
                             'beat_offset': str(clip.beat_offset),
                             'beat_diviser': str(clip.beat_diviser),
                             'output': clip.output,
                             'mute_group': str(clip.mute_group),
                             'audio_file': basename(
                                 clip.audio_file)}
                if clip_file['audio_file'] is None:
                    clip_file['audio_file'] = 'no-sound'
                song_file["%s/%s" % (clip.x, clip.y)] = clip_file

            buffer = StringIO()
            song_file.write(buffer)
            zip.writestr('metadata.ini', buffer.getvalue())

            for member in self.data:
                buffer = BytesIO()
                sf.write(buffer, self.data[member],
                         self.samplerate[member],
                         subtype=sf.default_subtype('WAV'),
                         format='WAV')
                zip.writestr(member, buffer.getvalue())

        self.file_name = file
Example No. 27
def main(dbFilename, targetFs, force=False):
	util.createDirectory(NormalizeDir)

	rirDb = json.load(open(dbFilename))

	bar = util.ConsoleProgressBar()
	bar.start('Normalize RIRs')
	i = 0
	for rirId, rir in rirDb.items():
		targetFilename = join(NormalizeDir, rir['id'] + '.wav')
		if not force:
			if rir['filename'] == targetFilename and \
				rir['fs'] == targetFs and \
				targetFilename:
				continue

		x, fs_x = sf.read(join(ImportDir, rir['id'] + '.wav'), dtype='float32')
		y, fs_y = x, fs_x

		if fs_y != targetFs:
			y = resample(y, targetFs / fs_y, 'sinc_best')
			fs_y = targetFs

		rir['length_org'] = len(y) / fs_y
		y = util.trimSilence(y, 0.001, trimRight=False)
		y = util.normalizeAmplitude(y)

		sf.write(targetFilename, y, fs_y)
		
		rir['filename'] = targetFilename
		rir['fs'] = fs_y
		rir['length'] = len(y) / fs_y

		i += 1
		bar.progress(i / len(rirDb))
	bar.end()

	with open(dbFilename, 'w') as dbFile:
		json.dump(rirDb, dbFile, sort_keys=True, indent=4)
Example No. 28
def test_read_int_data_from_float_file(file_inmemory):
    """This is a very uncommon use case."""
    unnormalized_float_to_clipped_int16 = [
        (-2.0**15 - 1 , -2**15),
        (-2.0**15     , -2**15),
        (-2.0**15 + 1 , -2**15 + 1),
        (-1.0         , -1),
        (-0.51        , -1),
        (-0.5         ,  0),
        ( 0.0         ,  0),
        ( 0.5         ,  0),
        ( 0.51        ,  1),
        ( 1.0         ,  1),
        ( 2.0**15 - 2 , 2**15 - 2),
        ( 2.0**15 - 1 , 2**15 - 1),
        ( 2.0**15     , 2**15 - 1),
    ]
    file_data, expected = zip(*unnormalized_float_to_clipped_int16)
    sf.write(file_inmemory, file_data, 44100, format='WAV', subtype='FLOAT')
    file_inmemory.seek(0)
    read, fs = sf.read(file_inmemory, always_2d=False, dtype='int16')
    assert np.all(read == expected)
    assert fs == 44100
Example No. 29
def get_pitch_marks(v_sig, fs):
    
    temp_wav = lu.ins_pid('temp.wav')
    temp_pm  = lu.ins_pid('temp.pm')
        
    sf.write(temp_wav, v_sig, fs)
    reaper(temp_wav, temp_pm)
    v_pm = np.loadtxt(temp_pm, skiprows=7)
    v_pm = v_pm[:,0]
    
    # Protection against REAPER bugs 1:
    vb_correct = np.hstack(( True, np.diff(v_pm) > 0))
    v_pm = v_pm[vb_correct]
    
    # Protection against REAPER bugs 2 (maybe I need a better protection):
    if (v_pm[-1] * fs) >= (np.size(v_sig)-1):
        v_pm = v_pm[:-1]
    
    # Removing temp files:
    os.remove(temp_wav)
    os.remove(temp_pm)
    
    return v_pm
def resample(sample_rate=None, dir=None, csv_path=None):

    clips = []
    start_time = time.time()

    # List all clips that appear on the csv (train, eval or test)

    if csv_path != 'test':
        with open(csv_path, 'r') as csvFile:
            reader = csv.reader(csvFile)
            for row in reader:
                clips.append(row[0])

        csvFile.close()
        clips.remove('fname')
    else:
        clips = os.listdir(dir)

    if os.path.exists(dir+'/resampled/'):
        shutil.rmtree(dir+'/resampled', ignore_errors=True)  # ignore errors with read-only files

    os.mkdir(dir+'/resampled')

    for clip in clips:
        # Audio clip is read
        data, sr = sf.read(dir+'/'+clip)
        data = data.T
        # Audio data is resampled to desired sample_rate
        if sr != sample_rate:
            data_resampled = librosa.resample(data, sr, sample_rate)
        else:
            data_resampled = data
        # Processed data is saved into a directory under train_clip_dir
        sf.write(dir+'/resampled/'+clip, data_resampled, sample_rate, subtype='PCM_16')

    print('Audio data has been resampled successfully')
    elapsed_time = time.time() - start_time
    print('Elapsed time ' + str(elapsed_time) + ' seconds')
Example No. 31
def save_sound(dst, sound):
    """Save a sound to a file."""
    # Save without resampling
    sf.write(dst, sound[0], sound[1])
    return None
def save_wav(path, wav, sr):
    import soundfile as sf
    sf.write(path, wav, sr)
    pass
Example No. 33
    parser.add_argument('--batch-size', type=int, default=16)

    args, _ = parser.parse_known_args()
    train_dataset, valid_dataset, args = load_datasets(parser, args)

    # Iterate over training dataset
    total_training_duration = 0
    for k in tqdm.tqdm(range(len(train_dataset))):
        x, y = train_dataset[k]
        total_training_duration += x.shape[1] / train_dataset.sample_rate
        if args.save:
            import soundfile as sf
            sf.write(
                "test/" + str(k) + 'x.wav',
                x.detach().numpy().T,
                44100,
            )
            sf.write(
                "test/" + str(k) + 'y.wav',
                y.detach().numpy().T,
                44100,
            )

    print("Total training duration (h): ", total_training_duration / 3600)
    print("Number of train samples: ", len(train_dataset))
    print("Number of validation samples: ", len(valid_dataset))

    # iterate over dataloader
    train_dataset.seq_duration = args.seq_dur
    train_dataset.random_chunks = True
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 19 08:49:04 2021

@author: CS
"""

# Change the sample rate so that everything is 44.1 kHz

import librosa
import numpy as np
import soundfile as sf
import os

file_path = 'D:/Project/DCASE_test/Data/Data_ShipsEar/'
sr_output = 44100
out_path = 'D:/Project/DCASE_test/Data/test/'

file_list = os.listdir(file_path)
for file in file_list:
    wav_path = file_path + file
    data, sr = librosa.load(wav_path, None)

    data_output = librosa.resample(data.astype(np.float32), sr, sr_output)

    out_name = out_path + file
    sf.write(out_name, data_output, sr_output)
To save a NumPy array as a WAV file, you can use wavio.write():

import wavio

wavio.write("myfile.wav", my_np_array, fs, sampwidth=2)
In this example, my_np_array is a NumPy array containing audio, fs is the sample rate of the recording (usually 44100 or 48000 Hz), and sampwidth is the sampling width of the audio (the number of bytes per sample, typically 1 or 2 bytes).
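
For instance, my_np_array could be a short test tone built with NumPy; this is a minimal sketch (the 440 Hz tone and half-scale amplitude are illustrative, not part of the original example):

import numpy as np
import wavio

fs = 44100                                     # sample rate in Hz
t = np.arange(fs) / fs                         # one second of sample times
tone = 0.5 * np.sin(2 * np.pi * 440 * t)       # 440 Hz sine at half amplitude
my_np_array = (tone * 32767).astype(np.int16)  # scale to the 16-bit integer range

wavio.write("myfile.wav", my_np_array, fs, sampwidth=2)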

soundfile
The soundfile library can read and write all file formats supported by libsndfile. Although it can’t play back audio, it allows you to convert audio from and to FLAC, AIFF, and a few less common audio formats. To convert a WAV file to FLAC, you can use the following code:

import soundfile as sf

# Extract audio data and sampling rate from file 
data, fs = sf.read('myfile.wav') 
# Save as FLAC file at correct sampling rate
sf.write('myfile.flac', data, fs)  
Similar code will work for converting between other file formats supported by libsndfile.
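
For example, the FLAC file written above can be converted to OGG Vorbis with the same read/write pair (a sketch; soundfile infers the container from the .ogg extension):

import soundfile as sf

# Read the FLAC file back and save it as OGG Vorbis
data, fs = sf.read('myfile.flac')
sf.write('myfile.ogg', data, fs)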

pydub
pydub lets you save audio in any format that ffmpeg supports, which includes nearly all audio types you might encounter in your daily life. For example, you can convert your WAV file to MP3 with the following code:

from pydub import AudioSegment
sound = AudioSegment.from_wav('myfile.wav')

sound.export('myfile.mp3', format='mp3')
Using AudioSegment.from_file() is a more general way of loading audio files. For example, if you want to convert your file back from MP3 to WAV, you can do the following:

from pydub import AudioSegment
sound = AudioSegment.from_file('myfile.mp3', format='mp3')

sound.export('myfile.wav', format='wav')
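
export() also accepts encoder options that are passed on to ffmpeg, so you can set the bitrate or attach tags while converting; a sketch (the 192k bitrate and tag values are illustrative):

from pydub import AudioSegment

sound = AudioSegment.from_file('myfile.wav', format='wav')
sound.export('myfile.mp3', format='mp3', bitrate='192k',
             tags={'artist': 'Unknown', 'title': 'myfile'})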
Example No. 36
sys.path.append(WAVERNN_FOLDER)

from gen_wavernn import generate
from utils import hparams as hp
from models.fatchord_version import WaveRNN

hp.configure(WAVERNN_FOLDER+'/hparams.py')
model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors,
                feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode).to('cpu')
model.load(CHECKPOINTS_FOLDER + "/" + wavernn_chpt)

y = []

ix=1
while os.path.exists(CHR_FOLDER+"/"+str(ix)+".npy"):
    y.append(np.load(CHR_FOLDER+"/"+str(ix)+".npy"))
    ix+=1

idx=1
for s in y:
    waveform = generate(model, s, hp.voc_gen_batched,
                      hp.voc_target, hp.voc_overlap)
    sf.write("wg-"+str(idx)+".wav", waveform, hp.sample_rate)
    idx+=1
    



Example No. 37
def write_file(file_path, data, samplerate=16000):
    """
        write_file('test.wav', full_data, samplerate)
    """
    sf.write(file_path, data, samplerate)
Example No. 38
from __future__ import division
import soundfile as sf 
from scipy import signal
import numpy as np  

x,fs1=sf.read('Sound_Noise.wav')
h,fs2=sf.read('transfer.wav')

# def dft(x):
# 	N=len(x)
# 	X=np.zeros((N,),dtype=complex)
# 	for k in range(0,N):
# 		for n in range(0,N):
# 			X[k]=X[k]+x[n]*np.exp(-np.pi*2j*k*n/N)
# 	return X

x1=np.fft.fft(x)
h1=np.fft.fft(h)

y=x1*h1

y1=np.fft.ifft(y)
y1=y1.real
sf.write('fft.wav',y1,fs1)
Example No. 39
def main():
    args = parse_args()

    data_dir = None
    if args.vocals:
        data_dir = vocal_dir
    else:
        data_dir = novocal_dir

    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)

    seq = 0
    for sd in args.stem_dirs:
        for song in os.scandir(sd):
            for dir_name, _, file_list in os.walk(song):
                instruments = [
                    os.path.join(dir_name, f) for f in file_list
                    if f.endswith(".wav")
                ]
                if instruments:
                    print("Found directory containing wav files: %d" % seq)
                    print(os.path.basename(dir_name).replace(" ", "_"))
                    loaded_wavs = [None] * len(instruments)
                    drum_track_index = -1
                    vocal_track_index = -1
                    mix_track_index = -1
                    for i, instrument in enumerate(instruments):
                        if "drum" in instrument.lower():
                            drum_track_index = i
                        elif "vocal" in instrument.lower():
                            vocal_track_index = i
                        elif "mix" in instrument.lower():
                            mix_track_index = i

                        # automatically resamples for us
                        loaded_wavs[i] = MonoLoader(
                            filename=instrument, sampleRate=args.sample_rate)()
                    track_len = len(loaded_wavs[0])

                    # ensure all stems have the same length
                    assert all(len(loaded_wavs[i]) == track_len
                               for i in range(1, len(loaded_wavs)))

                    # first create the full mix
                    harmonic_mix = sum([
                        l for i, l in enumerate(loaded_wavs) if i not in [
                            drum_track_index,
                            vocal_track_index,
                            mix_track_index,
                        ]
                    ])

                    full_mix = None
                    if args.vocals:
                        full_mix = (harmonic_mix +
                                    loaded_wavs[drum_track_index] +
                                    loaded_wavs[vocal_track_index])
                    else:
                        full_mix = harmonic_mix + loaded_wavs[drum_track_index]

                    seg_samples = int(
                        numpy.floor(args.segment_size * args.sample_rate))
                    total_segs = int(numpy.floor(track_len / seg_samples))

                    seg_limit = min(total_segs - 1, args.segment_limit)

                    for seg in range(seg_limit):
                        if seg < args.segment_offset:
                            continue
                        seqstr = "%03d%04d" % (seq, seg)

                        left = seg * seg_samples
                        right = (seg + 1) * seg_samples

                        harm_path = os.path.join(
                            data_dir, "{0}_harmonic.wav".format(seqstr))
                        mix_path = os.path.join(data_dir,
                                                "{0}_mix.wav".format(seqstr))
                        perc_path = os.path.join(
                            data_dir, "{0}_percussive.wav".format(seqstr))
                        vocal_path = os.path.join(
                            data_dir, "{0}_vocal.wav".format(seqstr))

                        soundfile.write(harm_path, harmonic_mix[left:right],
                                        args.sample_rate)
                        soundfile.write(mix_path, full_mix[left:right],
                                        args.sample_rate)

                        # write the drum track
                        soundfile.write(
                            perc_path,
                            loaded_wavs[drum_track_index][left:right],
                            args.sample_rate,
                        )

                        if args.vocals:
                            # write the vocal track
                            soundfile.write(
                                vocal_path,
                                loaded_wavs[vocal_track_index][left:right],
                                args.sample_rate,
                            )

                    seq += 1

                    if args.track_limit > -1:
                        if seq == args.track_limit:
                            return 0

    return 0
Example No. 40
import os
import time

import librosa
import soundfile

filename = 'test.mp3'
print('loading {}...'.format(filename))
start = time.time()
y, sr = librosa.load(filename, mono=True)
print('elapsed {} sec'.format(time.time() - start))

print('y.shape: ' + str(y.shape))
print('sr: ' + str(sr))
duration = librosa.get_duration(y, sr)
print('duration: ' + str(duration))
print('{:.4f} hours'.format(duration / 3600))

print('splitting...')
start = time.time()
intervals = librosa.effects.split(y, top_db=60)
print('elapsed {} sec'.format(time.time() - start))

print('intervals.shape: ' + str(intervals.shape))
# print(intervals)

print('saving...')
start = time.time()
for i, inter in enumerate(intervals):
    yt = y[inter[0]:inter[1]]
    filename = os.path.join('audios', 'clip_{}.wav'.format(i))
    soundfile.write(filename, yt, samplerate=sr)
print('elapsed {} sec'.format(time.time() - start))
print('done')
Example No. 41
 def write_wav(self, data, fs, filename, path='wav_out'):
     if not os.path.exists(path):
         os.makedirs(path)
     filepath = os.path.join(path, filename)
     print('Write to file: %s.' % filepath)
     sf.write(filepath, data.T, fs, subtype='PCM_16')
Example No. 42
    args.add_argument("--src", help="Source database file.")
    args.add_argument("--dst", help="New audio that maps to the source file.")
    args.add_argument("--out",
                      default="output.wav",
                      help="Name of output file.")

    return args.parse_args()


if __name__ == "__main__":

    args = setup_args()

    a_audio = args.src
    b_audio = args.dst

    _, sr = sf.read(a_audio)

    a_segments = build_segment_db(a_audio)
    b_segments = build_segment_db(b_audio)

    new_segments = []
    for s in b_segments:
        match = nearest(s.mfcc, a_segments)
        try:
            new_segments.extend(match.quarter)
        except AttributeError:
            pass

    sf.write(args.out, new_segments, sr)
Example No. 43
import soundfile as sf
import os

for file in os.listdir('./'):
    if file.startswith('LA'):
        for wav in os.listdir(file):
            data, fs = sf.read(file + os.sep + wav)
            thre = max(abs(data))
            ndata = data / thre * 0.8
            sf.write(file + os.sep + wav, ndata, fs)

Example No. 44
def write_wav_skip_existing(path, y, sr, norm=False):
    if not os.path.exists(path):
        soundfile.write(path, y, sr, "PCM_16")
    else:
        print("WARNING: Tried writing audio to " + path +
              ", but audio file exists already. Skipping file!")
Example No. 45
    def gpu_decode(feat_list, gpu):
        with torch.cuda.device(gpu):
            with torch.no_grad():
                model_waveform = GRU_WAVE_DECODER_DUALGRU_COMPACT_MBAND(
                    feat_dim=config.mcep_dim + config.excit_dim,
                    upsampling_factor=config.upsampling_factor,
                    hidden_units=config.hidden_units_wave,
                    hidden_units_2=config.hidden_units_wave_2,
                    kernel_size=config.kernel_size_wave,
                    dilation_size=config.dilation_size_wave,
                    n_quantize=config.n_quantize,
                    causal_conv=config.causal_conv_wave,
                    right_size=config.right_size,
                    n_bands=config.n_bands,
                    pad_first=True,
                    lpc=config.lpc)
                logging.info(model_waveform)
                model_waveform.cuda()
                model_waveform.load_state_dict(
                    torch.load(args.checkpoint)["model_waveform"])
                model_waveform.remove_weight_norm()
                model_waveform.eval()
                for param in model_waveform.parameters():
                    param.requires_grad = False
                torch.backends.cudnn.benchmark = True

                # define generator
                if args.string_path is None:
                    string_path = config.string_path
                else:
                    string_path = args.string_path
                logging.info(string_path)
                generator = decode_generator(
                    feat_list,
                    batch_size=args.batch_size,
                    upsampling_factor=config.upsampling_factor,
                    excit_dim=config.excit_dim,
                    string_path=string_path)

                # decode
                time_sample = []
                n_samples = []
                n_samples_t = []
                count = 0
                pqmf = PQMF(config.n_bands).cuda()
                print(
                    f'{pqmf.subbands} {pqmf.A} {pqmf.taps} {pqmf.cutoff_ratio} {pqmf.beta}'
                )
                for feat_ids, (batch_feat, n_samples_list) in generator:
                    logging.info("decoding start")
                    start = time.time()
                    logging.info(batch_feat.shape)

                    #batch_feat = F.pad(batch_feat.transpose(1,2), (model_waveform.pad_left,model_waveform.pad_right), "replicate").transpose(1,2)
                    samples = model_waveform.generate(batch_feat)
                    logging.info(samples.shape)  # B x n_bands x T//n_bands
                    samples = pqmf.synthesis(
                        samples)[:,
                                 0].cpu().data.numpy()  # B x 1 x T --> B x T
                    logging.info(samples.shape)

                    samples_list = samples

                    time_sample.append(time.time() - start)
                    n_samples.append(max(n_samples_list))
                    n_samples_t.append(
                        max(n_samples_list) * len(n_samples_list))

                    for feat_id, samples, samples_len in zip(
                            feat_ids, samples_list, n_samples_list):
                        #wav = np.clip(samples[:samples_len], -1, 1)
                        wav = np.clip(samples[:samples_len], -1,
                                      0.999969482421875)
                        outpath = os.path.join(args.outdir, feat_id + ".wav")
                        sf.write(outpath, wav, args.fs, "PCM_16")
                        logging.info("wrote %s." % (outpath))
                    #break

                    #figname = os.path.join(args.outdir, feat_id+"_wav.png")
                    #plt.subplot(2, 1, 1)
                    #plt.plot(wav_src)
                    #plt.title("source wave")
                    #plt.subplot(2, 1, 2)
                    #plt.plot(wav)
                    #plt.title("generated wave")
                    #plt.tight_layout()
                    #plt.savefig(figname)
                    #plt.close()

                    count += 1
                    #if count >= 3:
                    #if count >= 6:
                    #if count >= 1:
                    #    break

                logging.info("average time / sample = %.6f sec (%ld samples) [%.3f kHz/s]" % (\
                    sum(time_sample)/sum(n_samples), sum(n_samples), sum(n_samples)/(1000*sum(time_sample))))
                logging.info("average throughput / sample = %.6f sec (%ld samples) [%.3f kHz/s]" % (\
                sum(time_sample)/sum(n_samples_t), sum(n_samples_t), sum(n_samples_t)/(1000*sum(time_sample))))
Example No. 46
        total_length = onsets[-1] + samples[
            (len(onsets) % round_robin - 1 + round_robin) %
            round_robin].shape[0]

        f = np.zeros([total_length, channels])

        index = 0
        for ost in onsets:
            f[ost:ost + samples[index].shape[0], :] += samples[index]
            index = (index + 1) % round_robin

        print('output file %s have %d channels' % (args.outfile, channels))
        norm_factor = np.max(f)
        if norm_factor > 1:
            f /= norm_factor
        sf.write(args.outfile, np.squeeze(f), samplerate)
    else:  ##output midi
        file = pretty_midi.PrettyMIDI(resolution=960, initial_tempo=bpm)
        drum_prog = pretty_midi.instrument_name_to_program('Steel Drums')
        trig_drum = pretty_midi.Instrument(program=drum_prog)

        for ost in onsets:
            time = float(ost) / samplerate
            note = pretty_midi.Note(velocity=127,
                                    pitch=36,
                                    start=time,
                                    end=time + 0.001)
            trig_drum.notes.append(note)

        file.instruments.append(trig_drum)
        file.write(args.outfile)
Example No. 47
p.terminate()
print('Finished recording')
# Save the recorded data as a WAV file
wf = wave.open(filename, 'wb')
wf.setnchannels(channels)
wf.setsampwidth(p.get_sample_size(sample_format))
wf.setframerate(fs)
wf.writeframes(b''.join(frames))
wf.close()
playsound(filename)

# Read the audio data
fs, data = wavfile.read(filename)
data = data[:, 0]

# Normalize the data
data = normalize(data)
# Preprocess the raw data
# Filter requirements.
order = 10
cutoff = 4000  # desired cutoff frequency of the filter, Hz
y = butter_lowpass_filter(data, cutoff, fs, order)
noise = y[0:20000]
y_reduced_noise = nr.reduce_noise(audio_clip=y,
                                  noise_clip=noise,
                                  verbose=False)
y_reduced_noise = normalize(y_reduced_noise)

# Write to file
sf.write(filename, y_reduced_noise, fs)
playsound(filename)
Example No. 48
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    text = "相对论直接和间接的催生了量子力学的诞生 也为研究微观世界的高速运动确立了全新的数学模型"
    text = pinyin.get(text, format="numerical", delimiter=" ")
    print(text)
    sequence = np.array(text_to_sequence(text))[None, :]
    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plot_data((mel_outputs.float().data.cpu().numpy()[0],
               mel_outputs_postnet.float().data.cpu().numpy()[0],
               alignments.float().data.cpu().numpy()[0].T))

    ensure_folder('images')
    plt.savefig('images/mel_spec.jpg')

    mel_outputs_postnet = mel_outputs_postnet.type(torch.float16)
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

    audio = audio[0].data.cpu().numpy()
    audio = audio.astype(np.float32)

    print('audio.shape: ' + str(audio.shape))
    print(audio)

    sf.write('output.wav', audio, sampling_rate, 'PCM_24')
Example No. 49
import sounddevice as sd
import soundfile as sf

samplerate = 44100  # Hertz
duration = 30  # seconds
filename = 'output.wav'

mydata = sd.rec(int(samplerate * duration),
                samplerate=samplerate,
                channels=2,
                blocking=True)
sf.write(filename, mydata, samplerate)
Example No. 50
table_size = 1024
lfo_freq = 2
mod_amount = 70

lfo = generate_wave(samplerate, duration, lfo_freq)

# phase = dry_phase(samplerate, duration, base_freq)
phase = fm_phase(samplerate, base_freq, mod_amount, lfo)
# phase = pm_phase(samplerate, base_freq, mod_amount, lfo)

additive = additive_saw(samplerate, base_freq, phase)
naive = naive_saw(samplerate, phase, table_size)
linterp = yoshimi_saw_linterp(samplerate, base_freq, phase, table_size)
cubic = yoshimi_saw_cubic(samplerate, base_freq, phase, table_size)
sinc = sinc_saw(samplerate, base_freq, phase, table_size)

# errors = calc_error(additive, naive, linterp, cubic, sinc)
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(errors)

# compare(additive, naive)
# compare(additive, linterp)
# compare(additive, cubic)
# compare(additive, sinc)

soundfile.write("snd/modAdditive.wav", normalize(additive), samplerate)
soundfile.write("snd/modNaive.wav", normalize(naive), samplerate)
soundfile.write("snd/modLinterp.wav", normalize(linterp), samplerate)
soundfile.write("snd/modCubic.wav", normalize(cubic), samplerate)
soundfile.write("snd/modSinc.wav", normalize(sinc), samplerate)
Example No. 51
def write_audio(path, audio, sample_rate):
    soundfile.write(file=path, data=audio, samplerate=sample_rate)
Example No. 52
            torch.nn.Conv1d(W, W, 3, 1, 1, 1, bias=False),
            AdaptiveBatchNorm1d(W),
            torch.nn.LeakyReLU(0.2),
            torch.nn.Conv1d(W, 1, 1, 1, 0, bias=True),
        )

    def forward(self, input):
        return self.conv(input)


with torch.no_grad():
    model = DenoiseNetwork()

    speech_data = torch.load("SPEECH.pt")
    noise_data = torch.load("NOISE.pt")

    speech_sample = speech_data[0:220500].view(1, 1, -1)
    noise_sample = noise_data[0:220500].view(1, 1, -1)
    w = torch.rand(1)
    noisy_sample = (w * speech_sample) + ((1 - w) * noise_sample)

    output = model(noisy_sample)

    print(output)

    print(torch.nn.functional.mse_loss(output, speech_sample))

    soundfile.write("speech_sample.wav", speech_sample.view(-1).numpy(), 22050)
    soundfile.write("noise_sample.wav", noise_sample.view(-1).numpy(), 22050)
    soundfile.write("noisy_sample.wav", noisy_sample.view(-1).numpy(), 22050)
    soundfile.write("output_sample.wav", output.view(-1).numpy(), 22050)
Example No. 53
def py_gen_decay_indices(limit, n):
    # generates indices for STM input that mimic a decaying behaviour of memory
    result = [1]
    if n > 1:
        ratio = (float(limit) / result[-1])**(1.0 / (n - len(result)))
    while len(result) < n:
        next_value = result[-1] * ratio
        if next_value - result[-1] >= 1:
            result.append(next_value)
        else:
            result.append(result[-1] + 1)
            ratio = (float(limit) / result[-1])**(1.0 / (n - len(result)))

    indices = list(map(lambda x: -round(x) + limit, result[::-1]))
    return indices


def tf_gen_decay_indices(limit, n):
    return tf.py_function(gen_decay_indices, [limit, n], tf.int32)


if __name__ == "__main__":
    # Check preprocessing
    x = load_audio(HP.train_path, HP.sr)
    x = encode_mulaw(x, HP.bits)
    x = decode_mulaw(x, HP.bits)
    x = encode_16bit(x)

    sf.write("outputs/final/preprocess_check.wav", x, HP.sr)
Example No. 54
        os.makedirs("sounds")
    for octave in range(1, 4):
        freqs.extend((freqs_base * octave).tolist())

    env = create_env_cos()
    t = np.arange(0, env.size) / sr

    #plt.plot(t, env)

    for f, freq in enumerate(freqs):
        fn_out = "sounds/%.2fHz.wav" % (freq)
        if Osc == "square":
            T = int(sr / freq)
            square = np.zeros((T, )) - 1
            square[int(T / 4):int(-T / 4)] = 1
            sample = np.tile(square, (int(np.ceil(sr * DUR / T)), ))
            sample = sample[:int(sr * DUR)]

        elif Osc == "saw":
            T = int(sr / freq)
            saw = np.zeros((T, ))
            saw[:int(T / 2)] = np.linspace(-1, 1, int(T / 2))
            saw[int(T / 2):] = np.linspace(1, -1, T - int(T / 2))
            sample = np.tile(saw, (int(np.ceil(sr * DUR / T)), ))
            sample = sample[:int(sr * DUR)]

        sample *= env
        sf.write(fn_out, 0.707 * sample / np.max(np.abs(sample)), sr)

# %%
Example No. 55
def main():

    print "Generation started at:", datetime.strftime(datetime.now(),
                                                      '%Y-%m-%d %H:%M')

    args = get_args()

    SEQ_LEN = args.seq_len
    CON_DIM = args.con_dim
    CON_FRAME_SIZE = args.con_frame_size
    BIG_FRAME_SIZE = args.big_frame_size
    FRAME_SIZE = args.frame_size
    WEIGHT_NORM = args.weight_norm
    EMB_SIZE = args.emb_size
    RNN_HIDDEN_DIM = args.rnn_hidden_dim
    RNN_TYPE = args.rnn_type
    DNN_HIDDEN_DIM = args.dnn_hidden_dim
    LEARN_H0 = args.learn_h0
    Q_LEVELS = args.q_levels
    Q_TYPE = args.q_type
    BATCH_SIZE = args.batch_size
    WAV_OUT_PATH = args.wav_out_path
    RESTORE_FROM = args.restore_from
    H0_MULT = 2 if RNN_TYPE == 'LSTM' else 1

    assert SEQ_LEN % CON_FRAME_SIZE == 0,\
        'seq_len should be divisible by con_frame_size'
    assert CON_FRAME_SIZE % BIG_FRAME_SIZE == 0,\
        'con_frame_size should be divisible by big_frame_size'
    assert BIG_FRAME_SIZE % FRAME_SIZE == 0,\
        'big_frame_size should be divisible by frame_size'

    if os.path.exists(TEST_WAV_SORTED_LIST) and os.path.exists(
            TEST_CON_SORTED_LIST):
        print 'Test sorted list already exists!'
    else:
        if not os.path.exists(SORTED_LIST_PATH):
            os.makedirs(SORTED_LIST_PATH)
        print 'generating test sorted list...'
        generate_sorted_list(TEST_WAV_PATH, TEST_CON_PATH, SAMPLE_RATE,
                             TEST_WAV_SORTED_LIST, TEST_CON_SORTED_LIST)
        print 'Done.'

    if not os.path.exists(WAV_OUT_PATH):
        os.makedirs(WAV_OUT_PATH)

    data_feeder = load_data(TEST_WAV_SORTED_LIST, TEST_CON_SORTED_LIST,
                            BATCH_SIZE, SEQ_LEN, CON_FRAME_SIZE, CON_DIM,
                            Q_LEVELS, Q_TYPE, SAMPLE_RATE)

    con_ph = tf.placeholder(dtype=tf.float32, shape=[None, CON_DIM])
    con_mask_ph = tf.placeholder(dtype=tf.int32, shape=[None, CON_FRAME_SIZE])
    big_wav_ph = tf.placeholder(dtype=tf.int32, shape=[None, BIG_FRAME_SIZE])
    big_mask_ph = tf.placeholder(dtype=tf.int32, shape=[None, BIG_FRAME_SIZE])
    wav_ph = tf.placeholder(dtype=tf.int32, shape=[None, FRAME_SIZE])
    mask_ph = tf.placeholder(dtype=tf.int32, shape=[None, FRAME_SIZE])
    wav_t_ph = tf.placeholder(dtype=tf.int32, shape=[
        None,
    ])
    mask_t_ph = tf.placeholder(dtype=tf.int32, shape=[
        None,
    ])

    con_h0_ph = tf.placeholder(dtype=tf.float32,
                               shape=[None, H0_MULT * sum(RNN_HIDDEN_DIM[0])])
    big_h0_ph = tf.placeholder(dtype=tf.float32,
                               shape=[None, H0_MULT * sum(RNN_HIDDEN_DIM[1])])
    h0_ph = tf.placeholder(dtype=tf.float32,
                           shape=[None, H0_MULT * sum(RNN_HIDDEN_DIM[2])])

    con_frame_level_output_ph = tf.placeholder(
        dtype=tf.float32, shape=[None, 1, RNN_HIDDEN_DIM[0][-1]])
    big_frame_level_output_ph = tf.placeholder(
        dtype=tf.float32, shape=[None, 1, RNN_HIDDEN_DIM[1][-1]])
    frame_level_output_ph = tf.placeholder(dtype=tf.float32,
                                           shape=[None, RNN_HIDDEN_DIM[2][-1]])
    prev_samples_ph = tf.placeholder(dtype=tf.int32, shape=[None, FRAME_SIZE])
    sample_level_output_ph = tf.placeholder(dtype=tf.float32,
                                            shape=[None, Q_LEVELS])

    reset_ph = tf.placeholder(dtype=tf.int32, shape=[])
    argmax_ph = tf.placeholder(dtype=tf.int32, shape=[])

    with tf.variable_scope('SampleRNNModel', reuse=None):

        net = SampleRNNModel(seq_len=SEQ_LEN,
                             con_dim=CON_DIM,
                             con_frame_size=CON_FRAME_SIZE,
                             big_frame_size=BIG_FRAME_SIZE,
                             frame_size=FRAME_SIZE,
                             weight_norm=WEIGHT_NORM,
                             emb_size=EMB_SIZE,
                             rnn_hidden_dim=RNN_HIDDEN_DIM,
                             dnn_hidden_dim=DNN_HIDDEN_DIM,
                             rnn_type=RNN_TYPE,
                             learn_h0=LEARN_H0,
                             q_levels=Q_LEVELS)

        con_frame_level_output, con_h0 = net.con_frame_level_rnn(
            con_ph, con_mask_ph, con_h0_ph, reset_ph)
        big_frame_level_output, big_h0 = net.big_frame_level_rnn(
            big_wav_ph, big_mask_ph, con_frame_level_output_ph, big_h0_ph,
            reset_ph)
        frame_level_output, h0 = net.frame_level_rnn(
            wav_ph, mask_ph, big_frame_level_output_ph, h0_ph, reset_ph)
        sample_level_output = net.sample_level_predictor(
            frame_level_output_ph, prev_samples_ph)
        new_sample, ce_loss_t, accuracy_t = net.create_generator(
            sample_level_output_ph, wav_t_ph, mask_t_ph, argmax_ph)

    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))

    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=400)
    load_model(saver, sess, RESTORE_FROM)

    try:
        ce_loss_list = []
        accuracy_list = []

        wav_mat_list = []
        mask_mat_list = []

        samples_number = 0
        count = 0

        print 'Generation!'
        start_time = time()

        for _wav_batch, _mask_batch, _con_batch, _reset_batch, _end_batch, _end_epoch in data_feeder:

            if _reset_batch == 1:
                CON_H0 = np.zeros(
                    (BATCH_SIZE, H0_MULT * sum(RNN_HIDDEN_DIM[0])))
                BIG_H0 = np.zeros(
                    (BATCH_SIZE, H0_MULT * sum(RNN_HIDDEN_DIM[1])))
                H0 = np.zeros((BATCH_SIZE, H0_MULT * sum(RNN_HIDDEN_DIM[2])))

                samples = np.full((_wav_batch.shape[0], CON_FRAME_SIZE),
                                  np.int32((Q_LEVELS - 1) // 2),
                                  dtype='int32')

                cumulative_ce_loss = np.zeros((_wav_batch.shape[0], ),
                                              dtype='float32')
                cumulative_accuracy = np.zeros((_wav_batch.shape[0], ),
                                               dtype='float32')
                cumulative_mask = np.zeros((_wav_batch.shape[0], ),
                                           dtype='int32')

                mask_gen = _mask_batch[:, CON_FRAME_SIZE:]
                index = CON_FRAME_SIZE
            else:
                mask_gen = np.concatenate(
                    [mask_gen, _mask_batch[:, CON_FRAME_SIZE:]], axis=1)

            for t in xrange(CON_FRAME_SIZE, SEQ_LEN + CON_FRAME_SIZE):
                if t % CON_FRAME_SIZE == 0:
                    CON_FRAME_OUTPUT, CON_H0 = sess.run(
                        [con_frame_level_output, con_h0],
                        feed_dict={
                            con_ph:
                            _con_batch[:, (t // CON_FRAME_SIZE) *
                                       CON_DIM:(t // CON_FRAME_SIZE + 1) *
                                       CON_DIM],
                            con_mask_ph:
                            _mask_batch[:, t:t + CON_FRAME_SIZE],
                            con_h0_ph:
                            CON_H0,
                            reset_ph:
                            _reset_batch
                        })

                if t % BIG_FRAME_SIZE == 0:
                    BIG_FRAME_OUTPUT, BIG_H0 = sess.run(
                        [big_frame_level_output, big_h0],
                        feed_dict={
                            big_wav_ph:
                            samples[:, index - BIG_FRAME_SIZE:index],
                            big_mask_ph:
                            _mask_batch[:, t:t + BIG_FRAME_SIZE],
                            con_frame_level_output_ph:
                            CON_FRAME_OUTPUT[:, (t / BIG_FRAME_SIZE) %
                                             (CON_FRAME_SIZE /
                                              BIG_FRAME_SIZE)].reshape(
                                                  _wav_batch.shape[0], 1, -1),
                            big_h0_ph:
                            BIG_H0,
                            reset_ph:
                            _reset_batch
                        })

                if t % FRAME_SIZE == 0:
                    FRAME_OUTPUT, H0 = sess.run(
                        [frame_level_output, h0],
                        feed_dict={
                            wav_ph:
                            samples[:, index - FRAME_SIZE:index],
                            mask_ph:
                            _mask_batch[:, t:t + FRAME_SIZE],
                            big_frame_level_output_ph:
                            BIG_FRAME_OUTPUT[:, (t / FRAME_SIZE) %
                                             (BIG_FRAME_SIZE /
                                              FRAME_SIZE)].reshape(
                                                  _wav_batch.shape[0], 1, -1),
                            h0_ph:
                            H0,
                            reset_ph:
                            _reset_batch
                        })

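                # The sample-level predictor conditions on the current frame-level
                # output and the previous FRAME_SIZE generated samples.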
                SAMPLE_OUTPUT = sess.run(sample_level_output,
                                         feed_dict={
                                             frame_level_output_ph:
                                             FRAME_OUTPUT[:, t % FRAME_SIZE],
                                             prev_samples_ph:
                                             samples[:,
                                                     index - FRAME_SIZE:index]
                                         })

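                # Use the argmax for the first ARGMAX_SAMPLES steps, then switch
                # to sampling from the softmax distribution.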
                if index < ARGMAX_SAMPLES + CON_FRAME_SIZE:
                    NEW_SAMPLE, CE_LOSS, ACCURACY = sess.run(
                        [new_sample, ce_loss_t, accuracy_t],
                        feed_dict={
                            sample_level_output_ph: SAMPLE_OUTPUT,
                            wav_t_ph: _wav_batch[:, t],
                            mask_t_ph: _mask_batch[:, t],
                            argmax_ph: 1
                        })
                else:
                    NEW_SAMPLE, CE_LOSS, ACCURACY = sess.run(
                        [new_sample, ce_loss_t, accuracy_t],
                        feed_dict={
                            sample_level_output_ph: SAMPLE_OUTPUT,
                            wav_t_ph: _wav_batch[:, t],
                            mask_t_ph: _mask_batch[:, t],
                            argmax_ph: 0
                        })

                cumulative_ce_loss += CE_LOSS
                cumulative_accuracy += ACCURACY
                cumulative_mask += _mask_batch[:, t]

                samples = np.concatenate([samples, NEW_SAMPLE], axis=1)

                index += 1

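            # At the end of each utterance batch, record the mask-normalized loss
            # and accuracy and keep the generated samples for writing out later.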
            if _end_batch == 1:
                ce_loss_list.extend(list(cumulative_ce_loss / cumulative_mask))
                accuracy_list.extend(
                    list(cumulative_accuracy / cumulative_mask))

                wav_mat_list.append(samples[:, CON_FRAME_SIZE:])
                mask_mat_list.append(mask_gen)

        ce_loss = np.mean(ce_loss_list)
        accuracy = np.mean(accuracy_list)

        # The sorted test list provides the output file name for each utterance.
        with open(TEST_WAV_SORTED_LIST, 'r') as fid:
            gen_id_list = fid.readlines()

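        # De-quantize each generated utterance (mu-law expansion when Q_TYPE is
        # 'mu-law'), trim the padded region using its mask, and write a 16-bit
        # PCM wav named after the corresponding test-list entry.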
        for i in xrange(len(wav_mat_list)):
            samples_number += wav_mat_list[i].shape[0] * wav_mat_list[i].shape[1]
            for j in xrange(wav_mat_list[i].shape[0]):
                samplei = wav_mat_list[i][j]
                maski = mask_mat_list[i][j]
                samplei = samplei[0:len(np.where(maski == 1)[0])]
                if Q_TYPE == 'mu-law':
                    from datasets.audio_reader import mu2linear
                    samplei = mu2linear(samplei, Q_LEVELS)

                sf.write(
                    WAV_OUT_PATH + os.sep +
                    gen_id_list[count].split()[0].split('/')[-1], samplei,
                    SAMPLE_RATE, 'PCM_16')
                count += 1

        generation_time = time() - start_time

        log = "{} samples generated in {} hours.\nThe time of generating 1 second speech is {} seconds.\n"
        log += "Performance:\n\tloss:{:.4f}\taccuracy:{:.4f}%\n"
        log = log.format(len(gen_id_list), generation_time / 3600,
                         generation_time / samples_number * SAMPLE_RATE,
                         ce_loss, accuracy * 100)
        print log

        with open(LOG_FILE, 'a') as fid_log:
            fid_log.write(log)

        print "Generation ended at:", datetime.strftime(
            datetime.now(), '%Y-%m-%d %H:%M')

    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print ''
Ejemplo n.º 56
0
def record(length=1, reclength=1, filename=None, thres=0):
    """ 
    Merekam suara secara stream dan metode callback
    """

    global cumulated_status, end_count, start_count, recording, magnitudo, audiodata, predicting, i_quit, listening
    predicting = False
    listening = True
    end_count = False
    start_count = 0
    recording = False
    magnitudo = []
    audiodata = []
    try:
        import sounddevice as sd

        #samplerate = sd.query_devices(args.device, 'input')['default_samplerate']
        samplerate = 16000.0

        delta_f = (high - low) / screenwidth
        fftsize = np.ceil(samplerate / delta_f).astype(int)
        low_bin = int(np.floor(low / delta_f))

        cumulated_status = sd.CallbackFlags()

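        # The callback computes an FFT magnitude and an RMS level for every
        # block; recording starts once the RMS crosses `thres`, and a prediction
        # is triggered after roughly one second of captured audio.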
        def callback(indata, frames, time, status):
            global cumulated_status, audiodata, magnitudo, end_count, start_count, recording, smodel, predicting, i_quit

            cumulated_status |= status
            if any(indata):
                magnitude = np.abs(np.fft.rfft(indata[:, 0], n=fftsize))
                magnitude *= gain / fftsize

                # Block-level RMS of the mono input, scaled to the int16 range
                rms = np.sqrt(np.mean(np.square(indata[:, 0])))
                rms = int(rms * 32768)
                start_count += 1
                if rms >= thres:
                    if not recording:  #and not end_count
                        #print("Start record")
                        recording = True
                        start_count = 0

                if recording:
                    audiodata.extend(itertools.chain(indata.tolist()))
                    magnitudo.append(magnitude)
                    if start_count == int(samplerate /
                                          (samplerate * DURATION / 1000)):
                        #print("End record")
                        start_count = 0
                        end_count = True
                        recording = False
                        try:
                            if not predicting:
                                print("Predict")
                                soundfile.write("temp.wav", audiodata, 16000)
                                predict("temp.wav", model=smodel)
                                predicting = False
                            pass
                        except Exception:
                            # Swallow prediction errors so the audio callback keeps running
                            pass
                        audiodata = []

        with sd.InputStream(device=None,
                            channels=1,
                            callback=callback,
                            blocksize=int(samplerate * DURATION / 1000),
                            samplerate=samplerate):
            while True:
                #response = input()
                #if response in ('', 'q', 'Q'):
                if not listening:
                    #time.sleep(length)
                    break
                sd.sleep(100)  # yield instead of busy-waiting while the callback records
            if filename is not None:
                soundfile.write(filename, audiodata, 16000)

        if cumulated_status:
            logging.warning(str(cumulated_status))
    except Exception as e:
        print(e)
Ejemplo n.º 57
0
def main(fft_window_size, fft_window_step):
    """Generates the audio and PESQ vs SNR graphs for a given STFT
    setup. Saves the graphs and generated audio files to disk.

    Args:
        fft_window_size: The FFT window size.
        fft_window_step: The FFT window step.
    """

    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
    print(fft_window_size, ' ', fft_window_step, ' ', (fft_window_size // 2))

    original_audio, _ = sf.read(WAVEFORM_PATH)
    original_audio = original_audio.astype(np.float32)

    for representation in REPRESENTATIONS:
        REPRESENTATIONS[representation]['perceptual_errors'] = []
        REPRESENTATIONS[representation]['waveforms'] = []

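    # For each SNR, evaluate every representation N_REPEATS times and average
    # the resulting PESQ scores before plotting.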
    for snr in SNRS:
        pb_i = utils.Progbar(len(REPRESENTATIONS) * N_REPEATS)
        print('SNR: ', snr)
        for representation in REPRESENTATIONS:
            all_perceptual_errors = []
            for _ in range(N_REPEATS):
                perceptual_errors, audio_hats = process_representation_at_snr(
                    representation, original_audio, snr, fft_window_size,
                    fft_window_step)
                all_perceptual_errors.append(perceptual_errors)
                pb_i.add(1)

            print(' ', representation, ' -> ', np.mean(all_perceptual_errors,
                                                       0))
            REPRESENTATIONS[representation]['perceptual_errors'].append(
                np.mean(all_perceptual_errors, 0))
            REPRESENTATIONS[representation]['waveforms'].append(audio_hats)

    # Plot the graph
    for representation in REPRESENTATIONS:
        perceptual_errors = REPRESENTATIONS[representation][
            'perceptual_errors']
        perceptual_errors = np.array(perceptual_errors)

        plot = plt.plot(SNRS, perceptual_errors[:, 0], label=representation)
        for i in range(perceptual_errors.shape[-1] - 1):
            plt.plot(SNRS,
                     perceptual_errors[:, i + 1],
                     color=plot[0].get_color(),
                     linestyle=LINE_STYLES[i])

    plt.xlabel('SNR')
    plt.ylabel('PESQ')
    plt.legend()

    file_name = 'pesq_vs_snr__{}ws_{}s'.format(fft_window_size,
                                               fft_window_step)
    plt.savefig(os.path.join(RESULTS_PATH, file_name),
                bbox_inches='tight',
                dpi=920)
    plt.clf()

    # Save the audio files
    setup = 'audio_{}ws_{}s'.format(fft_window_size, fft_window_step)
    base_audio_dir = os.path.join(RESULTS_PATH, setup)

    for representation in REPRESENTATIONS:
        audio_dir = os.path.join(base_audio_dir, representation)
        os.makedirs(audio_dir, exist_ok=True)

        for i, audio in enumerate(
                REPRESENTATIONS[representation]['waveforms']):
            for j, wav in enumerate(audio):
                file_path = os.path.join(
                    audio_dir, '{}_{}db_{}.wav'.format(representation, SNRS[i],
                                                       j))
                sf.write(file_path, wav, SAMPLE_RATE)
Ejemplo n.º 58
0
def clean_getfirst3secs(audiofile):
    data, samplerate = sf.read(audiofile)
    os.remove(audiofile)
    data2 = data[0:samplerate * 3]
    sf.write(audiofile, data2, samplerate)
    return [audiofile]
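# A minimal usage sketch (hypothetical path): trim a recording in place and
# pass the returned single-element list to a downstream pipeline stage.
#
#   clipped = clean_getfirst3secs('/tmp/utterance.wav')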
Ejemplo n.º 59
0
sec = int(time.time())  # current Unix timestamp

ofname = 'R_' + str(sec) + '.wav'

try:
    data, fs = sf.read(args.filename, dtype='float32')
    #print("Play fs: %d\n",fs)

    # sd.play(data, fs, device=args.device)  # play 'data' array at fs sample rate
    recdata = sd.playrec(
        data, fs, channels=recChan,
        device=args.device)  # play data array AND record recChan channels
    # recdata is a [n][recChan] array of float32 type (?)
    status = sd.wait()  # wait here until playback is done

    #pk1 = np.amax(recdata[:,0])  # peak recorded value (positive)
    #pk2 = np.amax(recdata[:,1])

    opath = outdir + "/" + ofname
    sf.write(opath, recdata, fs)  # save recorded data to file
    tstamp = ofname[2:-4]
    # print("%s %d %4.2f %4.2f " % (tstamp,int(fs/1000),pk1,pk2),end="")
    print("%s" % opath)

except KeyboardInterrupt:
    parser.exit('\nInterrupted by user')
except Exception as e:
    parser.exit(type(e).__name__ + ': ' + str(e))
if status:
    parser.exit('Error during playback: ' + str(status))
Ejemplo n.º 60
0
    def test(self, args):
        with open(args.tt_list, 'r') as f:
            self.tt_list = [line.strip() for line in f.readlines()]
        self.model_file = args.model_file
        self.ckpt_dir = args.ckpt_dir
        self.est_path = args.est_path
        self.write_ideal = args.write_ideal
        self.gpu_ids = tuple(map(int, args.gpu_ids.split(',')))
        if len(self.gpu_ids) == 1 and self.gpu_ids[0] == -1:
            # cpu only
            self.device = torch.device('cpu')
        else:
            # gpu
            self.device = torch.device('cuda:{}'.format(self.gpu_ids[0]))

        if not os.path.isdir(self.ckpt_dir):
            os.makedirs(self.ckpt_dir)
        logger = getLogger(os.path.join(self.ckpt_dir, 'test.log'), log_file=True)

        # create a network
        net = Net()
        logger.info('Model summary:\n{}'.format(net))

        net = net.to(self.device)

        # calculate model size
        param_count = numParams(net)
        logger.info('Trainable parameter count: {:,d} -> {:.2f} MB\n'.format(param_count, param_count*32/8/(2**20)))
        
        # training criterion and optimizer
        criterion = LossFunction()
        
        # net feeder
        feeder = NetFeeder(self.device, self.win_size, self.hop_size)
        
        # resynthesizer
        resynthesizer = Resynthesizer(self.device, self.win_size, self.hop_size)
        
        # load model
        logger.info('Loading model from {}'.format(self.model_file))
        ckpt = CheckPoint()
        ckpt.load(self.model_file, self.device)
        net.load_state_dict(ckpt.net_state_dict)
        logger.info('model info: epoch {}, iter {}, cv_loss - {:.4f}\n'.format(ckpt.ckpt_info['cur_epoch']+1,
            ckpt.ckpt_info['cur_iter']+1, ckpt.ckpt_info['cv_loss']))
        
        net.eval()
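        # Run inference over each entry in the test list: estimate every mixture,
        # accumulate the frame-weighted loss, and write normalized wav files.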
        for i in range(len(self.tt_list)):
            # create a data loader for testing
            tt_loader = AudioLoader(self.tt_list[i], self.sample_rate, unit='utt',
                                    segment_size=None, segment_shift=None,
                                    batch_size=1, buffer_size=10,
                                    in_norm=self.in_norm, mode='eval')
            logger.info('[{}/{}] Estimating on {}'.format(i+1, len(self.tt_list), self.tt_list[i]))

            est_subdir = os.path.join(self.est_path, self.tt_list[i].split('/')[-1].replace('.ex', ''))
            if not os.path.isdir(est_subdir):
                os.makedirs(est_subdir)
        
            accu_tt_loss = 0.
            accu_n_frames = 0        
            for k, egs in enumerate(tt_loader):
                mix = egs['mix']
                sph = egs['sph']
                n_samples = egs['n_samples']

                n_frames = countFrames(n_samples, self.win_size, self.hop_size)
                
                mix = mix.to(self.device)
                sph = sph.to(self.device)

                feat, lbl = feeder(mix, sph)

                with torch.no_grad():
                    loss_mask = lossMask(shape=lbl.shape, n_frames=n_frames, device=self.device)
                    est = net(feat)
                    loss = criterion(est, lbl, loss_mask, n_frames)

                accu_tt_loss += loss.data.item() * sum(n_frames)
                accu_n_frames += sum(n_frames)
                
                sph_idl = resynthesizer(lbl, mix)
                sph_est = resynthesizer(est, mix)
                
                # save estimates
                mix = mix[0].cpu().numpy()
                sph = sph[0].cpu().numpy()
                sph_est = sph_est[0].cpu().numpy()
                sph_idl = sph_idl[0].cpu().numpy()
                mix, sph, sph_est, sph_idl = wavNormalize(mix, sph, sph_est, sph_idl)
                sf.write(os.path.join(est_subdir, '{}_mix.wav'.format(k)), mix, self.sample_rate)
                sf.write(os.path.join(est_subdir, '{}_sph.wav'.format(k)), sph, self.sample_rate)
                sf.write(os.path.join(est_subdir, '{}_sph_est.wav'.format(k)), sph_est, self.sample_rate)
                if self.write_ideal:
                    sf.write(os.path.join(est_subdir, '{}_sph_idl.wav'.format(k)), sph_idl, self.sample_rate)

            avg_tt_loss = accu_tt_loss / accu_n_frames
            logger.info('loss: {:.4f}\n'.format(avg_tt_loss))

        return