def convert_mp3_to_wav(cfg, mp3_string):
    """
    Convert a string with mp3 to a string with audio in the default format
    (mono, pcm16, default sample rate).

    SoX cannot work on in-memory buffers, so the data takes a round trip
    through two temporary files.  Both files are removed again before the
    function returns, whether or not the conversion succeeded.

    Arguments:
        cfg -- configuration dictionary; the target sample rate is read
               from cfg['Audio']['sample_rate']
        mp3_string -- byte buffer holding the MP3 data

    Returns whatever load_wav() produces for the converted WAV file.
    """
    import os  # local import: only needed for the temporary-file cleanup

    sample_rate = cfg['Audio']['sample_rate']

    tmp1path = tmp2path = None
    tmp2fh = None
    try:
        # write the buffer to a temporary file
        tmp1fd, tmp1path = mkstemp()
        with fdopen(tmp1fd, 'wb') as tmp1fh:
            tmp1fh.write(mp3_string)

        # transform the temporary file using SoX (can't do this in memory :-()
        tmp2fd, tmp2path = mkstemp()
        # Wrap the descriptor right away so that it is always closed below.
        tmp2fh = fdopen(tmp2fd, 'rb')
        sox_in = pysox.CSoxStream(tmp1path, fileType='mp3')
        sox_out = pysox.CSoxStream(tmp2path, 'w',
                                   pysox.CSignalInfo(sample_rate, 1, 16),
                                   fileType='wav')
        sox_chain = pysox.CEffectsChain(sox_in, sox_out)
        sox_chain.add_effect(pysox.CEffect("rate", [str(sample_rate)]))
        sox_chain.flow_effects()
        sox_out.close()

        # read the transformation results back to the buffer
        return load_wav(cfg, tmp2fh)
    finally:
        if tmp2fh is not None:
            tmp2fh.close()
        # mkstemp() leaves deletion to the caller.
        for path in (tmp1path, tmp2path):
            if path is not None:
                try:
                    os.unlink(path)
                except OSError:
                    # Best-effort cleanup; never mask the real error.
                    pass
def convert_wav(cfg, wav):
    """
    Convert the given WAV byte buffer into the desired sample rate using SoX.

    Assumes mono + 16-bit sample size.

    SoX cannot work on in-memory buffers, so the data takes a round trip
    through two temporary files.  Both files are removed again before the
    function returns, whether or not the conversion succeeded.

    Arguments:
        cfg -- configuration dictionary; the target sample rate is read
               from cfg['Audio']['sample_rate']
        wav -- byte buffer holding the complete input WAV file

    Returns whatever load_wav() produces for the converted WAV file.
    """
    import os  # local import: only needed for the temporary-file cleanup

    sample_rate = cfg['Audio']['sample_rate']

    tmp1path = tmp2path = None
    tmp2fh = None
    try:
        # write the buffer to a temporary file
        tmp1fd, tmp1path = mkstemp()
        with fdopen(tmp1fd, 'wb') as tmp1fh:
            tmp1fh.write(wav)

        # transform the temporary file using SoX (can't do this in memory :-()
        tmp2fd, tmp2path = mkstemp()
        # Wrap the descriptor right away so that it is always closed below.
        tmp2fh = fdopen(tmp2fd, 'rb')
        sox_in = pysox.CSoxStream(tmp1path)
        sox_out = pysox.CSoxStream(tmp2path, 'w',
                                   pysox.CSignalInfo(sample_rate, 1, 16),
                                   fileType='wav')
        sox_chain = pysox.CEffectsChain(sox_in, sox_out)
        sox_chain.add_effect(pysox.CEffect("rate", [str(sample_rate)]))
        sox_chain.flow_effects()
        sox_out.close()

        # read the transformation results back to the buffer
        return load_wav(cfg, tmp2fh)
    finally:
        if tmp2fh is not None:
            tmp2fh.close()
        # mkstemp() leaves deletion to the caller.
        for path in (tmp1path, tmp2path):
            if path is not None:
                try:
                    os.unlink(path)
                except OSError:
                    # Best-effort cleanup; never mask the real error.
                    pass
def to_wav(src_path, wav_path, sample_rate=16000):
    """
    Convert an audio file to a mono, 16-bit WAV file using SoX.

    Arguments:
        src_path -- path of the input audio file (the format is left for
                    SoX to detect)
        wav_path -- path of the WAV file to write
        sample_rate -- output sample rate in Hz; defaults to 16000, the
                       value that used to be hard-coded here
    """
    sox_in = pysox.CSoxStream(src_path)
    sox_out = pysox.CSoxStream(wav_path, 'w',
                               pysox.CSignalInfo(sample_rate, 1, 16),
                               fileType='wav')
    sox_chain = pysox.CEffectsChain(sox_in, sox_out)
    # The "rate" effect takes its target rate as a string argument.
    sox_chain.add_effect(pysox.CEffect('rate', [str(sample_rate)]))
    sox_chain.flow_effects()
    sox_out.close()
def downsample(fs, sig):
    """
    Resample a signal to SAMPLE_RATE by round-tripping it through SoX.

    The signal is first zero-padded to a whole number of frames of
    fs * WINDOW_SIZE samples, written to a scratch WAV file, resampled by
    SoX into a second scratch file, and read back.  Both scratch files are
    created in the current working directory and are removed again even if
    one of the steps fails.

    Arguments:
        fs -- sample rate of the input signal
        sig -- array of samples (written as a single channel)

    Returns the frames read back from the resampled file.
    """
    in_file = random_string() + ".wav"
    out_file = random_string() + ".wav"

    # Pad with zeros up to the next multiple of the frame length.
    frame_len = fs * WINDOW_SIZE
    pad = len(sig) % frame_len
    if pad > 0:
        sig = np.append(sig, np.zeros(frame_len - pad))

    try:
        f = Sndfile(in_file, 'w',
                    Format(type="wav", encoding='pcm16', endianness="file"),
                    1, fs)
        try:
            f.write_frames(sig)
        finally:
            f.close()

        sox_in = pysox.CSoxStream(in_file)
        sox_out = pysox.CSoxStream(out_file, 'w',
                                   pysox.CSignalInfo(SAMPLE_RATE, 1, 8),
                                   fileType='wav')
        sox_chain = pysox.CEffectsChain(sox_in, sox_out)
        sox_chain.add_effect(pysox.CEffect("rate", [str(SAMPLE_RATE)]))
        sox_chain.flow_effects()
        sox_out.close()

        f = Sndfile(out_file, 'r')
        try:
            sig = f.read_frames(f.nframes)
        finally:
            f.close()
    finally:
        # Either file may be missing if an earlier step raised.
        for path in (in_file, out_file):
            if os.path.exists(path):
                os.unlink(path)

    return sig
def extractParam(matrice, spkId, wavFile):
    '''
    Compute short-time-energy statistics over all turns of one speaker.

    Each speaker turn is cut out of wavFile with the SoX "trim" effect into
    a temporary WAV file next to it, analysed on 10 ms windows every 5 ms,
    and the per-segment energies are concatenated.  The statistics are
    computed over the concatenation of ALL segments.  (Previously they were
    computed on the last segment only, and the function failed with a
    NameError when the speaker had no turns.)

    Arguments:
        matrice -- passed through to _getSpeakerTurns
        spkId -- speaker identifier, passed through to _getSpeakerTurns
        wavFile -- path of the source WAV file

    Returns a dict mapping statistic names to floats rounded to four
    significant digits.
    '''
    spkTurns = _getSpeakerTurns(matrice, spkId)
    shortTimeEnergyContainer = numpy.empty((0))

    for seg in spkTurns:
        tempFile = wavFile.split('.wav')[0] + '_' + str(seg[0]) + '_' + str(
            seg[1]) + '_' + str(seg[2]) + '.wav'
        try:
            # == trim sox via pysox == #
            infile = pysox.CSoxStream(wavFile)
            outfile = pysox.CSoxStream(tempFile, 'w', infile.get_signal())
            chain = pysox.CEffectsChain(infile, outfile)
            # trim takes (start, duration); seg[1] and seg[2] are divided by
            # 100 here -- presumably centiseconds, TODO confirm.
            effect = pysox.CEffect(
                'trim',
                [str(seg[1] / 100.), str((seg[2] - seg[1]) / 100.)])
            chain.add_effect(effect)
            chain.flow_effects()
            outfile.close()

            # == loading segment == #
            wavData, fe = _loadingWavFile(tempFile)
            # 10 ms window and 5 ms step, expressed in samples.
            windowSizeSample = 10 * fe / 1000
            stepSizeSample = 5 * fe / 1000

            # == computing and concatenating short time energy == #
            shortTimeEnergy = _getShortTimeEnergy(wavData, fe,
                                                  windowSizeSample,
                                                  stepSizeSample)
            shortTimeEnergyContainer = numpy.concatenate(
                (shortTimeEnergyContainer, shortTimeEnergy))
        finally:
            # Remove the segment file even when SoX or the analysis failed.
            if os.path.exists(tempFile):
                os.remove(tempFile)

    # (key, statistic) pairs, in the order the result dict is filled.
    statistics = (
        ('avgPowerSignal', _getAvgPowerSignal),
        ('stdPowerSignal', _getStdPowerSignal),
        ('avgLowPowerSignal', _getAvgLowPowerSignal),
        ('stdLowPowerSignal', _getStdLowPowerSignal),
        ('minLowPowerSignal', _getMinLowPowerSignal),
        ('maxLowPowerSignal', _getMaxLowPowerSignal),
        ('avgHighPowerSignal', _getAvgHighPowerSignal),
        ('stdHighPowerSignal', _getStdHighPowerSignal),
        ('minHighPowerSignal', _getMinHighPowerSignal),
        ('maxHighPowerSignal', _getMaxHighPowerSignal),
    )

    spkTrame = dict()
    for key, compute in statistics:
        spkTrame[key] = float("%0.3e" % (compute(shortTimeEnergyContainer)))
    return spkTrame
def process_and_move(self, dl_entry):
    """
    Normalize a downloaded audio file and move it into the media folder.

    The file at dl_entry.file_path is gain-normalized with SoX, written
    in self.output_format, handed to self.unmunge_to_mediafile() for its
    final name, and the source file is deleted.  The value returned by
    unmunge_to_mediafile() is returned.

    An IOError from SoX on anything other than an '.mp3' file is
    re-raised.
    """
    # NB. The sox import is not checked *here*: this method is only used
    # when the import already worked in __init__.py.
    extension = dl_entry.file_extension
    source_path = dl_entry.file_path
    try:
        stream_in = pysox.CSoxStream(source_path)
    except IOError:
        if extension != '.mp3':
            # Nothing we can do about the error after all.
            raise
        # Some SoX builds cannot read mp3 (those that can should not have
        # raised), so let pydub turn the mp3 into a temporary wav first.
        audio = AudioSegment.from_mp3(source_path)
        with tempfile.NamedTemporaryFile(delete=False,
                                         suffix='.wav') as wav_tmp:
            wav_path = wav_tmp.name
        audio.export(wav_path, format='wav')
        os.remove(source_path)
        source_path = wav_path
        stream_in = pysox.CSoxStream(source_path)

    # From here on both paths are equivalent: stream_in holds the audio.
    # Write to a(nother) temp file, because pysox seems to have problems
    # with files with non-ASCII names.
    with tempfile.NamedTemporaryFile(delete=False,
                                     suffix=self.output_format) as out_tmp:
        target_path = out_tmp.name

    # Earlier versions also up-mixed mono to stereo (not needed, and
    # broken in pysox) and stripped leading/trailing silence (which
    # either clipped speech or removed nothing).  Only normalization is
    # left.
    stream_out = pysox.CSoxStream(target_path, 'w', stream_in.get_signal())
    effects = pysox.CEffectsChain(stream_in, stream_out)
    effects.add_effect(pysox.CEffect('gain', [b'-n']))
    effects.flow_effects()
    stream_out.close()

    os.remove(source_path)
    return self.unmunge_to_mediafile(target_path, dl_entry.base_name,
                                     self.output_format)
def augment(name, num):
    """
    Create randomly distorted copies of one WAV file.

    Reads <args.wav_dir>/<name>.wav and writes `num` variants named
    <name>_rand_01.wav, <name>_rand_02.wav, ... into the same directory.
    Each variant gets a random pitch bend, tremolo, tempo change and
    volume change, applied in that order.

    A variant whose effects chain fails is reported on stdout and
    skipped (best effort); its name is still part of the result.

    Arguments:
        name -- base name of the source file, without the ".wav" suffix
        num -- number of variants to create

    Returns the list of variant base names (without ".wav").
    """
    src_path = os.path.join(args.wav_dir, name) + ".wav"

    # Open the source once just to measure it, then release the stream.
    infile = pysox.CSoxStream(src_path)
    try:
        info = infile.get_signal().get_signalinfo()
    finally:
        infile.close()
    # presumably the duration in seconds -- TODO confirm the units of
    # info['length'] for multi-channel input
    length = info['length']/info['rate']

    names = [name + "_rand_" + str(n).zfill(2) for n in range(1, num+1)]
    for n in names:
        infile = pysox.CSoxStream(src_path)
        outfile = pysox.CSoxStream(os.path.join(args.wav_dir, n) + ".wav",
                                   'w', pysox.CSignalInfo(16000.0, 1, 16))
        try:
            chain = pysox.CEffectsChain(infile, outfile)

            # Pitch bend
            b = np.random.ranf()*0.5*length + 0.15*length
            p = np.random.randint(-250, 250)
            bend = pysox.CEffect(
                "bend",
                [bytes(str(b)+','+str(p)+','+str(length - b - 0.01),
                       'utf-8')])
            chain.add_effect(bend)

            # Tremolo
            s = np.random.randint(2, 5)/length
            d = np.random.randint(30, 80)
            tremolo = pysox.CEffect(
                "tremolo", [bytes(str(s), 'utf-8'), bytes(str(d), 'utf-8')])
            chain.add_effect(tremolo)

            # Tempo
            t = np.random.randint(6, 14)*0.1
            tempo = pysox.CEffect("tempo", [b'-s', bytes(str(t), 'utf-8')])
            chain.add_effect(tempo)

            # Gain
            g = np.random.ranf()*1.25 + 0.25
            vol = pysox.CEffect("vol", [bytes(str(g), 'utf-8')])
            chain.add_effect(vol)

            try:
                chain.flow_effects()
            except Exception:
                # Best effort: report the failing variant and carry on.
                # Exception (not a bare except) lets Ctrl-C through.
                print(n, b, length)
        finally:
            outfile.close()
            infile.close()
    return names
def find_noise(in_path):
    """
    Play the end of a recording and ask whether it is only noise.

    The input is reversed and trimmed to its first second by SoX, the
    result is written to a temporary .mp3 file and played, and the user
    is asked to confirm.

    Returns (0, 1, True) when the user answers "y" or just presses
    Enter.  In the code visible here any other answer falls through and
    returns None.
    """
    source = pysox.CSoxStream(in_path)

    # Try 1 second from the end first
    with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp:
        tail_path = tmp.name
    tail_out = pysox.CSoxStream(tail_path, 'w', source.get_signal())

    chain = pysox.CEffectsChain(source, tail_out)
    tail_effects = (
        ('reverse', []),
        ('trim', [b'0', b'1']),
    )
    for effect_name, effect_args in tail_effects:
        chain.add_effect(pysox.CEffect(effect_name, effect_args))
    chain.flow_effects()

    tail_out.close()
    source.close()

    play(tail_path)
    answer = input('Is this only noise? [Y/n] ').lower()
    if answer == 'y' or answer == '':
        return (0, 1, True)
def extractParam(matrice, spkId, wavFile):
    '''
    Compute pitch, vocalic and silence statistics for one speaker.

    Each speaker turn is cut out of wavFile with the SoX "trim" effect into
    a temporary WAV file next to it; the _getSCV and _getPitch results of
    all segments are accumulated and the statistics are computed over the
    accumulated data.  The temporary file is removed even when SoX or the
    feature extraction fails.

    Arguments:
        matrice -- passed through to _getSpeakerTurns
        spkId -- speaker identifier, passed through to _getSpeakerTurns
        wavFile -- path of the source WAV file

    Returns a dict mapping statistic names to floats rounded to four
    significant digits.
    '''
    spkTurns = _getSpeakerTurns(matrice, spkId)
    scvContainer = []
    pitchContainer = []

    for seg in spkTurns:
        tempFile = (wavFile.split('.wav')[0] + '_' + str(seg[0]) + '_' +
                    str(seg[1]) + '_' + str(seg[2]) + '.wav')
        print(tempFile)
        try:
            # == trim sox via pysox == #
            infile = pysox.CSoxStream(wavFile)
            outfile = pysox.CSoxStream(tempFile, 'w', infile.get_signal())
            chain = pysox.CEffectsChain(infile, outfile)
            # trim takes (start, duration); seg[1] and seg[2] are divided by
            # 100 here -- presumably centiseconds, TODO confirm.
            effect = pysox.CEffect(
                'trim',
                [str(seg[1] / 100.), str((seg[2] - seg[1]) / 100.)])
            chain.add_effect(effect)
            chain.flow_effects()
            outfile.close()

            scvContainer += _getSCV(seg, tempFile)
            # NOTE(review): _getPitch receives the full spkTurns list while
            # _getSCV receives only the current seg -- confirm this
            # asymmetry is intended.
            pitchContainer += _getPitch(spkTurns, tempFile)
        finally:
            # Remove the segment file even when an earlier step raised.
            if os.path.exists(tempFile):
                os.remove(tempFile)

    pitchContainer = numpy.asarray(pitchContainer)
    scvContainer = numpy.asarray(scvContainer)

    def _round3(value):
        # Round to four significant digits via scientific notation.
        return float("%0.3e" % (value))

    spkTrame = dict()
    spkTrame['avgPitch'] = _round3(_getAvgPitch(pitchContainer))
    spkTrame['stdPitch'] = _round3(_getStdPitch(pitchContainer))
    spkTrame['maxPitch'] = _round3(_getMaxPitch(pitchContainer))
    spkTrame['voicedZoneRate'] = _round3(
        _getVoicedZoneRate(_getPitchedNb(pitchContainer),
                           temporalParam._getOvActi(spkTurns)))
    spkTrame['nbVocalic'] = _round3(_getNbVocalic(_getVowel(scvContainer)))
    spkTrame['rateVocalic'] = _round3(
        _getRateVocalic(_getVowel(scvContainer),
                        temporalParam._getOvActi(spkTurns)))
    spkTrame['avgVocalicLen'] = _round3(
        _getAvgVocalicLen(_getVowel(scvContainer)))
    spkTrame['stdVocalicLen'] = _round3(
        _getStdVocalicLen(_getVowel(scvContainer)))
    spkTrame['nbSilence'] = _round3(_getNbSilence(_getSilence(scvContainer)))
    spkTrame['rateSilence'] = _round3(
        _getRateSilence(_getSilence(scvContainer),
                        temporalParam._getOvActi(spkTurns)))
    spkTrame['avgSilenceLen'] = _round3(
        _getAvgSilenceLen(_getSilence(scvContainer)))
    spkTrame['stdSilenceLen'] = _round3(
        _getStdSilenceLen(_getSilence(scvContainer)))
    return spkTrame
def change_tempo(cfg, tempo, wav):
    """
    Change tempo of an input WAV byte buffer.

    SoX cannot work on in-memory buffers, so the data takes a round trip
    through two temporary files.  Both files are removed again before the
    function returns, whether or not the conversion succeeded.

    Arguments:
        cfg -- configuration dictionary; the output sample rate is read
               from cfg['Audio']['sample_rate']
        tempo -- factor handed to the SoX "tempo" effect
        wav -- audio buffer, written to disk with save_wav()

    Returns whatever load_wav() produces for the transformed WAV file.
    """
    import os  # local import: only needed for the temporary-file cleanup

    sample_rate = cfg['Audio']['sample_rate']

    tmp1path = tmp2path = None
    tmp2fh = None
    try:
        # write the buffer to a temporary file
        tmp1fd, tmp1path = mkstemp()
        # save_wav() writes through the path, so the descriptor returned
        # by mkstemp() is not needed; close it instead of leaking it.
        os.close(tmp1fd)
        save_wav(cfg, tmp1path, wav)

        # transform the temporary file using SoX (can't do this in memory :-()
        tmp2fd, tmp2path = mkstemp()
        # Wrap the descriptor right away so that it is always closed below.
        tmp2fh = fdopen(tmp2fd, 'rb')
        sox_in = pysox.CSoxStream(tmp1path)
        sox_out = pysox.CSoxStream(tmp2path, 'w',
                                   pysox.CSignalInfo(sample_rate, 1, 16),
                                   fileType='wav')
        sox_chain = pysox.CEffectsChain(sox_in, sox_out)
        sox_chain.add_effect(pysox.CEffect("tempo", [str(tempo)]))
        sox_chain.flow_effects()
        sox_out.close()

        # read the transformation results back to the buffer
        return load_wav(cfg, tmp2fh)
    finally:
        if tmp2fh is not None:
            tmp2fh.close()
        # mkstemp() leaves deletion to the caller.
        for path in (tmp1path, tmp2path):
            if path is not None:
                try:
                    os.unlink(path)
                except OSError:
                    # Best-effort cleanup; never mask the real error.
                    pass
dest='model_file', help='the model file to use', default='models/params.pkl') args = parser.parse_args() # downsample file to 8KHz, 8 bits per sample in_file = args.input_file out_file = random_string() + ".wav" sox_in = pysox.CSoxStream(in_file) sox_out = pysox.CSoxStream(out_file, 'w', pysox.CSignalInfo(SAMPLE_RATE, 1, 8), fileType='wav') sox_chain = pysox.CEffectsChain(sox_in, sox_out) sox_chain.add_effect(pysox.CEffect("rate", [str(SAMPLE_RATE)])) sox_chain.flow_effects() sox_out.close() input_file = Sndfile(out_file, 'r') fs = input_file.samplerate num_frames = input_file.nframes window = args.window_size / 1000. chunk_size = int(np.floor(window * fs)) mlp = MLP_VAD(args.model_file) noise_seconds = [] frame_num = 0
# Mix low-pass-filtered Gaussian noise into the signal and write the result
# to disk so that SoX can post-process it.
noise = lp_filter(np.random.randn(*data2.shape), -0.5)
# NOTE(review): the signal weight is (3 - percent_noise), so the two weights
# do not sum to 1 -- confirm this gain is intended.
result = (3 - percent_noise) * data2 + percent_noise * noise
wavwrite(result, "G.wav", 44100)
# Open the file just written as the SoX input stream.
infile = pysox.CSoxStream('G.wav')
# Create the output file, using the same signal parameters as the input.
outfile = pysox.CSoxStream('G_Modified.wav','w',infile.get_signal())
# Create a new effects chain from input to output; effects are added to it
# one by one below.
chain = pysox.CEffectsChain(infile, outfile)
# Add chorus.
chorus = pysox.CEffect("chorus", [b'0.7', b'0.9', b'55', b'0.4', b'0.25', b'2', b'-t'])
chain.add_effect(chorus)
# Add reverb.
reverb = pysox.CEffect("reverb", [b'100'])
chain.add_effect(reverb)
# Add echo.
echo = pysox.CEffect("echo", [b'0.8', b'0.88', b'1000', b'0.4'])
chain.add_effect(echo)
# Add tremolo.  In SoX this is a low-frequency amplitude modulation, not a
# pitch bend.
tremolo = pysox.CEffect("tremolo", [b'5', b'50'])
chain.add_effect(tremolo)
# Adds overdrive
import cPickle, gzip #for storing our data import pickle data_dir = "../data/expanded_samples.pkl.gz" training_data_dir = "../data/training_data/" validation_data_dir = None test_data_dir = "../data/test_data/" #For each sample, make two augmented versions: #One with speed .9, #One with speed 1.1 . #speed(1.1), speed(.9) max_audio_len = 83521 #289*289 expansion_factor = 3 #amount data will be increased decrease_speed_effect = pysox.CEffect("speed", ['.9']) increase_speed_effect = pysox.CEffect("speed", ['1.1']) #effect = pysox.CEffect("vol", [ b'18dB' ]) def get_expanded_data(data_dir, data): label_num = 0 sample_num = 0 for f in os.listdir(os.path.abspath(data_dir)): f = os.path.abspath(data_dir + f) label = f.split("/")[-1] if os.path.isdir(f): local_sample_total = len( os.listdir(f)) #for use with new increments of filenames