def info_soundfile(fp):
    """Return basic header metadata of an audio file.

    The file header is read once with ``sf.info`` and the relevant fields
    are copied into a plain dict.

    Args:
        fp: Path (or any object accepted by ``sf.info``) of the audio file.

    Returns:
        dict with the keys ``'duration'``, ``'samples'``, ``'channels'``
        and ``'sampling_rate'``, taken from the header's ``duration``,
        ``frames``, ``channels`` and ``samplerate`` fields.
    """
    # Open the file once instead of once per field.
    meta = sf.info(fp)
    return {
        'duration': meta.duration,
        # Use the exact frame count from the header: int(duration * samplerate)
        # can come out one short because of float rounding.
        'samples': meta.frames,
        'channels': meta.channels,
        'sampling_rate': meta.samplerate,
    }
def __getitem__(self, idx):
    """Load the (noisy, clean, vad) entry at ``idx`` and return it as a sample.

    Both audio files are read over the same [start, stop) range, trimmed to
    a common length, and wrapped in a ``NoisySpeech`` together with the VAD
    intervals shifted by the read offset. ``self.transform`` is applied to
    the sample when it is set.
    """
    noisy_path, clean_path, vad_path = self.filepaths[idx]

    if self.select is None:
        nstart, nend = 0, None
    else:
        # Noisy and clean files can have unequal lengths, so the selection
        # is made on whichever file has fewer frames.
        noisy_is_longer = sf.info(noisy_path).frames > sf.info(clean_path).frames
        shorter_path = clean_path if noisy_is_longer else noisy_path
        nstart, nend = self.select(shorter_path)

    sigx, sr1 = audioread(noisy_path, start=nstart, stop=nend)
    sigs, sr2 = audioread(clean_path, start=nstart, stop=nend)
    assert sr1 == sr2

    # Trim the longer signal so both have the same number of samples.
    n_noisy, n_clean = len(sigx), len(sigs)
    if n_noisy > n_clean:
        sigx = sigx[:n_clean]
    elif n_clean > n_noisy:
        sigs = sigs[:n_noisy]

    # Shift the VAD timestamps by the start offset (start sample / rate).
    offset = nstart * 1. / sr1
    vadref = [(ts - offset, te - offset) for ts, te in self.tabread(vad_path)]

    sample = NoisySpeech(noisy=Audio(sigx, sr1),
                         clean=Audio(sigs, sr2),
                         vad=vadref)
    return self.transform(sample) if self.transform else sample
def load_wav(wavInPath, wavLength, printInfo=False):
    '''Load an audio file as a floating point time series.

    wavInPath: path to .wav file
    wavLength: length of audio to load in seconds (0 = full length)
    printInfo: when true, prints details of the loaded .wav to screen

    returns: (xSrc, numChannels, fs, xLength, xSamples)
        xSrc        - sample data as returned by sf.read
        numChannels - channel count reported by the file header
        fs          - sample rate returned by sf.read
        xLength     - loaded length in seconds
        xSamples    - loaded length in samples

    to unpack:
        if numChannels==1:
            xSrc_ch1 = xSrc
        elif numChannels==2:
            xSrc_ch1 = xSrc[:,0]
            xSrc_ch2 = xSrc[:,1]

    Terminates the process through sys.exit when wavLength is longer than
    the source file.
    '''
    # Read the header once; it provides channels, samplerate and duration.
    srcInfo = sf.info(wavInPath)
    numChannels = srcInfo.channels

    if wavLength == 0:
        # ** wavLength==0 - use full length of src .wav file **
        xSrc, fs = sf.read(wavInPath)
        xSamples = len(xSrc)
        # samples_to_time is defined elsewhere; its first element is used as
        # the length in seconds.
        xLength = samples_to_time(xSamples, fs)[0]
    else:
        xLength = wavLength
        if xLength > srcInfo.duration:
            sys.exit(
                'ERROR: wavLength setting exceeds the length of audio source')
        xSamples = int(time_to_samples(xLength, srcInfo.samplerate))
        # sf.read takes the channel layout from the file itself and rejects
        # an explicit `channels=` for non-RAW files, so every channel count
        # is read the same way.
        xSrc, fs = sf.read(wavInPath, frames=xSamples)

    if printInfo:
        print('number of Channels = ' + str(numChannels))
        print('length of input signal in seconds: ----- ' + str(xLength))
        print('length of input signal in samples: ----- ' + str(xSamples))
        print('audio sample rate: --------------------- ' + str(fs) + '\n')

    return xSrc, numChannels, fs, xLength, xSamples
def getSources(self, people, locations, duration):
    '''Build one ``Scene.Source`` per person from clips of a random chapter.

    For every entry of ``people["ID"]`` a random sub-directory of
    ``self.directory/<person>`` is chosen, its ``*.flac`` clips are read in
    sorted order and concatenated until at least ``duration`` seconds have
    been collected, and the result is cut to ``duration`` seconds.

    Args:
        people: mapping/frame whose ``"ID"`` column lists the person ids
            (presumably a DataFrame - confirm against the caller).
        locations: sequence indexed in parallel with ``people["ID"]``.
        duration: requested length in seconds (int or float).

    Returns:
        list of ``Scene.Source`` objects, one per person.

    Raises:
        IndexError: if the chosen chapter contains no ``.flac`` file.
    '''
    sources = []
    for index, person in enumerate(people["ID"]):
        fullPath = f"{self.directory}/{person}"
        book = random.choice(os.listdir(fullPath))
        sentences = [
            clip.replace("\\", "/")
            for clip in sorted(glob.glob(fullPath + f"/{book}/*.flac"))
        ]
        name = f"s{person}-b{book}-d"

        # The first clip is always used; its sample rate is the one reported
        # for the whole source.
        data, sampRate = sf.read(sentences[0])
        currDur = len(data) / sampRate
        chunks = [data]
        lastNum = "0000"
        for line in sentences[1:]:
            if currDur >= duration:
                break
            clip, clipRate = sf.read(line)
            # frames / samplerate is the clip duration, so the file does not
            # have to be opened a second time just for its header.
            currDur += len(clip) / clipRate
            chunks.append(clip)
            lastNum = line[-9:-5]  # isolate the line number from filename

        if len(chunks) > 1:
            # Join once at the end instead of re-copying the growing array on
            # every clip. ravel() keeps the flattening np.append performed.
            data = np.concatenate([np.ravel(chunk) for chunk in chunks])

        sources.append(
            Scene.Source(location=locations[index],
                         name=name + lastNum,
                         # int() keeps the slice valid for float durations.
                         data=(data[:int(sampRate * duration)], sampRate)))
    return sources
def check_audio_info(self, tar_path, ref_path):
    """
    Read the headers of the target and reference files and warn about
    anything that deviates from the expected input.

    Checks, in order: sampling rate (48 kHz), matching container format,
    channel count (mono), equal durations, upper duration threshold
    (``self.m_duration_thres_sec``), lower duration limit (0.48 s) and
    whether both files are byte-identical.

    Side effects: stores the paths in ``m_tar_path`` / ``m_ref_path``, the
    headers in ``m_tar_info`` / ``m_ref_info``, sets ``IS_TOO_LONG`` and
    ``IS_SAME_FILE``, and prints a separator line.
    """
    self.m_ref_path = ref_path
    self.m_tar_path = tar_path

    target = sf.info(self.m_tar_path)
    reference = sf.info(self.m_ref_path)

    self.IS_TOO_LONG = False
    self.IS_SAME_FILE = False

    # --- sampling rate
    if target.samplerate != 48000:
        warnings.warn(
            "target signal sampling rate is not 48kHz, it will be resampled"
        )
    if reference.samplerate != 48000:
        warnings.warn(
            "reference signal sampling rate is not 48kHz, it will be resampled"
        )

    # --- format
    if target.format != reference.format:
        warnings.warn(
            "Target and reference signals are in different formats")

    # --- channels
    if target.channels >= 2:
        warnings.warn(
            "target signal has channel number >= 2, it will be downmixed to mono"
        )
    if reference.channels >= 2:
        warnings.warn(
            "reference signal has channel number >= 2, it will be downmixed to mono"
        )

    # --- durations
    if target.duration != reference.duration:
        warnings.warn(
            "Target and reference signals have different durations; longer one will be truncated"
        )
    shortest = min(target.duration, reference.duration)
    if shortest >= self.m_duration_thres_sec:
        self.IS_TOO_LONG = True
        warnings.warn(
            "Files are longer than %d seconds; using segmental SMAQ" %
            self.m_duration_thres_sec)
    if shortest <= 0.48:  # minimum duration due to VISQOL patch size
        warnings.warn(
            "Files should be at least 0.48 second; unreliable results might be returned"
        )

    # --- identical content (full byte comparison, not just os.stat)
    if filecmp.cmp(self.m_tar_path, self.m_ref_path, shallow=False):
        self.IS_SAME_FILE = True
        warnings.warn("Files are bit-for-bit identical")

    print("=============================================================")
    self.m_tar_info = target
    self.m_ref_info = reference
def btn_open_test_file(self):
    """
    Callback when the GUI button for opening a test audio file is pressed.

    Shows a file dialog; an ``.mp3`` selection is converted to a ``.wav``
    next to it (unless that file already exists), then the chosen file is
    opened with soundfile and the sample-rate dependent widgets are updated.
    A message box is shown when the mp3 conversion fails.
    """
    dlg = QFileDialog()
    dlg.setFileMode(QFileDialog.AnyFile)
    dlg.setNameFilters([
        "WAV files (*.wav)",
        "MP3 (*.mp3)",
        # "MP4 (*.mp4)",
        "FLAC (*.flac)",
        "OGG (*.ogg)",
        "Other Audio Formats (*.AIFF *.AU *.RAW)"
    ])

    if not dlg.exec_():
        return

    filename = dlg.selectedFiles()[0]
    fname, ext = os.path.splitext(filename)
    if ext.lower() == ".mp3":
        try:
            print("Converting mp3 to wav")
            mp3 = filename
            # Cleared first so a failed conversion leaves nothing to open.
            filename = ""
            new_wav = fname + ".wav"
            if not os.path.isfile(new_wav):
                if new_wav not in self.converted_mp3:
                    sound = AudioSegment.from_mp3(mp3)
                    sound.export(new_wav, format="wav")
                    self.converted_mp3 += [new_wav]
            filename = new_wav
        except Exception:
            # Catch conversion failures only; a bare `except:` would also
            # swallow KeyboardInterrupt / SystemExit.
            msg = QMessageBox()
            msg.setIcon(QMessageBox.Information)
            msg.setText("Could not convert mp3. Try installing ffmpeg")
            msg.setWindowTitle("Warning")
            # msg.setDetailedText("The details are as follows:")
            msg.setStandardButtons(QMessageBox.Ok)
            msg.exec()

    if len(filename) > 0:
        self.txtTestFile.setText(filename)
        if self.wf is not None:
            self.wf.close()
        self.wf = sf.SoundFile(filename)
        self.wf_info = sf.info(filename)
        # Reuse the header that was just read instead of opening the file a
        # third time.
        self.fs = self.wf_info.samplerate
        self.combo_setindex_by_value(self.comboFS, self.fs)
        self.aa.set_properties(fs=self.fs)
        self.canvas.set_plot_properties(fs=self.fs)
        self.sliderWavPlayer.setValue(self.sliderWavPlayer.minimum())
        self.update_player_timelabel(0)
def __init__(self, playback_sound=None, rate=44100, chunk_size=1024):
    """Set up the PyAudio instance and the stream parameters.

    Args:
        playback_sound: optional path of a sound file to play. When given
            (truthy), ``recording_time`` and ``rate`` are taken from that
            file's header, overriding ``rate``.
        rate: sample rate used when no playback sound is given.
        chunk_size: stored as ``self.chunk_size``.
    """
    self.pa = pyaudio.PyAudio()
    self.rate = rate
    self.chunk_size = chunk_size
    self.FORMAT = pyaudio.paInt16
    self.play_sound = False

    # if given play that sound
    # (default is None rather than a shared mutable list; only the
    # truthiness of the argument is used, so callers are unaffected)
    if playback_sound:
        self.play_sound = True
        self.playback_sound = playback_sound
        # Read the header once for both duration and sample rate.
        sound_info = sf.info(playback_sound)
        self.recording_time = sound_info.duration
        print(self.recording_time)
        self.rate = sound_info.samplerate
def main(args):
    """Write ``train.tsv`` (and optionally ``valid.tsv``) manifests.

    Every file below ``args.root`` matching ``**/*.<args.ext>`` is listed as
    ``<path relative to root>\\t<frame count>``. Each manifest starts with a
    line holding the resolved root directory. A file goes to the validation
    manifest when a draw from ``random.Random(args.seed)`` is not greater
    than ``args.valid_percent``.

    Args:
        args: namespace with ``root``, ``dest``, ``ext``, ``seed``,
            ``valid_percent`` (0..1) and ``path_must_contain``.
            ``valid.tsv`` is only created when ``valid_percent > 0``.
    """
    from contextlib import ExitStack

    assert args.valid_percent >= 0 and args.valid_percent <= 1.0

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.dest, exist_ok=True)

    dir_path = os.path.realpath(args.root)
    search_path = os.path.join(dir_path, "**/*." + args.ext)
    rand = random.Random(args.seed)

    # ExitStack closes both manifests even if reading an audio file raises.
    with ExitStack() as stack:
        valid_f = (
            stack.enter_context(
                open(os.path.join(args.dest, "valid.tsv"), "w"))
            if args.valid_percent > 0 else None
        )
        train_f = stack.enter_context(
            open(os.path.join(args.dest, "train.tsv"), "w"))

        print(dir_path, file=train_f)
        if valid_f is not None:
            print(dir_path, file=valid_f)

        for fname in glob.iglob(search_path, recursive=True):
            file_path = os.path.realpath(fname)

            if args.path_must_contain and args.path_must_contain not in file_path:
                continue

            frames = soundfile.info(fname).frames
            # Always draw, so the random sequence is the same whether or not
            # a validation split exists.
            draw = rand.random()
            # Without a validation file everything goes to train; a draw of
            # exactly 0.0 must not select the missing handle.
            to_valid = valid_f is not None and draw <= args.valid_percent
            dest = valid_f if to_valid else train_f
            print("{}\t{}".format(os.path.relpath(file_path, dir_path),
                                  frames),
                  file=dest)
def from_file(path: Pathlike, recording_id: Optional[str] = None):
    """
    Build a ``Recording`` from the header of a single audio file.

    Use this when one physical file corresponds to one recording session.
    For sessions spread over several files (e.g. one per channel), create
    the ``Recording`` manually with one ``AudioSource`` per file instead.

    :param path: Path to an audio file readable by libsndfile (pysoundfile).
    :param recording_id: recording id; when not given, the file name's stem
        is used ("x.wav" -> "x").
    :return: a new ``Recording`` instance pointing to the audio file.
    """
    import soundfile

    header = soundfile.info(path)
    rec_id = Path(path).stem if recording_id is None else recording_id
    # A single file source that exposes every channel of the file.
    file_source = AudioSource(
        type='file',
        channels=list(range(header.channels)),
        source=str(path)
    )
    return Recording(
        id=rec_id,
        sampling_rate=header.samplerate,
        num_samples=header.frames,
        duration=header.duration,
        sources=[file_source]
    )
def get_num_segments(path, max_segment_length, min_segment_length):
    """
    Count how many sufficiently long segments an audio file contains.

    Args:
        path: Path to a single audio file
        max_segment_length (float): Maximum segment length in seconds.
            `None` disables segmenting, in which case 1 is returned.
        min_segment_length (float): Minimum length in seconds the trailing
            remainder must have to count as a segment of its own.

    Returns:
        int: number of segments, or 0 when the file header cannot be read.
    """
    # Reading the header doubles as a validity check of the file.
    try:
        header = sf.info(str(path))
    except RuntimeError:
        # SoundFile raises a `RuntimeError` when it fails to read a file :(
        return 0

    if max_segment_length is None:
        return 1

    length_seconds = header.frames / header.samplerate
    count = int(length_seconds / max_segment_length)
    # The leftover after the full segments counts if it is long enough.
    if length_seconds % max_segment_length >= min_segment_length:
        count += 1
    return count
def select_wav_file(self):
    '''
    Lets the user pick a wav/flac file and loads its header information
    into the dialog widgets (channel selector, start/end/max time).
    '''
    file = select_file(self, ['wav', 'flac'])
    if file is None:
        self.__signal = None
        return

    self.__clear()
    self.file.setText(file)

    import soundfile as sf
    self.__info = sf.info(file)

    # Channels are offered to the user 1-based.
    channel_count = self.__info.channels
    self.channelSelector.clear()
    for channel_no in range(1, channel_count + 1):
        self.channelSelector.addItem(f"{channel_no}")
    self.channelSelector.setEnabled(channel_count > 1)

    midnight = QtCore.QTime(0, 0, 0)
    self.startTime.setTime(midnight)
    self.startTime.setEnabled(True)

    # File duration in whole milliseconds, expressed as a QTime offset.
    self.__duration = math.floor(self.__info.duration * 1000)
    max_time = QtCore.QTime(0, 0, 0).addMSecs(self.__duration)
    for time_edit in (self.endTime, self.maxTime):
        time_edit.setMaximumTime(max_time)
        time_edit.setTime(max_time)
        if time_edit is self.endTime:
            time_edit.setEnabled(True)

    self.loadButton.setEnabled(True)
    self.updateChart.setEnabled(True)
def _vqt_fn(self, wav_file, err_db=False):
    """Run the MATLAB ``vqt_fn`` on ``wav_file`` and return its coefficients.

    Args:
        wav_file: path passed to MATLAB and to ``soundfile.info``.
        err_db: when truthy, a second output is requested from MATLAB,
            asserted to be at least 285 and logged.

    Returns:
        C-contiguous, owned float32 array of shape ``(num_frames, 336)``,
        where ``num_frames`` is derived from the file's frame count below.
    """
    with matlab.engine.start_matlab(option='-nojvm -nodesktop') as engine:
        if err_db:
            raw, err_db = engine.vqt_fn('wav_file', wav_file, nargout=2)
            assert err_db >= 285.
            logging.info('vqt accuracy - {} dB'.format(err_db))
        else:
            raw = engine.vqt_fn('wav_file', wav_file)
        # The MATLAB array is exposed flat; rebuild it in column-major order.
        coeffs = np.array(raw._data, dtype=np.float32).reshape(raw.size, order='F')

    header = soundfile.info(wav_file)
    target_sr = 44100
    assert header.samplerate >= target_sr

    expected = header.frames
    if header.samplerate > target_sr:
        # Sample count after conversion to 44100 Hz, rounded up.
        expected = (expected * target_sr + header.samplerate - 1) // header.samplerate
    # Two successive ceil-divisions, by 64 and then by 22.
    expected = (expected + 63) // 64
    expected = (expected + 21) // 22
    assert coeffs.shape == (expected, 336)

    return np.require(coeffs, dtype=np.float32, requirements=['C', 'O'])
def _setSndFromFile(self, filename):
    """Open ``filename`` with soundfile and derive the playback state.

    Sets ``sndFile``, ``sourceType``, ``sampleRate``, ``channels`` (only
    when it was ``-1``), ``t``, ``duration`` and ``durationFrames``. When
    ``preBuffer == -1`` the file is read completely, closed, and handed to
    ``_setSndFromArray``.
    """
    self.sndFile = f = sf.SoundFile(filename)
    self.sourceType = 'file'
    self.sampleRate = f.samplerate
    if self.channels == -1:  # if channels was auto then set to file val
        self.channels = f.channels
    fileDuration = sf.info(filename).duration

    # process start time
    if self.startTime and self.startTime > 0:
        startFrame = self.startTime * self.sampleRate
        self.sndFile.seek(int(startFrame))
        self.t = self.startTime
    else:
        self.t = 0

    # Audio actually left to play after the start offset.
    remainingDur = fileDuration - self.t

    # process stop time
    if self.stopTime and self.stopTime > 0:
        requestedDur = self.stopTime - self.t
        # Clamp against what remains after the offset, not the whole file;
        # otherwise a late start plus a stop beyond the end overestimates.
        self.duration = min(requestedDur, remainingDur)
    else:
        self.duration = remainingDur

    # can now calculate duration in frames
    self.durationFrames = int(round(self.duration * self.sampleRate))

    # are we preloading or streaming?
    if self.preBuffer == 0:
        # no buffer - stream from disk on each call to nextBlock
        pass
    elif self.preBuffer == -1:
        # full preload - read everything from the current position into an
        # array and continue from memory
        sndArr = self.sndFile.read(frames=len(self.sndFile))
        self.sndFile.close()
        self._setSndFromArray(sndArr)
def get_samplerate(path):
    '''Return the sampling rate of an audio file.

    The header is read with `soundfile`; if that raises `RuntimeError`,
    the file is opened with `audioread` instead.

    Parameters
    ----------
    path : string, int, or file-like
        The path to the file to be loaded
        As in `load()`, this can also be an integer or open file-handle
        that can be processed by `soundfile`.

    Returns
    -------
    sr : number > 0
        The sampling rate of the given audio file

    Examples
    --------
    Get the sampling rate for the included audio file

    >>> path = librosa.util.example_audio_file()
    >>> librosa.get_samplerate(path)
    44100
    '''
    try:
        header = sf.info(path)
    except RuntimeError:
        # soundfile could not read the file: fall back to audioread
        with audioread.audio_open(path) as fdesc:
            return fdesc.samplerate
    return header.samplerate
def get_duration(path: Pathlike, ) -> float:
    """
    Return the duration of an audio file or of a Kaldi-style pipe output.

    :param path: Path to an audio file or a Kaldi-style pipe (a command
        string ending with ``|``).
    :return: float duration of the recording, in seconds.
    :raises ValueError: if ``path`` is a pipe and ``kaldiio`` is not
        installed.
    """
    path = str(path)
    if path.strip().endswith("|"):
        if not is_module_available("kaldiio"):
            raise ValueError(
                "To read Kaldi's data dir where wav.scp has 'pipe' inputs, "
                "please 'pip install kaldiio' first.")
        from kaldiio import load_mat

        # Note: kaldiio.load_mat returns
        # (sampling_rate: int, samples: 1-D np.array[int])
        sampling_rate, samples = load_mat(path)
        assert len(samples.shape) == 1
        return samples.shape[0] / sampling_rate

    try:
        # Try to parse the file using pysoundfile first.
        import soundfile

        info = soundfile.info(path)
    except Exception:
        # Any import/decoding failure triggers the audioread fallback, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        info = audioread_info(path)
    return info.duration
def __init__(self, path="None", is_wav=False, stem_id=None, subset=None,
             chunk_start=0, chunk_duration=None):
    """Store the constructor arguments and load the file's metadata.

    ``info``, ``samples``, ``duration`` and ``rate`` are read with
    ``soundfile`` when ``is_wav`` is set and with ``stempeg`` otherwise.
    If ``path`` does not exist on disk, all four are set to ``None``.
    """
    self.path = path
    self.subset = subset
    self.stem_id = stem_id
    self.is_wav = is_wav
    self.chunk_start = chunk_start
    self.chunk_duration = chunk_duration

    # load and store metadata
    if not os.path.exists(self.path):
        # no file behind this path (fake file): nothing to describe
        self.info = self.samples = self.duration = self.rate = None
    elif self.is_wav:
        self.info = sf.info(self.path)
        self.samples = self.info.frames
        self.duration = self.info.duration
        self.rate = self.info.samplerate
    else:
        self.info = stempeg.Info(self.path)
        self.samples = int(self.info.samples(self.stem_id))
        self.duration = self.info.duration(self.stem_id)
        self.rate = self.info.rate(self.stem_id)

    self._audio = None
def sampling_rate(file: str) -> int:
    """Sampling rate of audio file.

    Files whose extension is in ``SNDFORMATS`` are read with ``soundfile``;
    everything else is queried through ``sox`` and, if that fails, through
    the ``mediainfo`` command line tool.

    Args:
        file: file name of input audio file

    Returns:
        sampling rate of audio file

    Raises:
        RuntimeError: if ``file`` is broken or not a supported format

    """
    file = audeer.safe_path(file)
    if file_extension(file) in SNDFORMATS:
        return soundfile.info(file).samplerate

    try:
        return int(sox.file_info.sample_rate(file))
    except sox.core.SoxiError:
        cmd = f'mediainfo --Inform="Audio;%SamplingRate%" "{file}"'
        reported = run(cmd)
        # An empty answer means mediainfo could not identify the file either.
        if not reported:
            raise RuntimeError(broken_file_error(file))
        return int(reported)
def channels(file: str) -> int:
    """Number of channels in audio file.

    Files whose extension is in ``SNDFORMATS`` are read with ``soundfile``;
    everything else is queried through ``sox`` and, if that fails, through
    two ``mediainfo`` queries tried in turn.

    Args:
        file: file name of input audio file

    Returns:
        number of channels in audio file

    Raises:
        RuntimeError: if ``file`` is broken or not a supported format

    """
    file = audeer.safe_path(file)
    if file_extension(file) in SNDFORMATS:
        return soundfile.info(file).channels

    try:
        return int(sox.file_info.channels(file))
    except sox.core.SoxiError:
        # For MP4 stored and returned number of channels can be different,
        # so the "original" count is asked for first.
        original_query = f'mediainfo --Inform="Audio;%Channel(s)_Original%" "{file}"'
        plain_query = f'mediainfo --Inform="Audio;%Channel(s)%" "{file}"'
        try:
            return int(run(original_query))
        except ValueError:
            try:
                return int(run(plain_query))
            except ValueError:
                raise RuntimeError(broken_file_error(file))
def add_random_background(self, label=None):
    """ Add a randomly chosen background file to the scaper object.

    Args:
        label: str or list, names of sub-folders of self.bg_path to pick
            from. A list has one entry drawn with self.random_state;
            None matches every sub-folder ("*").

    Raises:
        NotImplementedError: if label is neither None, a list nor a str.
    """
    if label is None:
        bg_label = "*"
    elif isinstance(label, list):
        bg_label = self.random_state.choice(label)
    elif isinstance(label, str):
        bg_label = label
    else:
        raise NotImplementedError(
            "Background label can only be a list of available labels or a string"
        )

    chosen_file = self._choose_file(osp.join(self.bg_path, bg_label))
    file_duration = sf.info(chosen_file).duration

    # Random start inside the file, capped so that self.duration seconds
    # still fit after it (0 when the file is shorter than that).
    random_start = self.random_state.rand() * file_duration
    latest_start = max(file_duration - self.duration, 0)
    starting_source = min(random_start, latest_start)

    # The label passed on is the name of the file's parent directory.
    parent_dir = chosen_file.split("/")[-2]
    self.add_background(
        label=("const", parent_dir),
        source_file=("const", chosen_file),
        source_time=("const", starting_source),
    )
def __init__(self, *args, **kwargs):
    """Constructor

    Parameters
    ----------
    data : optional
        Audio data itself.
        Default value "None"
    fs : int
        Target sampling frequency, if loaded audio does have different
        sampling frequency, audio will be re-sampled.
        Default value "44100"
    mono : bool
        Monophonic target, multi-channel audio will be down-mixed.
        Default value "True"
    filename : str, optional
        File path. When given, the format is detected from it and, for
        'wav', the header is read with soundfile into ``self.info``.
    logger : logger
        Logger class instance, If none given logger instance will be
        created
        Default value "None"

    """
    self.data = kwargs.get('data')  # Audio data itself
    self.filename = kwargs.get('filename')

    if self.filename:
        self.format = self.detect_file_format(self.filename)
        if self.format == 'wav':
            self.info = soundfile.info(file=self.filename)

    module_logger = logging.getLogger(__name__)
    self.logger = kwargs.get('logger', module_logger)
    # Install a default logging configuration when the logger has no
    # handler of its own.
    if not self.logger.handlers:
        logging.basicConfig()

    self.fs = kwargs.get('fs', 44100)
    self.mono = kwargs.get('mono', True)
def maps_sg_and_label_fn(wav_file):
    """Load the stored HCQT of a recording and build its frame labels.

    The HCQT is read from ``$maps_hcqt/<rec_name>.hcqt`` and the labels are
    generated from the ``.mid`` file next to ``wav_file``. Both are cut to
    the smaller of the audio frame count and the MIDI-derived frame count
    (plus 2) and returned read-only.

    Returns:
        dict with ``sg`` (float32 array, ``(num_frames, 440, 6)``) and
        ``label``.
    """
    header = soundfile.info(wav_file)
    assert header.samplerate == 44100
    n_frames = MiscFns.num_samples_to_num_frames_fn(header.frames)

    # Strip the 4-character extension to get the recording name.
    rec_name = os.path.basename(wav_file)[:-4]
    hcqt_path = os.path.join(os.environ['maps_hcqt'], rec_name + '.hcqt')
    stored_name, hcqt = MiscFns.load_np_array_from_file_fn(hcqt_path)
    assert stored_name == rec_name

    # The stored HCQT may hold one frame more than expected; in that case
    # its first frame is dropped.
    n_stored = hcqt.shape[0]
    assert n_stored in (n_frames, n_frames + 1)
    if n_stored > n_frames:
        hcqt = hcqt[1:]
    assert hcqt.shape == (n_frames, 440, 6) and hcqt.dtype == np.float32

    mid_file = wav_file[:-3] + 'mid'
    # MidiFile.length is scaled by the sample rate to get a sample count
    # (presumably seconds -> samples; confirm against mido).
    midi_length = mido.MidiFile(mid_file).length
    midi_samples = int(np.ceil(midi_length * header.samplerate))
    n_midi = MiscFns.num_samples_to_num_frames_fn(midi_samples) + 2

    n_frames = min(n_frames, n_midi)
    hcqt = hcqt[:n_frames]
    label = MiscFns.label_fn(mid_file_name=mid_file, num_frames=n_frames)

    hcqt = np.require(hcqt, dtype=np.float32, requirements=['O', 'C'])
    for array in (hcqt, label):
        array.flags['WRITEABLE'] = False

    return {'sg': hcqt, 'label': label}
def _vqt_without_shift_fn(self, wav_file, err_db=False):
    """Run MATLAB ``vqt_without_pitch_shift_fn`` on ``wav_file``.

    Args:
        wav_file: path passed to MATLAB and to ``soundfile.info``.
        err_db: when truthy, a second output is requested from MATLAB,
            asserted to be at least 290 and logged.

    Returns:
        C-contiguous, owned float32 array of shape ``(num_frames, 336, 2)``.
    """
    with matlab.engine.start_matlab(option='-nojvm -nodesktop') as engine:
        call_args = [
            'db_scale', False,
            'mono', False,
            'wav_file', wav_file
        ]
        try:
            if err_db:
                raw, err_db = engine.vqt_without_pitch_shift_fn(*call_args, nargout=2)
                assert err_db >= 290.
                logging.info('vqt accuracy - {} dB'.format(err_db))
            else:
                raw = engine.vqt_without_pitch_shift_fn(*call_args)
            # The MATLAB array is exposed flat; rebuild it column-major.
            coeffs = np.array(raw._data, dtype=np.float32).reshape(raw.size, order='F')
        except Exception as exc:
            # Dump the memory usage before propagating the failure.
            os.system('free -g')
            raise exc

    header = soundfile.info(wav_file)
    expected_sr = 44100
    assert header.samplerate == expected_sr

    # Two nested ceil-divisions of the frame count, by 64 and then by 22.
    expected_frames = int(np.ceil(np.ceil(header.frames / 64.) / 22.))
    assert coeffs.shape == (expected_frames, 336, 2)

    return np.require(coeffs, dtype=np.float32, requirements=['C', 'O'])
def get_duration(
    path: Pathlike,
) -> float:
    """
    Return the duration of an audio file or of a Kaldi-style pipe output.

    :param path: Path to an audio file or a Kaldi-style pipe (a command
        string ending with ``|``).
    :return: float duration of the recording, in seconds.
    :raises ValueError: if ``path`` is a pipe and ``kaldi_native_io`` is
        not installed.
    """
    path = str(path)
    if path.strip().endswith("|"):
        if not is_module_available("kaldi_native_io"):
            raise ValueError(
                "To read Kaldi's data dir where wav.scp has 'pipe' inputs, "
                "please 'pip install kaldi_native_io' first."
            )
        import kaldi_native_io

        wave = kaldi_native_io.read_wave(path)
        assert wave.data.shape[0] == 1, f"Expect 1 channel. Given {wave.data.shape[0]}"
        return wave.duration

    try:
        # Try to parse the file using pysoundfile first.
        import soundfile

        info = soundfile.info(path)
    except Exception:
        # Any import/decoding failure triggers the audioread fallback, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        info = audioread_info(path)
    return info.duration
def preprocess_source(self, source):
    """Read only the header of ``source`` and reset the sample buffer.

    Stores the header in ``self.audio_info`` and its sample rate in
    ``self.sample_rate``; the audio data itself is not read here.

    Returns:
        ``source`` unchanged.
    """
    header = soundfile.info(source)
    self.audio_info = header
    self.sample_rate = header.samplerate
    # Samples are filled in later, on the first read request.
    self.samples = []
    return source
def _setSndFromFile(self, filename):
    """Open ``filename`` with soundfile and derive the playback state.

    Sets ``sndFile``, ``sourceType``, ``sampleRate``, ``channels`` (only
    when it was ``-1``), ``t``, ``duration`` and ``durationFrames``. When
    ``preBuffer == -1`` the file is read completely, closed, and handed to
    ``_setSndFromArray``.
    """
    self.sndFile = f = sf.SoundFile(filename)
    self.sourceType = 'file'
    self.sampleRate = f.samplerate
    if self.channels == -1:  # if channels was auto then set to file val
        self.channels = f.channels
    fileDuration = sf.info(filename).duration

    # process start time
    if self.startTime and self.startTime > 0:
        startFrame = self.startTime * self.sampleRate
        self.sndFile.seek(int(startFrame))
        self.t = self.startTime
    else:
        self.t = 0

    # Audio actually left to play after the start offset.
    remainingDur = fileDuration - self.t

    # process stop time
    if self.stopTime and self.stopTime > 0:
        requestedDur = self.stopTime - self.t
        # Clamp against what remains after the offset, not the whole file;
        # otherwise a late start plus a stop beyond the end overestimates.
        self.duration = min(requestedDur, remainingDur)
    else:
        self.duration = remainingDur

    # can now calculate duration in frames
    self.durationFrames = int(round(self.duration * self.sampleRate))

    # are we preloading or streaming?
    if self.preBuffer == 0:
        # no buffer - stream from disk on each call to nextBlock
        pass
    elif self.preBuffer == -1:
        # full preload - read everything from the current position into an
        # array and continue from memory
        sndArr = self.sndFile.read(frames=len(self.sndFile))
        self.sndFile.close()
        self._setSndFromArray(sndArr)
def maps_sg_and_label_fn(wav_file):
    """
    Read the stored STFT spectrogram of a recording and generate its
    target labels.

    The spectrogram is read from ``$maps_stft/<rec_name>.stft`` and the
    labels are generated from the ``.mid`` file next to ``wav_file``. Both
    are cut to the smaller of the audio frame count and the MIDI-derived
    frame count (plus 2) and returned read-only.

    Returns:
        dict with ``sg`` (float32 array, ``(num_frames, 2817)``) and
        ``label``.
    """
    header = soundfile.info(wav_file)
    assert header.samplerate == 44100
    n_frames = MiscFns.num_samples_to_num_frames_fn(header.frames)

    # Strip the 4-character extension to get the recording name.
    rec_name = os.path.basename(wav_file)[:-4]
    stft_path = os.path.join(os.environ['maps_stft'], rec_name + '.stft')
    _stored_name, stft = MiscFns.load_np_array_from_file_fn(stft_path)
    assert stft.shape == (n_frames, 2817) and stft.dtype == np.float32

    mid_file = wav_file[:-3] + 'mid'
    # MidiFile.length is scaled by the sample rate to get a sample count
    # (presumably seconds -> samples; confirm against mido).
    midi_length = mido.MidiFile(mid_file).length
    midi_samples = int(np.ceil(midi_length * header.samplerate))
    n_midi = MiscFns.num_samples_to_num_frames_fn(midi_samples) + 2

    n_frames = min(n_frames, n_midi)
    stft = stft[:n_frames]
    label = MiscFns.label_fn(mid_file_name=mid_file, num_frames=n_frames)

    stft = np.require(stft, dtype=np.float32, requirements=['O', 'C'])
    for array in (stft, label):
        array.flags['WRITEABLE'] = False

    return {'sg': stft, 'label': label}
def readWav(inputSignalFile, selectedChannel=1, start=None, end=None) -> Signal:
    """ reads a wav file into a Signal.
    :param inputSignalFile: a path to the input signal file
    :param selectedChannel: the channel to read, counted from 1.
    :param start: the time to start reading from in HH:mm:ss.SSS format.
    :param end: the time to end reading from in HH:mm:ss.SSS format.
    :returns: Signal holding the samples of the selected channel.
    """

    def asFrames(time, fs):
        # Convert an HH:mm:ss.SSS string into a frame index at rate fs.
        hours, minutes, seconds = (time.split(":"))[-3:]
        millis = int((3600000 * int(hours))
                     + (60000 * int(minutes))
                     + (1000 * float(seconds)))
        return int(millis * (fs / 1000))

    import soundfile as sf
    if start is not None or end is not None:
        # The sample rate is needed up front to turn times into frames.
        info = sf.info(inputSignalFile)
        startFrame = 0 if start is None else asFrames(start, info.samplerate)
        endFrame = None if end is None else asFrames(end, info.samplerate)
        ys, frameRate = sf.read(inputSignalFile, start=startFrame, stop=endFrame)
    else:
        ys, frameRate = sf.read(inputSignalFile)

    # Multi-channel data comes back as (frames, channels): pick the column
    # of the requested channel. The previous `ys[::selectedChannel]` strided
    # over frames instead, dropping samples rather than selecting a channel.
    if ys.ndim > 1:
        ys = ys[:, selectedChannel - 1]
    return Signal(ys, frameRate)
def __init__(self, parent, dirList, fileList, scanMode, formats, sampleRates, channels, scanLimits, tag):
    """Create the import dialog from a directory scan or a list of files.

    With a non-empty ``dirList`` the scanning base class is initialised and
    ``tag`` becomes the default tag. Otherwise every path of ``fileList``
    whose header soundfile can read is added as a row to
    ``self.sampleModel``; unreadable paths are collected and skipped.

    NOTE(review): the file loop is placed in the ``else`` branch; the
    original layout was not recoverable from the source - confirm.
    """
    if dirList:
        ImportDialogScan.__init__(self, parent, dirList, scanMode, formats, sampleRates, channels, scanLimits)
        self.defaultTags = [tag]
    else:
        ImportDialog.__init__(self, parent)
        unknownFiles = []
        self.dirList = dirList
        for filePath in fileList:
            try:
                info = soundfile.info(filePath)
            except Exception:
                # Not a readable audio file; a bare `except:` would also
                # have swallowed KeyboardInterrupt / SystemExit.
                unknownFiles.append(filePath)
                continue

            fileInfo = QtCore.QFileInfo(filePath)

            fileItem = QtGui.QStandardItem(fileInfo.fileName())
            fileItem.setData(filePath, FilePathRole)
            fileItem.setData(info, InfoRole)
            fileItem.setToolTip(fileInfo.fileName())
            fileItem.setCheckable(True)
            fileItem.setCheckState(QtCore.Qt.Checked)

            dirItem = QtGui.QStandardItem(fileInfo.absolutePath())
            dirItem.setToolTip(fileInfo.absoluteFilePath())

            # Length column: frames / samplerate, three decimals.
            lengthItem = QtGui.QStandardItem('{:.3f}'.format(float(info.frames) / info.samplerate))
            formatItem = QtGui.QStandardItem(info.format)
            rateItem = QtGui.QStandardItem(str(info.samplerate))
            channelsItem = QtGui.QStandardItem(str(info.channels))
            subtypeItem = QtGui.QStandardItem(info.subtype)

            tagsItem = QtGui.QStandardItem()
            tagsItem.setData([tag], TagsRole)

            self.sampleModel.appendRow([fileItem, dirItem, lengthItem, formatItem, rateItem, channelsItem, subtypeItem, tagsItem])
def get_zip_manifest(
    zip_path: Path, zip_root: Optional[Path] = None, is_audio=False
):
    """Index the members of a ZIP archive by byte offset and length.

    Args:
        zip_path: path of the archive, relative to ``zip_root`` when that
            is given. This (unjoined) path is what appears in the manifest.
        zip_root: optional directory ``zip_path`` is joined onto.
        is_audio: when true, members are validated as audio and their
            length is the frame count reported by soundfile; otherwise
            they are loaded as ``.npy`` and the first dimension is used.

    Returns:
        ``(paths, lengths)``: two dicts keyed by the member's file stem.
        ``paths`` maps to ``"<zip_path>:<offset>:<size>"``, ``lengths`` to
        the member's length.
    """
    _zip_path = Path.joinpath(zip_root or Path(""), zip_path)
    with zipfile.ZipFile(_zip_path, mode="r") as zf:
        entries = zf.infolist()

    paths, lengths = {}, {}
    # Open the archive once for all raw reads instead of once per member.
    with open(_zip_path, "rb") as raw:
        for entry in tqdm(entries):
            utt_id = Path(entry.filename).stem
            # Data start = local header offset + 30 + name length.
            # NOTE(review): this assumes no extra field in the local header
            # and a stored (uncompressed) member - confirm against the
            # archive writer.
            offset = entry.header_offset + 30 + len(entry.filename)
            file_size = entry.file_size
            paths[utt_id] = f"{zip_path.as_posix()}:{offset}:{file_size}"

            raw.seek(offset)
            byte_data = raw.read(file_size)
            assert len(byte_data) > 1
            if is_audio:
                assert is_sf_audio_data(byte_data), entry
            else:
                assert is_npy_data(byte_data), entry

            byte_data_fp = io.BytesIO(byte_data)
            if is_audio:
                lengths[utt_id] = sf.info(byte_data_fp).frames
            else:
                lengths[utt_id] = np.load(byte_data_fp).shape[0]
    return paths, lengths
def main(args):
    """Write ``train.tsv`` and ``valid.tsv`` manifests for ``84-*`` files.

    Every file below ``args.root`` matching ``**/84-*.<args.ext>`` is listed
    as ``<path relative to root>\\t<length>``. The length is the frame count
    from soundfile for ``flac``/``wav`` files and the number of rows read by
    ``numpy.loadtxt`` for anything else. Each manifest starts with a line
    holding the resolved root directory.
    """
    assert 0 <= args.valid_percent <= 1.0

    dir_path = os.path.realpath(args.root)
    search_path = os.path.join(dir_path, "**/84-*." + args.ext)
    rand = random.Random(args.seed)

    train_path = os.path.join(args.dest, "train.tsv")
    valid_path = os.path.join(args.dest, "valid.tsv")
    with open(train_path, "w") as train_f, open(valid_path, "w") as valid_f:
        print(dir_path, file=train_f)
        print(dir_path, file=valid_f)

        for fname in glob.iglob(search_path, recursive=True):
            file_path = os.path.realpath(fname)

            if args.path_must_contain and args.path_must_contain not in file_path:
                continue

            if fname.split('.')[-1] in ('flac', 'wav'):
                frames = soundfile.info(fname).frames
            else:
                # process embedding
                import numpy
                frames = len(numpy.loadtxt(fname))

            # The length is computed before the draw in both cases.
            goes_to_train = rand.random() > args.valid_percent
            dest = train_f if goes_to_train else valid_f
            print(
                "{}\t{}".format(os.path.relpath(file_path, dir_path), frames), file=dest
            )
def stream(path, block_length, frame_length, hop_length,
           mono=True, offset=0.0, duration=None, fill_value=None,
           dtype=np.float32):
    '''Stream audio in fixed-length buffers.

    This is primarily useful for processing large files that won't
    fit entirely in memory at once.

    Instead of loading the entire audio signal into memory (as
    in `load()`, this function produces *blocks* of audio spanning
    a fixed number of frames at a specified frame length and hop
    length.

    While this function strives for similar behavior to `load`,
    there are a few caveats that users should be aware of:

        1. This function does not return audio buffers directly.
           It returns a generator, which you can iterate over
           to produce blocks of audio.  A *block*, in this context,
           refers to a buffer of audio which spans a given number of
           (potentially overlapping) frames.
        2. Automatic sample-rate conversion is not supported.
           Audio will be streamed in its native sample rate,
           so no default values are provided for `frame_length`
           and `hop_length`.  It is recommended that you first
           get the sampling rate for the file in question, using
           `get_samplerate()`, and set these parameters accordingly.
        3. Many analyses require access to the entire signal
           to behave correctly, such as `resample`, `cqt`, or
           `beat_track`, so these methods will not be appropriate
           for streamed data.
        4. The `block_length` parameter specifies how many frames
           of audio will be produced per block.  Larger values will
           consume more memory, but will be more efficient to process
           down-stream.  The best value will ultimately depend on your
           application and other system constraints.
        5. By default, most librosa analyses (e.g., short-time Fourier
           transform) assume centered frames, which requires padding the
           signal at the beginning and end.  This will not work correctly
           when the signal is carved into blocks, because it would
           introduce padding in the middle of the signal.  To disable
           this feature, use `center=False` in all frame-based analyses.

    See the examples below for proper usage of this function.

    Parameters
    ----------
    path : string, int, or file-like object
        path to the input file to stream.

        Any codec supported by `soundfile` is permitted here.

    block_length : int > 0
        The number of frames to include in each block.

        Note that at the end of the file, there may not be enough
        data to fill an entire block, resulting in a shorter block
        by default.  To pad the signal out so that blocks are always
        full length, set `fill_value` (see below).

    frame_length : int > 0
        The number of samples per frame.

    hop_length : int > 0
        The number of samples to advance between frames.

        Note that by when `hop_length < frame_length`, neighboring frames
        will overlap.  Similarly, the last frame of one *block* will
        overlap with the first frame of the next *block*.

    mono : bool
        Convert the signal to mono during streaming

    offset : float
        Start reading after this time (in seconds)

    duration : float
        Only load up to this much audio (in seconds)

    fill_value : float [optional]
        If padding the signal to produce constant-length blocks,
        this value will be used at the end of the signal.

        In most cases, `fill_value=0` (silence) is expected, but
        you may specify any value here.

    dtype : numeric type
        data type of audio buffers to be produced

    Yields
    ------
    y : np.ndarray
        An audio buffer of (at most)
        `(block_length - 1) * hop_length + frame_length` samples.

    Raises
    ------
    ParameterError
        If `block_length`, `frame_length` or `hop_length` is not a
        positive integer.  As this is a generator, the check runs when
        the first block is requested.

    See Also
    --------
    load
    get_samplerate
    soundfile.blocks

    Examples
    --------
    Apply a short-term Fourier transform to blocks of 256 frames
    at a time.  Note that streaming operation requires left-aligned
    frames, so we must set `center=False` to avoid padding artifacts.

    >>> filename = librosa.util.example_audio_file()
    >>> sr = librosa.get_samplerate(filename)
    >>> stream = librosa.stream(filename,
    ...                         block_length=256,
    ...                         frame_length=4096,
    ...                         hop_length=1024)
    >>> for y_block in stream:
    ...     D_block = librosa.stft(y_block, center=False)

    Or compute a mel spectrogram over a stream, using a shorter frame
    and non-overlapping windows

    >>> filename = librosa.util.example_audio_file()
    >>> sr = librosa.get_samplerate(filename)
    >>> stream = librosa.stream(filename,
    ...                         block_length=256,
    ...                         frame_length=2048,
    ...                         hop_length=2048)
    >>> for y_block in stream:
    ...     m_block = librosa.feature.melspectrogram(y_block, sr=sr,
    ...                                              n_fft=2048,
    ...                                              hop_length=2048,
    ...                                              center=False)

    '''

    # The offending value is interpolated into each message; previously the
    # `{}` placeholder was emitted literally.
    if not (np.issubdtype(type(block_length), np.integer) and block_length > 0):
        raise ParameterError('block_length={} must be a positive integer'.format(block_length))

    if not (np.issubdtype(type(frame_length), np.integer) and frame_length > 0):
        raise ParameterError('frame_length={} must be a positive integer'.format(frame_length))

    if not (np.issubdtype(type(hop_length), np.integer) and hop_length > 0):
        raise ParameterError('hop_length={} must be a positive integer'.format(hop_length))

    # Get the sample rate from the file info
    sr = sf.info(path).samplerate

    # Construct the stream
    if offset:
        start = int(offset * sr)
    else:
        start = 0

    if duration:
        frames = int(duration * sr)
    else:
        frames = -1

    # One block holds `block_length` frames: the first frame in full plus
    # one hop for each further frame.  Consecutive blocks overlap by
    # `frame_length - hop_length` samples.
    blocks = sf.blocks(path,
                       blocksize=frame_length + (block_length - 1) * hop_length,
                       overlap=frame_length - hop_length,
                       fill_value=fill_value,
                       start=start,
                       frames=frames,
                       dtype=dtype,
                       always_2d=False)

    for block in blocks:
        if mono:
            yield to_mono(block.T)
        else:
            yield block.T
def get_duration(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
                 center=True, filename=None):
    """Compute the duration (in seconds) of an audio time series,
    feature matrix, or filename.

    Examples
    --------
    >>> # Load the example audio file
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> librosa.get_duration(y=y, sr=sr)
    61.45886621315193

    >>> # Or directly from an audio file
    >>> librosa.get_duration(filename=librosa.util.example_audio_file())
    61.4

    >>> # Or compute duration from an STFT matrix
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> S = librosa.stft(y)
    >>> librosa.get_duration(S=S, sr=sr)
    61.44

    >>> # Or a non-centered STFT matrix
    >>> S_left = librosa.stft(y, center=False)
    >>> librosa.get_duration(S=S_left, sr=sr)
    61.3471201814059

    Parameters
    ----------
    y : np.ndarray [shape=(n,), (2, n)] or None
        audio time series

    sr : number > 0 [scalar]
        audio sampling rate of `y`

    S : np.ndarray [shape=(d, t)] or None
        STFT matrix, or any STFT-derived matrix (e.g., chromagram
        or mel spectrogram).
        Durations calculated from spectrogram inputs are only accurate
        up to the frame resolution. If high precision is required,
        it is better to use the audio time series directly.

    n_fft : int > 0 [scalar]
        FFT window size for `S`

    hop_length : int > 0 [scalar]
        number of audio samples between columns of `S`

    center : boolean
        - If `True`, `S[:, t]` is centered at `y[t * hop_length]`
        - If `False`, then `S[:, t]` begins at `y[t * hop_length]`

    filename : str
        If provided, all other parameters are ignored, and the
        duration is calculated directly from the audio file.
        Note that this avoids loading the contents into memory,
        and is therefore useful for querying the duration of
        long files.

        As in `load()`, this can also be an integer or open file-handle
        that can be processed by `soundfile`.

    Returns
    -------
    d : float >= 0
        Duration (in seconds) of the input time series or spectrogram.
        A spectrogram with zero frames (`S.shape[1] == 0`) has a
        duration of `0.0`.

    Raises
    ------
    ParameterError
        if none of `y`, `S`, or `filename` are provided.

    Notes
    -----
    `get_duration` can be applied to a file (`filename`), a spectrogram (`S`),
    or audio buffer (`y, sr`).  Only one of these three options should be
    provided.  If you do provide multiple options (e.g., `filename` and `S`),
    then `filename` takes precedence over `S`, and `S` takes precedence over
    `(y, sr)`.
    """
    if filename is not None:
        try:
            return sf.info(filename).duration
        except RuntimeError:
            # sf.info raised RuntimeError for this input; fall back to
            # audioread to obtain the duration instead.
            with audioread.audio_open(filename) as fdesc:
                return fdesc.duration

    if y is None:
        if S is None:
            raise ParameterError('At least one of (y, sr), S, or filename must be provided')

        n_frames = S.shape[1]

        if n_frames == 0:
            # No frames means no audio.  Without this guard the formula
            # below evaluates to `n_fft - hop_length` (and `-hop_length`
            # when centered), which can produce a negative duration.
            n_samples = 0
        else:
            n_samples = n_fft + hop_length * (n_frames - 1)

            # If centered, we lose half a window from each end of S
            if center:
                n_samples = n_samples - 2 * int(n_fft // 2)

    else:
        # Validate the audio buffer.  Stereo is okay here.
        util.valid_audio(y, mono=False)

        # The last axis holds the samples for both mono (n,) and
        # multi-channel (channels, n) buffers.
        n_samples = y.shape[-1]

    return float(n_samples) / sr