def get_audio():
    display(HTML(AUDIO_HTML))
    data = eval_js("data")
    binary = b64decode(data.split(',')[1])
    process = (ffmpeg
               .input('pipe:0')
               .output('pipe:1', format='wav')
               .run_async(pipe_stdin=True, pipe_stdout=True,
                          pipe_stderr=True, quiet=True,
                          overwrite_output=True))
    output, err = process.communicate(input=binary)
    riff_chunk_size = len(output) - 8
    # Break up the chunk size into four little-endian bytes, held in b.
    q = riff_chunk_size
    b = []
    for i in range(4):
        q, r = divmod(q, 256)
        b.append(r)
    # Replace bytes 4:8 in the ffmpeg output with the actual RIFF chunk
    # size, since ffmpeg cannot seek back in a pipe to patch the header.
    riff = output[:4] + bytes(b) + output[8:]
    sr, audio = wav_read(io.BytesIO(riff))
    return audio, sr

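# The divmod loop above hand-packs the RIFF chunk size as four
# little-endian bytes. A minimal equivalent sketch using int.to_bytes
# (patch_riff_size is my name; `output` is assumed to be the raw wav
# bytes produced by ffmpeg, as in get_audio):
def patch_riff_size(output: bytes) -> bytes:
    # RIFF stores the overall chunk size at bytes 4:8 as a little-endian
    # uint32; to_bytes produces the same four bytes as the divmod loop.
    return output[:4] + (len(output) - 8).to_bytes(4, "little") + output[8:]
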
def load(path=None, subsample=1):
    if path is None:
        path = os.environ["DATASET_PATH"]
    audiomnist.download(path)
    t0 = time.time()

    # load wavs
    f = zipfile.ZipFile(path + "audiomnist/data.zip")
    wavs = list()
    digits = list()
    speakers = list()
    N = 0
    for filename in tqdm(f.namelist(), ascii=True):
        if ".wav" not in filename:
            continue
        filename_end = filename.split("/")[-1]
        digits.append(int(filename_end.split("_")[0]))
        speakers.append(int(filename_end.split("_")[1]) - 1)
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        wavs.append(wav_read(byt)[1].astype("float32")[::subsample])
        N = max(N, len(wavs[-1]))
    digits = np.array(digits)
    speakers = np.array(speakers)

    # center-pad every waveform to the longest length N
    all_wavs = np.zeros((len(wavs), N))
    for i in range(len(wavs)):
        left = (N - len(wavs[i])) // 2
        all_wavs[i, left:left + len(wavs[i])] = wavs[i]
    print("Audio-MNIST loaded in {} s.".format(time.time() - t0))
    return all_wavs, digits, speakers

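# The padding loop above centers each clip in a fixed-length buffer.
# A hedged per-clip equivalent using np.pad (center_pad and its
# arguments are hypothetical names):
import numpy as np

def center_pad(w, N):
    # Zero-pad w so it sits centered in a length-N vector, matching
    # left = (N - len(w)) // 2 in the loop above.
    left = (N - len(w)) // 2
    return np.pad(w, (left, N - len(w) - left))
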
def load(path=None):
    if path is None:
        path = os.environ['DATASET_PATH']
    irmas.download(path)
    t0 = time.time()
    train_wavs = list()
    train_labels = list()
    test_wavs = list()
    test_labels = list()

    # loading the training set
    f = zipfile.ZipFile(path + 'irmas/IRMAS-TrainingData.zip')
    namelist = f.namelist()
    for filename in tqdm(namelist, ascii=True):
        if '.wav' not in filename:
            continue
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        train_wavs.append(wav_read(byt)[1].astype('float32'))
        train_labels.append(filename.split('/')[-2])

    # loading the testing set (three zip parts); each wav comes with a
    # .txt annotation holding the instrument label
    base = 'irmas/IRMAS-TestingData-Part{}.zip'
    for part in ['1', '2', '3']:
        f = zipfile.ZipFile(path + base.format(part))
        namelist = f.namelist()
        for filename in tqdm(namelist, ascii=True):
            if '.wav' not in filename:
                continue
            byt = io.BytesIO(f.read(filename))
            test_wavs.append(wav_read(byt)[1].astype('float32'))
            byt = io.BytesIO(f.read(filename.replace('.wav', '.txt')))
            test_labels.append(np.loadtxt(byt, dtype='str')[0])

    # combine train and test and encode the string categories as integers
    wavs = np.array(train_wavs + test_wavs)
    categories = np.array(train_labels + test_labels)
    labels = np.zeros(len(categories))
    for i, c in enumerate(np.unique(categories)):
        labels[categories == c] = i
    print('Dataset IRMAS loaded in {0:.2f}s.'.format(time.time() - t0))
    return wavs, labels, categories

def load_audio(
    path: Path,
    channel: Optional[int] = None,
    mmap: bool = False,
    channel_names: List[str] = ["left", "right"],
) -> MultiTrack:
    """load waveform from file"""
    multiTrack = MultiTrack()
    assert 0 < len(channel_names) <= 2
    try:
        fs, value = wav_read(path, mmap=mmap)
    except ValueError:
        try:
            import soundfile as sf
            value, fs = sf.read(path, dtype="int16")
        except ImportError:
            logging.error(
                f"Scipy was unable to import {path}, "
                f"try installing the soundfile python package for more compatibility")
            raise
    except RuntimeError:
        raise RuntimeError(f"Unable to import audio file {path}")

    if value.ndim == 1:
        if channel is not None and channel != 0:
            raise MultiChannelError(
                f"cannot select channel {channel} from monaural file {path}")
        multiTrack[channel_names[0]] = Wave(value[:, np.newaxis], fs, path=path)
    if value.ndim == 2:
        if channel is None:
            multiTrack[channel_names[0]] = Wave(value[:, 0], fs, path=path)
            multiTrack[channel_names[1]] = Wave(value[:, 1], fs, path=path)
        else:
            try:
                multiTrack[channel_names[channel]] = Wave(value[:, channel],
                                                          fs, path=path)
            except IndexError:
                raise MultiChannelError(
                    f"cannot select channel {channel} from file "
                    f"{path} with {value.shape[1]} channels")

    # record the representable amplitude range for each track's dtype
    for k in multiTrack.keys():
        value = multiTrack[k].value
        if np.issubdtype(value.dtype, np.integer):
            multiTrack[k].min = np.iinfo(value.dtype).min
            multiTrack[k].max = np.iinfo(value.dtype).max
        elif np.issubdtype(value.dtype, np.floating):
            multiTrack[k].min = -1.0
            multiTrack[k].max = 1.0
        else:
            logging.error(f"Wave dtype {value.dtype} not supported")
            raise NotImplementedError
    return multiTrack

def load(path=None):
    """
    Parameters
    ----------

    path: str (optional)
        default ($DATASET_PATH), the path to look for the data and
        where the data will be downloaded if not present

    Returns
    -------

    wavs: array
        the waveforms in the time amplitude domain

    labels: array
        binary values representing the presence or absence of an avian

    flag: array
        the Xeno-Canto ID
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, _dataset, _urls, extract=True)
    t0 = time.time()
    archive = zipfile.ZipFile(path + "picidae/PicidaeDataset.zip")
    wavs = list()
    labels = list()
    XC = list()
    for item in tqdm(archive.namelist(), ascii=True):
        # skip non-wav entries and macOS resource forks ("._" files)
        if item[-4:] == ".wav" and "._" not in item:
            wavfile = archive.read(item)
            byt = io.BytesIO(wavfile)
            wavs.append(wav_read(byt)[1].astype("float32"))
            labels.append(item.split("/")[1])
            XC.append(item.split("/")[2].split("-")[0])
    labels = np.array(labels)
    unique = np.unique(labels)
    y = np.zeros(len(labels), dtype="int32")
    for k, name in enumerate(np.sort(unique)):
        y[labels == name] = k
    data = {
        "wavs": wavs,
        "labels": y,
        "names": labels,
        "XC_identifiers": XC,
        "DOC": DOC,
    }
    print("Dataset picidae loaded in {0:.2f}s.".format(time.time() - t0))
    return data

def load(path=None):
    """
    Parameters
    ----------

    path: str (optional)
        default ($DATASET_PATH), the path to look for the data and
        where the data will be downloaded if not present

    Returns
    -------

    wavs: array
        the waveforms in the time amplitude domain

    labels: array
        binary values representing the presence or absence of an avian

    recording: array
        the file number from which the sample has been extracted
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    birdvox_dcase_20k.download(path)
    t0 = time.time()

    # Loading the file
    basefile = path + "birdvox_dcase_20k/BirdVox-DCASE-20k.zip"
    wavs = list()
    labels = np.loadtxt(
        path + "birdvox_dcase_20k/data_labels.csv",
        skiprows=1,
        delimiter=",",
        dtype="str",
    )
    wav_names = list(labels[:, 0])
    wav_labels = labels[:, 2].astype("int")
    labels = list()
    f = zipfile.ZipFile(basefile)
    for name in tqdm(f.namelist(), ascii=True):
        filename = name.split("/")[-1][:-4]
        if ".wav" not in name or filename not in wav_names:
            continue
        byt = io.BytesIO(f.read(name))
        wavs.append(wav_read(byt)[1].astype("float32"))
        labels.append(wav_labels[wav_names.index(filename)])
    wavs = np.array(wavs).astype("float32")
    labels = np.array(labels).astype("int32")
    print("Dataset birdvox_dcase_20k loaded in {0:.2f}s.".format(time.time() - t0))
    return wavs, labels

def load(path=None, classes=range(10)):
    if path is None:
        path = os.environ["DATASET_PATH"]
    sonycust.download(path)
    t0 = time.time()

    # Loading the files
    files = tarfile.open(path + "ust/audio-dev.tar.gz", "r:gz")
    annotations = np.loadtxt(path + "ust/annotations-dev.csv",
                             delimiter=",", skiprows=1, dtype="str")

    # get names
    filenames = list(annotations[:, 2])
    for i in range(len(filenames)):
        filenames[i] = annotations[i, 0] + "/" + str(filenames[i])

    # get fine labels and limits for coarse classes
    fine_labels = annotations[:, 4:33].astype("float32").astype("int32")
    class_limits = [0, 4, 9, 10, 14, 19, 23, 28, 29]
    n_classes = len(class_limits) - 1
    n_samples = len(annotations)
    llabels = np.zeros((n_samples, n_classes), dtype="int")
    for k in range(n_classes):
        # a coarse class is present if any of its fine classes is
        block = fine_labels[:, class_limits[k]:class_limits[k + 1]]
        llabels[:, k] = block.max(1)

    wavs = np.zeros((2794, 441000), dtype="float32")
    coarse = np.zeros((2794, 8), dtype="int32")
    fine = np.zeros((2794, 29), dtype="int32")
    filenames = files.getnames()
    cpt = 0
    for name in tqdm(filenames, ascii=True):
        if ".wav" not in name:
            continue
        wavs[cpt] = wav_read(files.extractfile(name))[1].astype("float32")
        coarse[cpt] = llabels[filenames.index(name)]
        fine[cpt] = fine_labels[filenames.index(name)]
        cpt += 1
    return wavs, fine, coarse

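# The block-max loop above collapses the 29 fine classes into 8 coarse
# ones along class_limits. A hedged vectorized equivalent with
# np.maximum.reduceat (a dummy fine_labels stands in for the annotation
# matrix):
import numpy as np

fine_labels = np.random.randint(0, 2, size=(5, 29))  # dummy (n_samples, 29)
class_limits = [0, 4, 9, 10, 14, 19, 23, 28, 29]
# reduceat takes the max over each [limit_k, limit_{k+1}) column block,
# yielding the (n_samples, 8) coarse presence matrix, same as the loop.
coarse = np.maximum.reduceat(fine_labels, class_limits[:-1], axis=1)
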
def load(path=None):
    """
    Parameters
    ----------

    path: str (optional)
        a string where to load the data and download if not present

    Returns
    -------

    singers: list
        the list of singers as strings, 11 males and 9 females as in
        male1, male2, ...

    genders: list
        the list of genders of the singers as in male, male, female, ...

    vowels: list
        the vowels being pronounced

    data: list
        the list of waveforms, not all of equal length
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    vocalset.download(path)
    t = time.time()

    # load wavs
    f = zipfile.ZipFile(path + "vocalset/VocalSet11.zip")

    # init. the data arrays
    singers = []
    genders = []
    vowels = []
    # techniques = []
    data = []
    for filename in tqdm(f.namelist(), ascii=True):
        if ".wav" not in filename or "excerpts" in filename or filename[0] == "_":
            continue
        vowel = filename[-5]
        if vowel not in ["a", "e", "i", "o", "u"]:
            continue
        vowels.append(vowel)
        bytes_ = io.BytesIO(f.read(filename))
        data.append(wav_read(bytes_)[1].astype("float32"))
        split = filename.split("/")
        genders.append("".join(x for x in split[1] if x.isalpha()))
        singers.append(split[1])
        # techniques.append(split[-1][3:-6])
    return singers, genders, vowels, data

def record_audio():
    wav_file = 'chunk.wav'
    run([
        'arecord',
        '-D', f'hw:{config.audio.device}',
        '-f', f'{config.audio.format}',
        '-r', f'{config.audio.sample_rate}',
        '-c', f'{config.audio.channels}',
        '-d', f'{config.audio.chunk_duration}',
        wav_file
    ], check=True)
    sample_rate, data = wav_read(wav_file)
    return data

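# record_audio relies on a global `config` object and on arecord being
# installed. A minimal sketch of such a config with hypothetical values,
# using types.SimpleNamespace:
from types import SimpleNamespace

config = SimpleNamespace(audio=SimpleNamespace(
    device="1,0",        # ALSA card,device pair, used as arecord -D hw:1,0
    format="S16_LE",     # 16-bit little-endian PCM
    sample_rate=16000,
    channels=1,
    chunk_duration=5,    # seconds, passed to arecord -d
))

data = record_audio()  # records 5 s into chunk.wav and returns the samples
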
def load(path=None):
    if path is None:
        path = os.environ["DATASET_PATH"]
    speech_commands.download(path)
    t0 = time.time()
    print("Loading speech command")
    tar = tarfile.open(
        path + "speech_commands/speech_commands_v0.01.tar.gz", "r:gz"
    )

    # Load train set
    wavs = list()
    labels = list()
    noises = list()
    noise_labels = list()
    names = tar.getmembers()
    for name in tqdm(names, ascii=True):
        if "wav" not in name.name:
            continue
        f = tar.extractfile(name.name)
        wav = wav_read(f)[1]
        if "noise" in name.name:
            noises.append(wav)
            noise_labels.append(name.name.split("/")[-1])
        else:
            # zero-pad each utterance symmetrically to 1 s at 16 kHz
            left = 16000 - len(wav)
            to_pad = left // 2
            wavs.append(np.pad(wav, [[to_pad, left - to_pad]]))
            labels.append(name.name.split("/")[-2])
    labels = np.array(labels)
    unique_labels = np.unique(labels)
    y = np.squeeze(
        np.array(
            [np.nonzero(label == unique_labels)[0] for label in labels]
        ).astype("int32")
    )
    data = {
        "wavs": np.array(wavs).astype("float32"),
        "labels": y,
        "names": labels,
        "noises": noises,
        "noises_labels": noise_labels,
        "INFOS": speech_commands.__doc__,
    }
    print("Dataset speech commands loaded in {0:.2f}s.".format(time.time() - t0))
    return data

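# The label-encoding comprehension above can be written more directly
# with np.unique's return_inverse, which yields the same integer codes
# (indices into the sorted unique labels):
import numpy as np

labels = np.array(["yes", "no", "yes", "up"])
unique_labels, y = np.unique(labels, return_inverse=True)
print(unique_labels)  # ['no' 'up' 'yes']
print(y)              # [2 0 2 1]
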
def get_wav_duration(file: str) -> float:
    """
    Calc duration of wave file
    :param file: file path
    :return: wave duration in seconds, or -1 if the file cannot be read
    """
    try:
        sr, wav = wav_read(file)
        dur = len(wav) / sr
    except Exception:
        dur = -1
    return dur

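# When only the duration is needed, the samples do not have to be
# decoded at all. A hedged alternative using the stdlib wave module
# (PCM WAV files only; the function name is mine):
import contextlib
import wave

def get_wav_duration_header(file: str) -> float:
    # Read only the WAV header: frames / framerate gives seconds
    # without loading the sample data into memory.
    with contextlib.closing(wave.open(file, "rb")) as w:
        return w.getnframes() / w.getframerate()
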
def read_wav(cls, path, channel=None, mmap=False):
    """load waveform from file"""
    try:
        fs, value = wav_read(path, mmap=mmap)
    except ValueError:
        try:
            import soundfile as sf
            value, fs = sf.read(path, dtype="int16")
        except ImportError:
            logging.error(
                f"Scipy was unable to import {path}, "
                f"try installing the soundfile python package for more compatibility")
            raise
    except RuntimeError:
        raise RuntimeError(f"Unable to import audio file {path}")

    if value.ndim == 1:
        if channel is not None and channel != 0:
            raise MultiChannelError(
                f"cannot select channel {channel} from monaural file {path}"
            )
    if value.ndim == 2:
        if channel is None:
            raise MultiChannelError(
                f"must select channel when loading file {path} "
                f"with {value.shape[1]} channels"
            )
        try:
            value = value[:, channel]
        except IndexError:
            raise MultiChannelError(
                f"cannot select channel {channel} from file "
                f"{path} with {value.shape[1]} channels")

    wav = Wave(value, fs, path=path)
    # record the representable amplitude range for the sample dtype
    if value.dtype == numpy.dtype(numpy.int16):
        wav.min = -32768
        wav.max = 32767
    elif value.dtype == numpy.dtype(numpy.int32):
        wav.min = -2147483648
        wav.max = 2147483647
    elif value.dtype == numpy.dtype(numpy.uint8):
        wav.min = 0
        wav.max = 255
    elif value.dtype in set(
            [numpy.dtype(numpy.float64), numpy.dtype(numpy.float32)]):
        wav.max = 1.0
        wav.min = -1.0
    else:
        logging.error(f"Wave dtype {value.dtype} not supported")
        raise NotImplementedError
    return wav

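# The dtype ladder above can be collapsed with numpy.iinfo, which
# supplies the exact integer ranges (int16 spans -32768..32767).
# A hedged sketch (dtype_range is my name):
import numpy

def dtype_range(dtype):
    # Integer dtypes: exact representable range from iinfo; float WAVs
    # are conventionally normalized to [-1.0, 1.0].
    if numpy.issubdtype(dtype, numpy.integer):
        info = numpy.iinfo(dtype)
        return info.min, info.max
    if numpy.issubdtype(dtype, numpy.floating):
        return -1.0, 1.0
    raise NotImplementedError(f"Wave dtype {dtype} not supported")

print(dtype_range(numpy.dtype(numpy.int16)))  # (-32768, 32767)
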
def load(path=None):
    """music genre classification

    This dataset was used for the well-known paper in genre classification
    "Musical genre classification of audio signals" by G. Tzanetakis and
    P. Cook in IEEE Transactions on Audio and Speech Processing 2002.

    Unfortunately the database was collected gradually and very early on
    in my research so I have no titles (and obviously no copyright
    permission etc). The files were collected in 2000-2001 from a variety
    of sources including personal CDs, radio, microphone recordings, in
    order to represent a variety of recording conditions. Nevertheless I
    have been providing it to researchers upon request mainly for
    comparison purposes etc. Please contact George Tzanetakis
    ([email protected]) if you intend to publish experimental results using
    this dataset.

    There are some practical and conceptual issues with this dataset,
    described in "The GTZAN dataset: Its contents, its faults, their
    effects on evaluation, and its future use" by B. Sturm on arXiv 2013.
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, "gtzan", _urls)
    print("Loading gtzan")
    t0 = time.time()
    tar = tarfile.open(path + "gtzan/genres.tar.gz", "r:gz")

    # Load train set
    train_songs = list()
    train_labels = list()
    names = tar.getmembers()
    for name in tqdm(names, ascii=True, total=1000):
        if "wav" not in name.name:
            continue
        f = tar.extractfile(name.name)
        train_songs.append(wav_read(f)[1])
        t = name.name.split("/")[1]
        train_labels.append(gtzan.name2class[t])

    # truncate every song to the shortest length so they stack
    N = np.min([len(w) for w in train_songs])
    train_songs = [w[:N] for w in train_songs]
    train_songs = np.stack(train_songs).astype("float32")
    train_labels = np.array(train_labels).astype("int32")
    print("Dataset gtzan loaded in {0:.2f}s.".format(time.time() - t0))
    data = {"wavs": train_songs, "labels": train_labels}
    return data

def load(path=None):
    """
    Parameters
    ----------

    path: str (optional)
        default ($DATASET_PATH), the path to look for the data and
        where the data will be downloaded if not present

    Returns
    -------

    wavs: array
        the waveforms in the time amplitude domain

    labels: array
        binary values representing the presence or absence of an avian

    flag: array
        the Xeno-Canto ID
    """
    if path is None:
        path = os.environ['DATASET_PATH']
    picidae.download(path)
    t0 = time.time()
    archive = zipfile.ZipFile(path + 'picidae/PicidaeDataset.zip')
    wavs = list()
    labels = list()
    XC = list()
    for item in tqdm(archive.namelist(), ascii=True):
        # skip non-wav entries and macOS resource forks ("._" files)
        if item[-4:] == '.wav' and '._' not in item:
            wavfile = archive.read(item)
            byt = io.BytesIO(wavfile)
            wavs.append(wav_read(byt)[1].astype('float32'))
            labels.append(item.split('/')[1])
            XC.append(item.split('/')[2].split('-')[0])
    labels = np.array(labels)
    unique = np.unique(labels)
    y = np.zeros(len(labels), dtype='int32')
    for k, name in enumerate(np.sort(unique)):
        y[labels == name] = k
    print('Dataset picidae loaded in {0:.2f}s.'.format(time.time() - t0))
    return wavs, y, labels, XC

def load(path=None):
    """digit recognition

    https://github.com/soerenab/AudioMNIST

    A simple audio/speech dataset consisting of recordings of spoken
    digits in wav files at 8kHz. The recordings are trimmed so that they
    have near minimal silence at the beginnings and ends.

    FSDD is an open dataset, which means it will grow over time as data
    is contributed. In order to enable reproducibility and accurate
    citation the dataset is versioned using Zenodo DOI as well as git tags.

    Current status
        - 4 speakers
        - 2,000 recordings (50 of each digit per speaker)
        - English pronunciations
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, _dataset, _urls)
    t0 = time.time()

    # load wavs
    f = zipfile.ZipFile(os.path.join(path, _dataset, "data.zip"))
    wavs = list()
    digits = list()
    speakers = list()
    N = 0
    for filename in tqdm(f.namelist(), ascii=True):
        if ".wav" not in filename:
            continue
        filename_end = filename.split("/")[-1]
        digits.append(int(filename_end.split("_")[0]))
        speakers.append(int(filename_end.split("_")[1]) - 1)
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        wavs.append(wav_read(byt)[1].astype("float32"))
        N = max(N, len(wavs[-1]))
    digits = np.array(digits)
    speakers = np.array(speakers)

    # center-pad every waveform to the longest length N
    all_wavs = np.zeros((len(wavs), N))
    for i in range(len(wavs)):
        left = (N - len(wavs[i])) // 2
        all_wavs[i, left:left + len(wavs[i])] = wavs[i]
    print("Audio-MNIST loaded in {} s.".format(time.time() - t0))
    return all_wavs, digits, speakers

def load(path=None, subsample=1):
    """ESC 50.

    https://github.com/karolpiczak/ESC-50#download

    Parameters
    ----------

    path: str (optional)
        default ($DATASET_PATH), the path to look for the data and
        where the data will be downloaded if not present
    """
    if path is None:
        path = os.environ['DATASET_PATH']
    esc50.download(path)
    t0 = time.time()
    f = zipfile.ZipFile(path + 'esc50/master.zip')

    # parse the meta file: filename, fold, fine label, category, ...
    meta = np.loadtxt(io.BytesIO(f.read('ESC-50-master/meta/esc50.csv')),
                      delimiter=',', skiprows=1, dtype='str')
    filenames = list(meta[:, 0])
    folds = meta[:, 1].astype('int32')
    fine_labels = meta[:, 2].astype('int32')
    categories = meta[:, 3]
    coarse_labels = np.array([esc50.fine_to_coarse[c]
                              for c in categories]).astype('int32')
    wavs = list()
    order = list()
    N = 0
    for filename in f.namelist():
        if '.wav' not in filename:
            continue
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        wavs.append(wav_read(byt)[1].astype('float32')[::subsample])
        order.append(filenames.index(filename.split('/')[-1]))
        N = max(N, len(wavs[-1]))

    # center-pad, placing each wav at its row given by the meta-file order
    all_wavs = np.zeros((len(wavs), N))
    for i in range(len(wavs)):
        left = (N - len(wavs[i])) // 2
        all_wavs[order[i], left:left + len(wavs[i])] = wavs[i]
    return all_wavs, fine_labels, coarse_labels, categories

def load(path=None):
    """Binary audio classification, presence or absence of a bird.

    `Warblr <http://machine-listening.eecs.qmul.ac.uk/bird-audio-detection-challenge/#downloads>`_
    comes from a UK bird-sound crowdsourcing research spinout called
    Warblr. From this initiative we have 10,000 ten-second smartphone
    audio recordings from around the UK. The audio totals around 44 hours
    duration. The audio will be published by Warblr under a Creative
    Commons licence. The audio covers a wide distribution of UK locations
    and environments, and includes weather noise, traffic noise, human
    speech and even human bird imitations. It is directly representative
    of the data that is collected from a mobile crowdsourcing initiative.

    Load the data given a path
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, _name, _urls)

    # Load the dataset (download if necessary) and set the class attributes.
    print("Loading warblr")
    t = time.time()

    # Loading labels
    labels = np.loadtxt(
        path + "warblr/warblrb10k_public_metadata.csv",
        delimiter=",",
        skiprows=1,
        dtype="str",
    )

    # Loading the files
    f = zipfile.ZipFile(path + "warblr/warblrb10k_public_wav.zip")
    wavs = list()
    for i, files_ in tqdm(enumerate(labels), ascii=True):
        wavfile = f.read("wav/" + files_[0] + ".wav")
        byt = io.BytesIO(wavfile)
        wavs.append(np.expand_dims(wav_read(byt)[1].astype("float32"), 0))
    labels = labels[:, 1].astype("int32")
    print("Dataset warblr loaded in", "{0:.2f}".format(time.time() - t), "s.")
    dataset = {"wavs": wavs, "labels": labels}
    return dataset

def __init__(self, wavfile, fs, windowlen, slidelen, fft_n=512, mel_n=25, p=13):
    # accept either a path to a wav file or an in-memory signal
    if isinstance(wavfile, str):
        self.fs, self.signal = wav_read(wavfile)
    else:
        self.fs = fs
        self.signal = np.asarray(wavfile)
    self.windowlen = windowlen
    self.slidelen = slidelen
    self.fft_n = fft_n
    self.mel_n = mel_n
    self.p = p

def load(path=None, classes=range(10)):
    if path is None:
        path = os.environ['DATASET_PATH']
    download(path)
    t0 = time.time()

    # Loading the files
    files = tarfile.open(path + 'ust/audio-dev.tar.gz', 'r:gz')
    annotations = np.loadtxt(path + 'ust/annotations-dev.csv',
                             delimiter=',', skiprows=1, dtype='str')

    # get names
    filenames = list(annotations[:, 2])
    for i in range(len(filenames)):
        filenames[i] = annotations[i, 0] + '/' + str(filenames[i])

    # get fine labels and limits for coarse classes
    fine_labels = annotations[:, 4:33].astype('float32').astype('int32')
    class_limits = [0, 4, 9, 10, 14, 19, 23, 28, 29]
    n_classes = len(class_limits) - 1
    n_samples = len(annotations)
    llabels = np.zeros((n_samples, n_classes), dtype='int')
    for k in range(n_classes):
        # a coarse class is present if any of its fine classes is
        block = fine_labels[:, class_limits[k]:class_limits[k + 1]]
        llabels[:, k] = block.max(1)

    wavs = np.zeros((2794, 441000))
    labels = np.zeros((2794, n_classes)).astype('int')
    filenames = files.getnames()
    cpt = 0
    for name in tqdm(filenames, ascii=True):
        if '.wav' not in name:
            continue
        wav = wav_read(files.extractfile(name))[1].astype('float32')
        wavs[cpt, :len(wav)] = wav
        labels[cpt] = llabels[filenames.index(name)]
        cpt += 1
    return wavs, labels

def load_DCLDE(window_size=441000, PATH=None):
    """ToDo"""
    if PATH is None:
        PATH = os.environ['DATASET_PATH']
    dict_init = [('sampling_rate', 44100), ("n_classes", 2), ("path", PATH),
                 ("name", "DCLDE"), ('classes', ["no bird", "bird"])]
    dataset = Dataset(**dict(dict_init))

    # Load the dataset (download if necessary) and set the class attributes.
    print("Loading DCLDE")
    t = time.time()
    if not os.path.isdir(PATH + 'DCLDE'):
        print('\tCreating Directory')
        os.mkdir(PATH + 'DCLDE')
    if not os.path.exists(PATH + 'DCLDE/DCLDE_LF_Dev.zip'):
        url = 'http://sabiod.univ-tln.fr/workspace/DCLDE2018/DCLDE_LF_Dev.zip'
        with DownloadProgressBar(unit='B', unit_scale=True,
                                 miniters=1, desc='Wav files') as pbar:
            urllib.request.urlretrieve(url, PATH + 'DCLDE/DCLDE_LF_Dev.zip')

    # Loading the files
    f = zipfile.ZipFile(PATH + 'DCLDE/DCLDE_LF_Dev.zip')
    wavs = list()
    # labels = list()
    for zipf in tqdm(f.filelist, ascii=True):
        if '.wav' in zipf.filename and '.d100.' in zipf.filename:
            wavfile = f.read(zipf)
            byt = io.BytesIO(wavfile)
            wav = wav_read(byt)[1].astype('float32')
            # cut each recording into non-overlapping window_size chunks
            for s in range(len(wav) // window_size):
                wavs.append(wav[s * window_size:(s + 1) * window_size])
            # labels.append(zipf.filename.split('/')[2])
    wavs = np.expand_dims(np.asarray(wavs), 1)
    dataset.add_variable({'signals': {'train_set': wavs}})
    print('Dataset DCLDE loaded in', '{0:.2f}'.format(time.time() - t), 's.')
    return dataset

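# The chunking loop above emits non-overlapping window_size segments and
# drops the trailing remainder. An equivalent vectorized sketch with
# numpy reshape (chunk is a hypothetical name):
import numpy as np

def chunk(wav, window_size):
    # Truncate to a multiple of window_size, then view the signal as
    # (n_chunks, window_size) -- the same result as the slicing loop.
    n = len(wav) // window_size
    return wav[:n * window_size].reshape(n, window_size)

print(chunk(np.arange(10), 4))  # [[0 1 2 3] [4 5 6 7]]; samples 8, 9 dropped
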
def __init__(self, channel_file_name):
    self.file_name = channel_file_name
    self.file_name_formatted = channel_file_name[:-4].capitalize()
    try:
        # Ignore warnings here because SciPy warns when it finds a
        # non-data block, like the header, which is not a problem for us.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.sample_rate, self.frames_array = wav_read(
                "./" + input_files_folder + "/" + channel_file_name)
        # The time each sample takes is the reciprocal of the sample rate.
        self.timing = 1 / self.sample_rate
    except FileNotFoundError:
        print(
            "Oops, I thought I found a file, but it seems it does not exist...\n"
            "If you are seeing this, something went pretty wrong, "
            "but I'm continuing anyway")
    self.note_indices = self.get_note_indices()
    # The first time we want all the note timings from the start of the list.
    self.note_framing_list = self.get_note_framing_list(0)
    self.note_timing_list = self.get_note_timing_list(self.note_framing_list)

def redraw():
    """Clears and re-draws the WAV plot with trim lines."""
    nonlocal sample_rate
    plot.cla()
    if current_path:
        wav_path = input_dir / current_path
        _LOGGER.debug("Loading %s", wav_path)
        sample_rate, wav_data = wav_read(str(wav_path))
        audio = wav_data[:, 0]  # plot the first channel only
        plot.plot(audio, color="blue")
        plot.set_xlim(0, len(audio))

        # Trim lines
        if left_cut is not None:
            plot.axvline(linewidth=2, x=left_cut, color="red")
        if right_cut is not None:
            plot.axvline(linewidth=2, x=right_cut, color="green")
    canvas.draw()

def read_wav(cls, path, channel=None, mmap=False):
    """load waveform from file"""
    try:
        fs, value = wav_read(path, mmap=mmap)
        if np.ndim(value) == 1:
            value = value.reshape(-1, 1)
    except ValueError:
        try:
            if mmap:
                logger.warning("mmap is not supported by soundfile, ignoring")
            import soundfile as sf
            # map libsndfile subtypes to the closest numpy dtype
            audioEncodings: DefaultDict[str, str] = defaultdict(lambda: "float64")
            audioEncodings["PCM_S8"] = "int16"   # soundfile does not support int8
            audioEncodings["PCM_U8"] = "int16"   # soundfile does not support uint8
            audioEncodings["PCM_16"] = "int16"
            audioEncodings["PCM_24"] = "int32"   # there is no np.int24
            audioEncodings["PCM_32"] = "int32"
            audioEncodings["FLOAT"] = "float32"
            audioEncodings["DOUBLE"] = "float64"
            file_info = sf.info(path)
            value, fs = sf.read(path,
                                dtype=audioEncodings[file_info.subtype],
                                always_2d=True)
        except ImportError:
            logger.error("Install soundfile for greater audio file compatibility")
            return None
        except RuntimeError:
            logger.error("Soundfile was unable to open file")
            return None
    if channel is not None:
        value = value[:, channel]
    wav = Wave(value, fs, path=path)
    return wav

def load(path=None):
    if path is None:
        path = os.environ['DATASET_PATH']
    speech_commands.download(path)
    t0 = time.time()
    print('Loading speech command')
    tar = tarfile.open(path + 'speech_commands/speech_commands_v0.01.tar.gz',
                       'r:gz')

    # Load train set
    wavs = list()
    labels = list()
    noises = list()
    noise_labels = list()
    names = tar.getmembers()
    for name in tqdm(names, ascii=True):
        if 'wav' not in name.name:
            continue
        f = tar.extractfile(name.name)
        wav = wav_read(f)[1]
        if 'noise' in name.name:
            noises.append(wav)
            noise_labels.append(name.name.split('/')[-1])
        else:
            # zero-pad each utterance symmetrically to 1 s at 16 kHz
            left = 16000 - len(wav)
            to_pad = left // 2
            wavs.append(np.pad(wav, [[to_pad, left - to_pad]]))
            labels.append(name.name.split('/')[-2])
    labels = np.array(labels)
    unique_labels = np.unique(labels)
    y = np.squeeze(np.array([np.nonzero(label == unique_labels)[0]
                             for label in labels]).astype('int32'))
    print('Dataset speech commands loaded in {0:.2f}s.'.format(time.time() - t0))
    return np.array(wavs).astype('float32'), y, labels, noises, noise_labels

def load(path=None, subsample=1):
    if path is None:
        path = os.environ['DATASET_PATH']
    freefield1010.download(path)
    t = time.time()

    # load labels
    labels = np.loadtxt(path + 'freefield1010/ff1010bird_metadata.csv',
                        delimiter=',', skiprows=1, dtype='int32')

    # load wavs
    f = zipfile.ZipFile(path + 'freefield1010/ff1010bird_wav.zip')

    # init. the data array
    N = labels.shape[0]
    wavs = np.empty((N, 441000 // subsample), dtype='float32')
    for i, files_ in tqdm(enumerate(labels[:, 0]), ascii=True, total=N):
        wavfile = f.read('wav/' + str(files_) + '.wav')
        byt = io.BytesIO(wavfile)
        wavs[i] = wav_read(byt)[1].astype('float32')[::subsample]
    labels = labels[:, 1]
    return wavs, labels

def load(path=None):
    """Audio binary classification, presence or absence of bird songs.

    `freefield1010 <http://machine-listening.eecs.qmul.ac.uk/bird-audio-detection-challenge/#downloads>`_
    is a collection of over 7,000 excerpts from field recordings around
    the world, gathered by the FreeSound project, and then standardised
    for research. This collection is very diverse in location and
    environment, and for the BAD Challenge we have newly annotated it
    for the presence/absence of birds.
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, "freefield1010", _urls)

    # load labels
    labels = np.loadtxt(
        path + "freefield1010/ff1010bird_metadata.csv",
        delimiter=",",
        skiprows=1,
        dtype="int32",
    )

    # load wavs
    f = zipfile.ZipFile(path + "freefield1010/ff1010bird_wav.zip")

    # init. the data array
    N = labels.shape[0]
    wavs = np.empty((N, 441000), dtype="float32")
    for i, files_ in tqdm(enumerate(labels[:, 0]), ascii=True, total=N):
        wavfile = f.read("wav/" + str(files_) + ".wav")
        byt = io.BytesIO(wavfile)
        wavs[i] = wav_read(byt)[1].astype("float32")
    labels = labels[:, 1]
    data = {"wavs": wavs, "labels": labels}
    return data

def __init__(
    self,
    filepath_or_array,
):
    if isinstance(filepath_or_array, str):
        # file path: read and normalize int16 samples to [-1, 1)
        self.sample_rate, self.data = wav_read(filepath_or_array)
        self.data = self.data / (2.**15.)
    elif isinstance(filepath_or_array, wav):
        # copy constructor from another wav instance
        self.sample_rate, self.data = (filepath_or_array.sample_rate,
                                       filepath_or_array.data)
    elif len(filepath_or_array) == 2 and not isinstance(
            filepath_or_array, np.ndarray):
        # (sample_rate, samples) pair
        self.sample_rate, self.data = (filepath_or_array[0],
                                       np.asarray(filepath_or_array[1]))
    else:
        # bare array of samples at the default sample rate
        self.sample_rate, self.data = (self.DEFAULT_SAMPLE_RATE,
                                       np.asarray(filepath_or_array))
    if len(self.data.shape) == 1:
        self.data = self.data.reshape(self.data.size, 1)
    self.shape = self.data.shape
    self.size, self.channels = self.shape
    self.length = float(self.size / self.sample_rate)
    self.time = np.arange(0, self.length, 1.0 / float(self.sample_rate))

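# The division by 2**15 above assumes 16-bit PCM input: int16 spans
# -32768..32767, so the scaled samples land in [-1, 1). A quick check:
import numpy as np

samples = np.array([-32768, 0, 32767], dtype=np.int16)
print(samples / 2.0**15)  # [-1.  0.  0.99996948]
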
def load(path=None):
    """ESC-10/50: Environmental Sound Classification

    https://github.com/karolpiczak/ESC-50#download

    The ESC-50 dataset is a labeled collection of 2000 environmental
    audio recordings suitable for benchmarking methods of environmental
    sound classification. The dataset consists of 5-second-long
    recordings organized into 50 semantical classes (with 40 examples
    per class) loosely arranged into 5 major categories:

        - Animals
        - Natural soundscapes & water sounds
        - Human, non-speech sounds
        - Interior/domestic sounds
        - Exterior/urban noises

    Clips in this dataset have been manually extracted from public field
    recordings gathered by the Freesound.org project. The dataset has
    been prearranged into 5 folds for comparable cross-validation,
    making sure that fragments from the same original source file are
    contained in a single fold.

    Parameters
    ----------

    path: str (optional)
        default ($DATASET_PATH), the path to look for the data and
        where the data will be downloaded if not present

    Returns
    -------

    wavs: array
        the wavs as a numpy array (matrix) with first dimension the data
        and second dimension time

    fine_labels: array
        the labels of the final classes (50 different ones) as an
        integer vector

    coarse_labels: array
        the labels of the classes' big categories (5 of them)

    folds: array
        the fold as an integer from 1 to 5 specifying how to split the
        data; one should not split a fold into train and test as it
        would make the same recording (but different subparts) be
        present in train and test, biasing the results optimistically

    esc10: array
        the boolean vector specifying if the corresponding datum
        (wav, label, ...) is in the ESC-10 dataset or not. That is, to
        load the ESC-10 dataset simply load ESC-50 and use this boolean
        vector to extract only the ESC-10 data.
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, _dataset, _urls, _baseurl)
    t0 = time.time()
    f = zipfile.ZipFile(path + "esc50/master.zip")

    # parse the meta file: filename, fold, fine label, category, esc10 flag
    meta = np.loadtxt(
        io.BytesIO(f.read("ESC-50-master/meta/esc50.csv")),
        delimiter=",",
        skiprows=1,
        dtype="str",
    )
    filenames = list(meta[:, 0])
    folds = meta[:, 1].astype("int32")
    fine_labels = meta[:, 2].astype("int32")
    categories = meta[:, 3]
    esc10 = meta[:, 4] == "True"
    coarse_labels = np.array([esc.fine_to_coarse[c] for c in categories])
    coarse_labels = coarse_labels.astype("int32")
    wavs = list()
    order = list()
    N = 0
    for filename in tqdm(f.namelist(), ascii=True):
        if ".wav" not in filename:
            continue
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        wavs.append(wav_read(byt)[1].astype("float32"))
        order.append(filenames.index(filename.split("/")[-1]))
        N = max(N, len(wavs[-1]))

    # center-pad, placing each wav at its row given by the meta-file order
    all_wavs = np.zeros((len(wavs), N))
    for i in range(len(wavs)):
        left = (N - len(wavs[i])) // 2
        all_wavs[order[i], left:left + len(wavs[i])] = wavs[i]
    data = {
        "wavs": all_wavs,
        "fine_labels": fine_labels,
        "coarse_labels": coarse_labels,
        "folds": folds,
        "esc10": esc10,
    }
    return data

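# As the docstring notes, evaluation should hold out whole folds so the
# same source recording never appears on both sides of a split. A
# minimal sketch over the returned dictionary:
# data = load()  # as above
test_mask = data["folds"] == 5          # hold out fold 5 for testing
x_train = data["wavs"][~test_mask]
y_train = data["fine_labels"][~test_mask]
x_test = data["wavs"][test_mask]
y_test = data["fine_labels"][test_mask]
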
def load(path=None):
    """
    Parameters
    ----------

    path: str (optional)
        default ($DATASET_PATH), the path to look for the data and
        where the data will be downloaded if not present

    Returns
    -------

    wavs: array
        the wavs as a numpy array (matrix) with first dimension the data
        and second dimension time

    labels: array
        the labels of the final classes (41 different ones) as an
        integer vector
    """
    if path is None:
        path = os.environ['DATASET_PATH']
    FSDKaggle2018.download(path)
    t0 = time.time()

    f = zipfile.ZipFile(path + 'FSDKaggle2018/audio_train.zip')
    wavs_train = list()
    names_train = list()
    for filename in tqdm(f.namelist(), ascii=True, desc='Loading train set'):
        if '.wav' not in filename:
            continue
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        wavs_train.append(wav_read(byt)[1].astype('float32'))
        names_train.append(filename.split('/')[-1])

    f = zipfile.ZipFile(path + 'FSDKaggle2018/audio_test.zip')
    wavs_test = list()
    names_test = list()
    for filename in tqdm(f.namelist(), ascii=True, desc='Loading test set'):
        if '.wav' not in filename:
            continue
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        wavs_test.append(wav_read(byt)[1].astype('float32'))
        names_test.append(filename.split('/')[-1])

    # match each wav to its metadata row by filename
    f = zipfile.ZipFile(path + 'FSDKaggle2018/meta.zip')
    meta_train = np.loadtxt(
        io.BytesIO(f.read('FSDKaggle2018.meta/train_post_competition.csv')),
        delimiter=',', skiprows=1, dtype='str')
    meta_test = np.loadtxt(
        io.BytesIO(f.read(
            'FSDKaggle2018.meta/test_post_competition_scoring_clips.csv')),
        delimiter=',', skiprows=1, dtype='str')

    filenames = list(meta_train[:, 0])
    labels_train, verified, fsid_train = [], [], []
    for i in range(len(wavs_train)):
        index = filenames.index(names_train[i])
        labels_train.append(meta_train[index][1])
        verified.append(meta_train[index][2])
        fsid_train.append(meta_train[index][3])

    filenames = list(meta_test[:, 0])
    labels_test, usage, fsid_test = [], [], []
    for i in range(len(wavs_test)):
        index = filenames.index(names_test[i])
        labels_test.append(meta_test[index][1])
        usage.append(meta_test[index][2])
        fsid_test.append(meta_test[index][3])

    dataset = {
        'wavs_train': wavs_train,
        'labels_train': labels_train,
        'verified_train': verified,
        'fsid_train': fsid_train,
        'wavs_test': wavs_test,
        'labels_test': labels_test,
        'usage_test': usage,
        'fsid_test': fsid_test,
    }
    return dataset

def load(path=None):
    """music instrument classification

    ref https://zenodo.org/record/1290750#.WzCwSRyxXMU

    This dataset includes musical audio excerpts with annotations of the
    predominant instrument(s) present. It was used for the evaluation in
    the following article:

    Bosch, J. J., Janer, J., Fuhrmann, F., & Herrera, P. "A Comparison of
    Sound Segregation Techniques for Predominant Instrument Recognition
    in Musical Audio Signals", in Proc. ISMIR (pp. 559-564), 2012

    Please Acknowledge IRMAS in Academic Research

    IRMAS is intended to be used for training and testing methods for
    the automatic recognition of predominant instruments in musical
    audio. The instruments considered are: cello, clarinet, flute,
    acoustic guitar, electric guitar, organ, piano, saxophone, trumpet,
    violin, and human singing voice. This dataset is derived from the
    one compiled by Ferdinand Fuhrmann in his PhD thesis, with the
    difference that we provide audio data in stereo format, the
    annotations in the testing dataset are limited to specific pitched
    instruments, and there is a different amount and length of excerpts.
    """
    if path is None:
        path = os.environ["DATASET_PATH"]
    download_dataset(path, "irmas", _urls)
    t0 = time.time()
    train_wavs = list()
    train_labels = list()
    test_wavs = list()
    test_labels = list()

    # loading the training set
    f = zipfile.ZipFile(path + "irmas/IRMAS-TrainingData.zip")
    namelist = f.namelist()
    for filename in tqdm(namelist, ascii=True):
        if ".wav" not in filename:
            continue
        wavfile = f.read(filename)
        byt = io.BytesIO(wavfile)
        train_wavs.append(wav_read(byt)[1].astype("float32"))
        train_labels.append(filename.split("/")[-2])

    # loading the testing set (three zip parts); each wav comes with a
    # .txt annotation holding the instrument label(s)
    base = "irmas/IRMAS-TestingData-Part{}.zip"
    for part in ["1", "2", "3"]:
        f = zipfile.ZipFile(path + base.format(part))
        namelist = f.namelist()
        for filename in tqdm(namelist, ascii=True,
                             desc="Test data {}/3".format(part)):
            if ".wav" not in filename:
                continue
            byt = io.BytesIO(f.read(filename))
            test_wavs.append(wav_read(byt)[1].astype("float32"))
            byt = io.BytesIO(f.read(filename.replace(".wav", ".txt")))
            test_labels.append(np.loadtxt(byt, dtype="str"))
    data = {
        "train_set/wavs": np.array(train_wavs),
        "train_set/labels": train_labels,
        "test_wavs": np.array(test_wavs),
        "test_labels": test_labels,
    }
    print("Dataset IRMAS loaded in {0:.2f}s.".format(time.time() - t0))
    return data

def to_wav(mp4_path):
    with TemporaryDirectory() as d:
        wav_path = path.join(d, "out.wav")
        check_call(["ffmpeg", "-v", "0", "-i", mp4_path, wav_path])
        return wav_read(wav_path)

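# A hedged usage sketch (requires ffmpeg on PATH; the input path is
# hypothetical). to_wav returns scipy.io.wavfile.read's
# (sample_rate, data) pair; the temporary wav is deleted on exit:
sample_rate, data = to_wav("clip.mp4")
print(sample_rate, data.shape)
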