def noisebursts(self):
    """Sweep the noise-burst detector over ``threshold`` and ``alpha``.

    For each candidate value every file in ``self.files`` is loaded, mixed
    down by summing the channel axis and dividing by the reported channel
    count, and passed to ``essNoiseburstDetector``. The third detector
    output is collected per file, the results are sorted by relative file
    name and scored with ``self.evaluateValue`` against "NoiseBursts".
    Precision, recall and F-beta are plotted per parameter with ``u.plot``.
    """

    def run_sweep(banner, candidates, param, plot_path):
        # One full pass over the dataset per candidate value of ``param``.
        precisions, recalls, f_scores = [], [], []
        for candidate in candidates:
            print(banner.format(candidate))
            per_file = []
            for idx, path in enumerate(self.files):
                print("Executing file {} number {}/{}".format(path, idx + 1, len(self.files)), end='\r')
                samples, sr, n_channels, _, _, _ = std.AudioLoader(filename=path)()
                mono = np.sum(samples, axis=1) / n_channels
                _, _, verdict = essNoiseburstDetector(mono, **{param: candidate})
                per_file.append((path.replace(self.wavDatasetPath, ""), verdict))
            print('')
            per_file.sort(key=lambda item: item[0])
            _, precision, recall = self.evaluateValue(per_file, "NoiseBursts")
            precisions.append(precision)
            recalls.append(recall)
            # F-beta is undefined when both terms are zero; record 0.0 then.
            f_scores.append(
                0.0 if (precision + recall) == 0.0
                else (1 + Fbeta**2) * precision * recall / (Fbeta**2 * precision + recall)
            )
        u.plot(plot_path, precision=precisions, recall=recalls,
               Fscore=f_scores, x_values=candidates)

    run_sweep("threshold: {} being evaluated",
              [2 * step for step in range(-5, 6)],
              "threshold",
              "./results/noisethreshold.png")
    run_sweep("alpha: {} being evaluated",
              [float(step) / 10 for step in range(1, 10)],
              "alpha",
              "./results/noiseealpha.png")
def bandwidth(self):
    """Sweep the bandwidth detector over ``sumThreshold`` and ``confTh``.

    For each candidate value every file in ``self.files`` is loaded, mixed
    down by summing the channel axis and dividing by the reported channel
    count, and passed (with its sample rate) to a ``BwDetection`` instance.
    The third detector output is collected per file, the results are sorted
    by relative file name and scored with ``self.evaluateValue`` against
    "Bandwidth". Precision, recall and F-beta are plotted with ``u.plot``.
    """

    def f_score(precision, recall):
        # Report 0.0 instead of raising ZeroDivisionError when the F-beta
        # denominator vanishes (precision and recall both zero).
        denominator = Fbeta**2 * precision + recall
        if denominator == 0:
            return 0.0
        return (1 + Fbeta**2) * precision * recall / denominator

    def run_sweep(label, candidates, detector_for, plot_path):
        # ``detector_for(value)`` returns a callable ``(audio, sr) -> outputs``.
        precisions, recalls, f_scores = [], [], []
        for candidate in candidates:
            print("{}: {} being evaluated".format(label, candidate))
            detect = detector_for(candidate)  # built once per value, not per file
            per_file = []
            for idx, path in enumerate(self.files):
                print("Executing file {} number {}/{}".format(path, idx + 1, len(self.files)), end='\r')
                samples, sr, n_channels, _, _, _ = std.AudioLoader(filename=path)()
                mono = np.sum(samples, axis=1) / n_channels
                _, _, ret = detect(mono, sr)
                per_file.append((path.replace(self.wavDatasetPath, ""), ret))
            print('')
            per_file.sort(key=lambda item: item[0])
            _, precision, recall = self.evaluateValue(per_file, "Bandwidth")
            precisions.append(precision)
            recalls.append(recall)
            f_scores.append(f_score(precision, recall))
        u.plot(plot_path, precision=precisions, recall=recalls,
               Fscore=f_scores, x_values=candidates)

    # ``sumThreshold`` is a call-time argument, so one default-configured
    # detector is shared by the whole first sweep.
    shared_detector = BwDetection()
    run_sweep(
        "sumThreshold",
        [1e-7, 5e-7, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4],
        lambda value: (lambda audio, sr: shared_detector(audio, sr, sumThreshold=value)),
        "./results/BWsumThreshold.png",
    )
    # ``confTh`` is a constructor argument, so each value gets its own detector.
    run_sweep(
        "ConfTh",
        [0.6, 0.7, 0.8, 0.9],
        lambda value: BwDetection(confTh=value),
        "./results/BWConfTh.png",
    )
def silence(self):
    """Sweep the start/stop detector over ``threshold`` and ``frameSize``.

    For each candidate value every file in ``self.files`` is loaded, mixed
    down by summing the channel axis and dividing by the reported channel
    count, and passed to ``essStartstopDetector``. The second detector
    output is collected per file, the results are sorted by relative file
    name and scored with ``self.evaluateValue``. Precision, recall and
    F-beta are plotted with ``u.plot``. In the frame-size sweep the hop size
    is set equal to the frame size.
    """

    def f_score(precision, recall):
        # Report 0.0 instead of raising ZeroDivisionError when the F-beta
        # denominator vanishes (precision and recall both zero).
        denominator = Fbeta**2 * precision + recall
        if denominator == 0:
            return 0.0
        return (1 + Fbeta**2) * precision * recall / denominator

    def run_sweep(label, candidates, kwargs_for, plot_path):
        # ``kwargs_for(value)`` gives the detector keyword arguments to use.
        precisions, recalls, f_scores = [], [], []
        for candidate in candidates:
            print("{}: {} being evaluated".format(label, candidate))
            per_file = []
            for idx, path in enumerate(self.files):
                print("Executing file {} number {}/{}".format(path, idx + 1, len(self.files)), end='\r')
                samples, sr, n_channels, _, _, _ = std.AudioLoader(filename=path)()
                mono = np.sum(samples, axis=1) / n_channels
                _, ret = essStartstopDetector(mono, **kwargs_for(candidate))
                per_file.append((path.replace(self.wavDatasetPath, ""), ret))
            print('')
            per_file.sort(key=lambda item: item[0])
            # NOTE(review): scored against the "Clicks" label although this
            # sweeps the start/stop detector -- confirm the ground-truth key.
            _, precision, recall = self.evaluateValue(per_file, "Clicks")
            precisions.append(precision)
            recalls.append(recall)
            f_scores.append(f_score(precision, recall))
        u.plot(plot_path, precision=precisions, recall=recalls,
               Fscore=f_scores, x_values=candidates)

    run_sweep(
        "threshold",
        list(range(-100, 0, 10)),  # -100, -90, ..., -10
        lambda value: {"threshold": value},
        "./results/silencethreshold.png",
    )
    run_sweep(
        "frameSize",
        [2**exponent for exponent in range(5, 10)],  # 32 ... 512
        lambda value: {"frameSize": value, "hopSize": value},
        "./results/silenceframeSize.png",
    )
def analyze_misc(filename, segment_duration=20):
    """Collect frame-wise descriptors and EBU momentary loudness for a file.

    Replay gain is computed on the whole mono signal; the analysis itself
    runs on the ``segment_duration``-second segment centred in time, loaded
    with that replay gain applied. Frame positions are converted to seconds
    using a fixed 44100 Hz rate.

    :param filename: path of the audio file to analyse
    :param segment_duration: length in seconds of the centred segment
    :return: an ``essentia.Pool`` holding 'rms', 'rms_spectrum', 'hfc',
        'spectral_centroid' and 'zcr' (one value per frame) plus
        'ebu_momentary'
    :raises ValueError: if the file is shorter than ``segment_duration``
    """
    full_audio = es.MonoLoader(filename=filename)()
    gain = es.ReplayGain()(full_audio)

    total_seconds = len(full_audio) / 44100
    segment_start = (total_seconds - segment_duration) / 2
    segment_end = segment_start + segment_duration
    if segment_start < 0 or segment_end > total_seconds:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    segment_loader = es.EasyLoader(filename=filename, replayGain=gain,
                                   startTime=segment_start,
                                   endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    powerspectrum = es.PowerSpectrum()  # instantiated but never used below
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    segment = segment_loader()
    for frame in es.FrameGenerator(segment, frameSize=2048, hopSize=1024):
        magnitudes = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(magnitudes))
        pool.add('hfc', hfc(magnitudes))
        pool.add('spectral_centroid', centroid(magnitudes))
        pool.add('zcr', zcr(frame))

    # There is no stereo resampler, so the stereo signal is split, each
    # channel is resampled to 44100 Hz on its own, and the pair is re-joined.
    stereo, native_rate, _, _, _, _ = es.AudioLoader(filename=filename)()
    left, right = es.StereoDemuxer()(stereo)
    to_44k = es.Resample(inputSampleRate=native_rate, outputSampleRate=44100)
    left = to_44k(left)
    right = to_44k(right)
    stereo = es.StereoMuxer()(left, right)
    stereo = es.StereoTrimmer(startTime=segment_start,
                              endTime=segment_end)(stereo)

    # Hop of 1024 samples at 44100 Hz, matching the frame hop used above.
    momentary, _, _, _ = es.LoudnessEBUR128(hopSize=1024 / 44100,
                                            startAtZero=True)(stereo)
    pool.set('ebu_momentary', momentary)
    return pool
def Bit_Detection(fpath: str):
    """Print, for each smaller candidate bit depth, a score for a wav file.

    The container bit depth ``b`` is derived from the loader's bit rate,
    sample rate and channel count. The first channel is rescaled from
    [-1, 1] to [0, 2**b] and truncated to integers, and 100 random chunks
    of 100 samples are inspected. For every candidate depth (``b - 8``,
    ``b - 16``, ... down to 8) the share of samples that are exact
    multiples of ``2**(b - candidate)`` is counted and ``1 - share`` is
    printed as ``prob``. Nothing is returned.

    :param fpath: path to a ``.wav`` file
    :raises ValueError: if ``fpath`` does not end in ``.wav`` or the audio
        is too short to draw the random chunks from
    """
    if os.path.splitext(fpath)[1] != ".wav":
        raise ValueError("file must be wav")

    audio, SR, channels, _, br, _ = estd.AudioLoader(filename=fpath)()
    b = int(br / SR / channels)  # number of bits used to code the signal
    if channels >= 1:
        audio = audio[:, 0]
    audio = (2**b) * ((0.5 * audio) + 0.5)

    # Candidate depths: b-8, b-16, ... while still >= 8.
    possible_b_array = list(range(b - 8, 7, -8))

    chunk_len = 100
    number_of_chunks = 100
    highest_start = len(audio) - chunk_len - 1
    if highest_start <= 0:
        # np.random.randint would otherwise fail with an opaque "low >= high".
        raise ValueError(
            "audio is too short: need more than {} samples, got {}".format(
                chunk_len + 1, len(audio)))
    positions = np.random.randint(0, highest_start, size=number_of_chunks)

    # Gather all chunks in one pass and truncate to integers (same
    # toward-zero rounding as int()).
    samples = np.concatenate(
        [audio[int(start):int(start) + chunk_len] for start in positions]
    ).astype(np.int64)

    conf_arr = []
    for possible_b in possible_b_array:
        hop = 2**(b - possible_b)
        # NOTE(review): samples that ARE multiples of ``hop`` are counted as
        # "wrong", so ``conf`` drops when the data fits the lower depth.
        # This looks inverted -- confirm the intended meaning of ``prob``.
        wrong = int(np.count_nonzero(samples % hop == 0))
        conf = 1 - wrong / len(samples)
        conf_arr.append(conf)
        print("b:{0}\tprob:{1}".format(possible_b, conf))
    print(possible_b_array, conf_arr)
def Bit_Detection_Binary(fpath: str):
    """Print the position of the last bit ever set in sampled audio.

    The container bit depth ``b`` is derived from the loader's bit rate,
    sample rate and channel count. The first channel is scaled by
    ``2**(b - 1)`` and cast to an integer type, and 100 random chunks of
    100 samples are converted with ``convert_to_bin_array`` and OR-ed
    together position by position. The 1-based position of the last
    non-zero entry of that OR is printed; 0 is printed when no bit was set
    in any inspected sample. Nothing is returned.

    :param fpath: path to an existing ``.wav`` file
    :raises ValueError: if the extension is not ``.wav``, the file does not
        exist, the bit depth is not 8/16/24/32, or the audio is too short
        to draw the random chunks from
    """
    if os.path.splitext(fpath)[1] != ".wav":
        raise ValueError("file must be wav")
    if not os.path.exists(fpath):
        raise ValueError("file {} does not exist".format(fpath))

    audio, SR, channels, _, br, _ = estd.AudioLoader(filename=fpath)()
    b = int(br / SR / channels)  # number of bits used to code the signal
    if b not in [8, 16, 24, 32]:
        raise ValueError("Only bit depths accepted are 8, 16, 24, 32")
    if channels >= 1:
        audio = audio[:, 0]  # only the first channel is analysed

    # Scale to the integer range of a b-bit signal. 24-bit data is held in
    # int32 because there is no 24-bit integer dtype.
    int_dtype = {8: 'int8', 16: 'int16', 24: 'int32', 32: 'int32'}[b]
    audio = ((2**(b - 1)) * audio.astype('float64')).astype(int_dtype)

    # 100 random slices of 100 samples each.
    chunk_len = 100
    number_of_chunks = 100
    highest_start = len(audio) - chunk_len - 1
    if highest_start <= 0:
        # np.random.randint would otherwise fail with an opaque "low >= high".
        raise ValueError(
            "audio is too short: need more than {} samples, got {}".format(
                chunk_len + 1, len(audio)))
    positions = np.random.randint(0, highest_start, size=number_of_chunks)
    audio_to_analyse = np.concatenate(
        [audio[int(start):int(start) + chunk_len] for start in positions])

    # OR the bit arrays of all samples together, position by position.
    result = [0] * b
    for sample in audio_to_analyse:
        bin_arr = convert_to_bin_array(sample, b)
        result = [seen or bit for seen, bit in zip(result, bin_arr)]

    # Default covers all-zero input, where the search below finds nothing.
    bits_predicted = 0
    for i, el in enumerate(reversed(result)):
        if el != 0:
            bits_predicted = len(result) - i
            break
    print(bits_predicted)
def load_audio(path, sample_rate, mono=True):
    """
    Load an audio file with Essentia, optionally mixing to mono and resampling

    :param path: (str) location of audio file to load
    :param sample_rate: (int) sampling rate to load audio at
    :param mono: (bool) convert file to mono, defaults to True
    :return: tuple ``(samples, channels)`` -- the audio samples and the
        channel count reported by the loader for the original file
    :raises RuntimeError: if the file has more than two channels
    """
    loaded = es.AudioLoader(filename=path)()
    samples, orig_rate, channels = loaded[0], loaded[1], loaded[2]

    # Only mono and stereo inputs are supported.
    if channels > 2:
        raise RuntimeError("Can't handle more than two audio channels.")

    # The loader always hands back two columns; for a one-channel file the
    # right column is all zeros, so copy the left column over it to get a
    # true stereo signal for later stereo processing.
    if channels == 1:
        samples[:, 1] = samples[:, 0]

    if mono:
        samples = mix_to_mono(samples)

    # Nothing more to do when the file is already at the requested rate.
    if orig_rate == sample_rate:
        return samples, channels

    resample = es.Resample(inputSampleRate=orig_rate,
                           outputSampleRate=sample_rate)
    if mono:
        return resample(samples), channels

    # Stereo: resample each channel on its own, then restore the
    # (n_samples, 2) layout.
    left = resample(samples[:, 0])
    right = resample(samples[:, 1])
    return np.array([left, right]).T, channels
def channelSep(filename_wav, path_dcase):
    """
    Split a stereo file into left, right, average and difference wav files

    Each derived signal is written under its own sub-directory of
    ``path_dcase`` ('audio_left', 'audio_right', 'audio_average',
    'audio_difference'), keeping the input's base name and sample rate.

    :param filename_wav: path of the stereo file to load
    :param path_dcase: root directory containing the four output folders
    :return: None
    """
    audio, sr, _num_chan, _md5, _bitrate, _codec = es.AudioLoader(
        filename=filename_wav)()
    base_name = os.path.basename(filename_wav)
    print(base_name)

    left = audio[:, 0]
    right = audio[:, 1]
    outputs = (
        ('audio_left', left),
        ('audio_right', right),
        ('audio_average', left / 2.0 + right / 2.0),
        ('audio_difference', left - right),
    )
    for subdir, signal in outputs:
        wavfile.write(os.path.join(path_dcase, subdir, base_name), sr, signal)
def Bit_Detection_multifile(folder: str):
    """Run bit-depth detection on every wav file in a folder and tabulate it.

    For each ``.wav`` entry of ``folder`` the container bit depth is derived
    from the loader's bit rate, sample rate and channel count, and
    ``Bit_Detection_Binary`` is called on the first channel. One row per
    file (container depth, extracted depth, and whether the extracted depth
    is smaller) is printed as a table indexed by file name and written to
    ``results.tsv`` in the current working directory.

    :param folder: directory to scan; non-wav entries are skipped
    :raises ValueError: if ``folder`` does not exist
    """
    if not os.path.exists(folder):
        raise ValueError("{} does not exist".format(folder))

    columns = ["Filename", "Container", "Extracted", "Problem in file"]
    rows = []
    for entry in os.listdir(folder):
        if os.path.splitext(entry)[1] != ".wav":
            print("{} skipped because it was not a wav file".format(entry))
            continue
        fpath = os.path.join(folder, entry)
        audio, SR, channels, _, br, _ = estd.AudioLoader(filename=fpath)()
        if channels >= 1:
            audio = audio[:, 0]
        b = int(br / SR / channels)  # number of bits used to code the signal
        # NOTE(review): this relies on a Bit_Detection_Binary(audio, bits)
        # variant that RETURNS the detected depth -- confirm which
        # implementation is in scope.
        extracted_b = Bit_Detection_Binary(audio, b)
        rows.append({
            "Filename": entry,
            "Container": b,
            "Extracted": extracted_b,
            "Problem in file": extracted_b < b,
        })

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x,
    # and the explicit columns keep set_index working when no wav was found.
    df = pd.DataFrame(rows, columns=columns).set_index("Filename")
    print(df)
    with open("results.tsv", "w") as tsv:
        df.to_csv(tsv, sep="\t")
def clicks(self):
    """Sweep four click-detector parameters, one at a time.

    The parameters are ``order``, ``detectionThreshold``,
    ``powerEstimationThreshold`` and ``silenceThreshold``. For each
    candidate value every file in ``self.files`` is loaded, mixed down by
    summing the channel axis and dividing by the reported channel count,
    and passed to ``essClickDetector``. The fourth detector output is
    collected per file, the results are sorted by relative file name and
    scored with ``self.evaluateValue`` against "Clicks". Precision, recall
    and F-beta are plotted to ``./results/clicks<parameter>.png``.
    """

    def f_score(precision, recall):
        # Report 0.0 instead of raising ZeroDivisionError when the F-beta
        # denominator vanishes (precision and recall both zero).
        denominator = Fbeta**2 * precision + recall
        if denominator == 0:
            return 0.0
        return (1 + Fbeta**2) * precision * recall / denominator

    def run_sweep(param, candidates):
        # One full pass over the dataset per candidate value of ``param``.
        precisions, recalls, f_scores = [], [], []
        for candidate in candidates:
            print("{}: {} being evaluated".format(param, candidate))
            per_file = []
            for idx, path in enumerate(self.files):
                print("Executing file {} number {}/{}".format(path, idx + 1, len(self.files)), end='\r')
                samples, sr, n_channels, _, _, _ = std.AudioLoader(filename=path)()
                mono = np.sum(samples, axis=1) / n_channels
                _, _, _, ret = essClickDetector(mono, **{param: candidate})
                per_file.append((path.replace(self.wavDatasetPath, ""), ret))
            print('')
            per_file.sort(key=lambda item: item[0])
            _, precision, recall = self.evaluateValue(per_file, "Clicks")
            precisions.append(precision)
            recalls.append(recall)
            f_scores.append(f_score(precision, recall))
        u.plot("./results/clicks{}.png".format(param), precision=precisions,
               recall=recalls, Fscore=f_scores, x_values=candidates)

    run_sweep("order", list(range(2, 40, 2)))                    # 2 ... 38
    run_sweep("detectionThreshold", [0, 5, 10, 15, 20, 25, 30, 35])
    run_sweep("powerEstimationThreshold", list(range(2, 16, 2)))  # 2 ... 14
    run_sweep("silenceThreshold", list(range(-70, 0, 10)))        # -70 ... -10
def hum(self):
    """Sweep four hum-detector parameters, one at a time.

    The parameters are ``timeWindow``, ``minimumDuration``,
    ``timeContinuity`` and ``numberHarmonics``. For each candidate value
    every file in ``self.files`` is loaded, mixed down by summing the
    channel axis and dividing by the reported channel count, and passed to
    ``essHumDetector``. The second detector output is collected per file,
    the results are sorted by relative file name and scored with
    ``self.evaluateValue`` against "Hum". Precision, recall and F-beta are
    plotted with ``u.plot``.
    """

    def f_score(precision, recall):
        # Report 0.0 instead of raising ZeroDivisionError when the F-beta
        # denominator vanishes (precision and recall both zero).
        denominator = Fbeta**2 * precision + recall
        if denominator == 0:
            return 0.0
        return (1 + Fbeta**2) * precision * recall / denominator

    def run_sweep(param, candidates, plot_path):
        # One full pass over the dataset per candidate value of ``param``.
        precisions, recalls, f_scores = [], [], []
        for candidate in candidates:
            # The banner names the parameter actually being swept.
            print("{}: {} being evaluated".format(param, candidate))
            per_file = []
            for idx, path in enumerate(self.files):
                print("Executing file {} number {}/{}".format(path, idx + 1, len(self.files)), end='\r')
                samples, sr, n_channels, _, _, _ = std.AudioLoader(filename=path)()
                mono = np.sum(samples, axis=1) / n_channels
                _, ret = essHumDetector(mono, **{param: candidate})
                per_file.append((path.replace(self.wavDatasetPath, ""), ret))
            print('')
            per_file.sort(key=lambda item: item[0])
            _, precision, recall = self.evaluateValue(per_file, "Hum")
            precisions.append(precision)
            recalls.append(recall)
            f_scores.append(f_score(precision, recall))
        u.plot(plot_path, precision=precisions, recall=recalls,
               Fscore=f_scores, x_values=candidates)

    run_sweep("timeWindow", [0.1, 0.3, 0.5, 1, 3, 5],
              "./results/HumTimeWindow.png")
    run_sweep("minimumDuration", [0.01, 0.07, 0.1, 0.3, 0.5, 1, 3, 5],
              "./results/HumminimumDuration.png")
    run_sweep("timeContinuity", [0.1, 0.3, 0.5, 1, 3, 5],
              "./results/HumtimeContinuity.png")
    run_sweep("numberHarmonics", list(range(6)),
              "./results/HumnumberHarmonics.png")
def lowsnr(self):
    """Sweep three low-SNR detector constructor parameters, one at a time.

    The parameters are ``nrgThreshold``, ``acThreshold`` and
    ``snrThreshold``. For each candidate value a ``LowSnrDetector`` is
    built with that single parameter overridden; every file in
    ``self.files`` is loaded, mixed down by summing the channel axis and
    dividing by the reported channel count, and passed to it. The second
    detector output is collected per file, the results are sorted by
    relative file name and scored with ``self.evaluateValue`` against
    "lowSNR". Precision, recall and F-beta are plotted with ``u.plot``.
    """

    def f_score(precision, recall):
        # Report 0.0 instead of raising ZeroDivisionError when the F-beta
        # denominator vanishes (precision and recall both zero).
        denominator = Fbeta**2 * precision + recall
        if denominator == 0:
            return 0.0
        return (1 + Fbeta**2) * precision * recall / denominator

    def run_sweep(param, candidates, plot_path):
        # One detector per candidate value, reused for every file.
        precisions, recalls, f_scores = [], [], []
        for candidate in candidates:
            # The banner names the parameter actually being swept.
            print("{}: {} being evaluated".format(param, candidate))
            per_file = []
            detector = LowSnrDetector(**{param: candidate})
            for idx, path in enumerate(self.files):
                print("Executing file {} number {}/{}".format(path, idx + 1, len(self.files)), end='\r')
                samples, sr, n_channels, _, _, _ = std.AudioLoader(filename=path)()
                mono = np.sum(samples, axis=1) / n_channels
                _, ret = detector(mono)
                per_file.append((path.replace(self.wavDatasetPath, ""), ret))
            print('')
            per_file.sort(key=lambda item: item[0])
            _, precision, recall = self.evaluateValue(per_file, "lowSNR")
            precisions.append(precision)
            recalls.append(recall)
            f_scores.append(f_score(precision, recall))
        u.plot(plot_path, precision=precisions, recall=recalls,
               Fscore=f_scores, x_values=candidates)

    run_sweep("nrgThreshold", [0.1, 0.3, 0.5, 0.7, 0.9],
              "./results/snrnrgThreshold.png")
    run_sweep("acThreshold", [0.1, 0.3, 0.5, 0.7, 0.9],
              "./results/snracThreshold.png")
    run_sweep("snrThreshold", [-3, -1, 1, 3, 5, 7, 9],
              "./results/snrThreshold.png")
def detectBW(fpath: str, frame_size: float, hop_size: float, floor_db: float, oversample_f: int):
    """Estimate and plot the cut-off frequency of a wav file.

    The file is loaded and its two channels averaged. Each Hann-windowed
    frame is transformed to a dB magnitude spectrum, a linearly
    interpolated spectral envelope is computed and floored, and a cut-off
    bin is derived from it with ``compute_fc``. The cut-off bins are
    histogrammed, ``compute_mean_fc`` produces the final estimate, and the
    result is printed and shown in a three-panel figure (per-frame bins,
    histogram, frame-averaged envelope with the estimate marked).

    :param fpath: path to a ``.wav`` file
    :param frame_size: analysis frame size before oversampling
    :param hop_size: hop between frames
    :param floor_db: floor passed to ``modify_floor``
    :param oversample_f: factor the frame size is multiplied by; must
        satisfy ``is_power2``
    :raises ValueError: if the extension is not ``.wav`` or the oversample
        factor is rejected by ``is_power2``
    """
    if os.path.splitext(fpath)[1] != ".wav":
        raise ValueError("file must be wav")
    if not is_power2(oversample_f):
        raise ValueError("oversample factor can only be 1, 2 or 4")

    # The loader returns six values; only samples and sample rate are used.
    audio, SR = estd.AudioLoader(filename=fpath)()[:2]
    if audio.shape[1] != 1:
        audio = (audio[:, 0] + audio[:, 1]) / 2  # downmix to mono

    frame_size *= oversample_f
    n_bins = int(frame_size / 2) + 1
    f = np.arange(n_bins) / frame_size * SR  # frequency of each bin
    fc_index_arr = []
    interpolated_spectrum = np.zeros(n_bins)

    fft = estd.FFT(size=frame_size)
    window = estd.Windowing(size=frame_size, type="hann")
    frames = estd.FrameGenerator(audio, frameSize=frame_size,
                                 hopSize=hop_size, startFromZero=True)
    for i, frame in enumerate(frames):
        magnitude = abs(fft(window(frame)))
        magnitude_db = 20 * np.log10(magnitude + eps)

        # Linear interpolation between the spectrum's maxima, then floored.
        envelope = compute_spectral_envelope(magnitude_db, f, "linear")
        envelope = modify_floor(envelope, floor_db, log=True)

        fc_index = compute_fc(envelope)
        fc_index_arr.append(fc_index)
        # NOTE(review): a frame that passes the energy check gets its
        # cut-off appended a second time -- confirm this double weighting
        # is intended rather than a leftover unconditional append.
        if energy_verification(magnitude, fc_index):
            fc_index_arr.append(fc_index)

        interpolated_spectrum += envelope
    interpolated_spectrum /= i + 1  # average over the frames seen

    if len(fc_index_arr) == 0:
        fc_index_arr = [frame_size]

    hist = compute_histogram(fc_index_arr, f)
    fc, conf, binary = compute_mean_fc(hist, fc_index_arr, f, SR)
    print("filename: ", fpath, "mean_fc: ", fc, " conf: ", conf,
          " binary_result: ", binary)

    fig, ax = plt.subplots(3, 1, figsize=(15, 9))
    ax[0].plot(fc_index_arr, "x")
    ax[1].stem(f, hist)
    ax[2].plot(f, interpolated_spectrum)
    ax[2].axvline(x=fc, color="r")
    plt.show()
import os

import essentia.standard as estd
import matplotlib.pyplot as plt
import numpy as np
from essentia import array as esarr

# Folder scanned for wav files; one spectrum plot is saved next to each file.
DIR = "../Dataset/BW detection/"

for file in os.listdir(DIR):
    fpath = os.path.join(DIR, file)
    name, extension = os.path.splitext(file)
    print(file)
    if extension != ".wav":
        continue

    x, SR, channels, _, br, _ = estd.AudioLoader(filename=fpath)()
    channels = x.shape[1]
    if channels != 1:
        x = (x[:, 0] + x[:, 1]) / 2  # average the two columns
    print(x.shape, SR, channels, br)

    # Window the whole signal, then zero-pad to the next power of two.
    window = estd.Windowing(size=len(x), type="hann")
    x = window(x)
    N = int(2**(np.ceil(np.log2(len(x)))))
    x = np.append(x, np.zeros(N - len(x)))
    x = esarr(x)

    tfX = estd.FFT()(x)
    tfX = 20 * np.log10(abs(tfX))  # magnitude in dB
    f = np.arange(int(len(x) / 2) + 1) / len(x) * SR

    # Start a fresh figure per file; otherwise each saved image would also
    # contain the curves of every file plotted before it.
    plt.figure()
    plt.plot(f, tfX[:int(len(x) / 2) + 1])
    plt.savefig(os.path.join(DIR, name + ".png"))
    plt.close()
import matplotlib.pyplot as plt import essentia.standard as ess import numpy as np M = 1024 N = 1024 H = 512 fs = 44100 x = ess.MonoLoader(filename='output3.wav', sampleRate=fs)() ess.AudioLoader() spectrum = ess.Spectrum(size=N) window = ess.Windowing(size=M, type='hann') pitchYin = ess.PitchYin() hpcp = ess.HPCP() hpcps = [] spectralPeaks = ess.SpectralPeaks() pitches = [] pitchConfidences = [] for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True): pitch, pitchConfidence = pitchYin(frame) if pitchConfidence < 0.9:
def saturation(self):
    """Sweep three saturation-detector parameters, one at a time.

    The parameters are ``energyThreshold``, ``differentialThreshold`` and
    ``minimumDuration``. For each candidate value every file in
    ``self.files`` is loaded, mixed down by summing the channel axis and
    dividing by the reported channel count, and passed to
    ``essSaturationDetector``. The fourth detector output is collected per
    file, the results are sorted by relative file name and scored with
    ``self.evaluateValue`` against "Saturation". Precision, recall and
    F-beta are plotted with ``u.plot``.
    """

    def f_score(precision, recall):
        # Report 0.0 instead of raising ZeroDivisionError when the F-beta
        # denominator vanishes (precision and recall both zero).
        denominator = Fbeta**2 * precision + recall
        if denominator == 0:
            return 0.0
        return (1 + Fbeta**2) * precision * recall / denominator

    def run_sweep(label, param, candidates, plot_path):
        # One full pass over the dataset per candidate value of ``param``.
        precisions, recalls, f_scores = [], [], []
        for candidate in candidates:
            print("{}: {} being evaluated".format(label, candidate))
            per_file = []
            for idx, path in enumerate(self.files):
                print("Executing file {} number {}/{}".format(path, idx + 1, len(self.files)), end='\r')
                samples, sr, n_channels, _, _, _ = std.AudioLoader(filename=path)()
                mono = np.sum(samples, axis=1) / n_channels
                _, _, _, ret = essSaturationDetector(mono, **{param: candidate})
                per_file.append((path.replace(self.wavDatasetPath, ""), ret))
            print('')
            per_file.sort(key=lambda item: item[0])
            _, precision, recall = self.evaluateValue(per_file, "Saturation")
            precisions.append(precision)
            recalls.append(recall)
            f_scores.append(f_score(precision, recall))
        u.plot(plot_path, precision=precisions, recall=recalls,
               Fscore=f_scores, x_values=candidates)

    # Each banner names the parameter actually being swept.
    run_sweep("energy Threshold", "energyThreshold",
              [-30, -20, -10, -7, -5, -3, -1, -0.01],
              "./results/satEnergyThreshold.png")
    run_sweep("differentialThreshold", "differentialThreshold",
              [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
              "./results/satDifferentialThreshold.png")
    run_sweep("minimumDuration", "minimumDuration",
              [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
              "./results/satMinimumDuration.png")
def Bit_Detection(fpath: str):
    """Plot a histogram of the quantised sample values of a wav file.

    Work-in-progress debugging routine: it plots the histogram of the first
    10000 samples of channel 0 and then stops on an unconditional assertion.

    Args:
        fpath: path of the audio file; its extension must be ".wav".

    Raises:
        ValueError: if ``fpath`` does not have a ".wav" extension.
        AssertionError: always, after the first plot (debug stop), unless
            Python runs with assertions disabled.
    """
    extension = os.path.splitext(fpath)[1]
    if extension != ".wav":
        raise ValueError("file must be wav")

    # Only the samples, sample rate, channel count and bit rate are used.
    audio, SR, channels, _, br, _ = estd.AudioLoader(filename=fpath)()

    # Number of bits per sample, derived from the reported bit rate.
    b = int(br / SR / channels)
    levels = 2**b
    possible_values = np.arange(levels)
    print("array created")

    # Rescale channel 0 as levels * (0.5 * x + 0.5) and keep the first
    # 10000 samples only.
    first_channel = levels * ((0.5 * audio[:, 0]) + 0.5)
    audio_int_channel = first_channel[:10000]
    hist = compute_histogram(audio_int_channel, possible_values)
    plt.plot(hist, 'x')
    plt.show()

    # Debug stop: everything below runs only when asserts are stripped
    # (e.g. `python -O`).
    assert False

    # A peak-spacing estimate of the effective bit depth (with a confidence
    # value and per-channel plots) used to follow the histogram here; it is
    # currently disabled, so only the histogram is computed per channel.
    for channel in range(channels):
        audio_int_channel = levels * ((0.5 * audio[:, channel]) + 0.5)
        hist = compute_histogram(audio_int_channel, possible_values)

    plt.plot(hist, 'x')
    plt.show()
def detectBW(fpath: str, frame_size: float, hop_size: float, floor_db: float,
             oversample_f: int):
    """Estimate the cut-off bin of a wav file from its spectra and plot it.

    Each sufficiently energetic frame votes (weighted by its energy) for the
    highest spectral region that still holds content, on a 129-bin
    histogram.  The histogram and the average magnitude spectrum are handed
    to ``compute_mean_fc``; the result is printed and plotted.

    Args:
        fpath: path of the audio file; its extension must be ".wav".
        frame_size: analysis frame size, multiplied by ``oversample_f``.
        hop_size: hop between consecutive frames.
        floor_db: not used by this function.
        oversample_f: oversampling factor; must satisfy ``is_power2``.

    Raises:
        ValueError: if the file is not a ".wav" or the oversample factor is
            rejected by ``is_power2``.
    """
    _, extension = os.path.splitext(fpath)
    if extension != ".wav":
        raise ValueError("file must be wav")
    if not is_power2(oversample_f):
        raise ValueError("oversample factor can only be 1, 2 or 4")

    # Only the first two loader outputs (samples, sample rate) are needed.
    loaded = estd.AudioLoader(filename=fpath)()[:2]
    audio, SR = loaded

    # Anything that is not single-column is downmixed from its first two
    # channels.
    if audio.shape[1] != 1:
        audio = (audio[:, 0] + audio[:, 1]) / 2

    frame_size = frame_size * oversample_f

    fc_index_arr = []
    hist = np.zeros(129)
    fft = estd.FFT(size=frame_size)
    window = estd.Windowing(size=frame_size, type="hann")
    avg_frames = np.zeros(int(frame_size / 2) + 1)

    def frames():
        return estd.FrameGenerator(audio, frameSize=frame_size,
                                   hopSize=hop_size, startFromZero=True)

    # First pass: energy of the loudest frame, used as the gating reference.
    max_nrg = max(sum(abs(fft(window(frame)))**2) for frame in frames())

    frame_total = 0
    for frame in frames():
        frame_total += 1
        frame_fft = abs(fft(window(frame)))
        nrg = sum(frame_fft**2)

        # Only frames within a factor 0.1 of the loudest frame may vote.
        if nrg >= 0.1 * max_nrg:
            # Walk down from the top bin until the tail holds content.
            # NOTE(review): at j == 0 the expression divides by zero.
            for j in range(len(frame_fft) - 1, -1, -1):
                if sum(frame_fft[j:] / j) >= 1e-5:
                    bucket = int(j / frame_size * 128)
                    fc_index_arr.append(bucket)
                    hist[bucket] += nrg
                    break

        avg_frames = avg_frames + frame_fft

    # No frame voted: fall back to the last histogram bin.
    if not fc_index_arr:
        fc_index_arr.append(128)
        hist[128] += 1

    avg_frames /= frame_total

    most_likely_bin, conf, binary = compute_mean_fc(avg_frames, fc_index_arr,
                                                    [], SR, hist=hist)
    # Scale the histogram bin index by int(frame_size / 128).
    most_likely_bin *= int(frame_size / 128)

    print("f={:0=2f}, conf={:0=2f}, problem={}".format(
        most_likely_bin * SR / frame_size, conf, str(binary)))

    fig, ax = plt.subplots(2, 1, figsize=(15, 9))
    spectrum_axis, hist_axis = ax[0], ax[1]
    spectrum_axis.plot(20 * np.log10(avg_frames + eps))
    spectrum_axis.axvline(x=most_likely_bin, color='r')
    spectrum_axis.set_ylim(bottom=-120)
    hist_axis.stem(hist)
    plt.show()
def main(fpath: str, frame_size: float, hop_size: float, entropy_th: float):
    """Estimate the signal-to-noise ratio of a wav file and plot the analysis.

    Every frame gets a normalised energy and a normalised autocorrelation
    ratio (first lag over the sum of all lags).  A frame counts as noise when
    its energy is below 0.1 or its autocorrelation ratio reaches 0.6, and as
    signal otherwise.  The SNR is computed from the mean squared normalised
    energies of both groups, printed, and the waveform, autocorrelation
    ratios and energies are plotted.

    Args:
        fpath: path of the audio file; its extension must be ".wav".
        frame_size: frame size handed to the frame generator.
        hop_size: hop size handed to the frame generator.
        entropy_th: not used by this function.

    Raises:
        ValueError: if the file is not a ".wav", or if the audio yields no
            frames at all.
    """
    if os.path.splitext(fpath)[1] != ".wav":
        raise ValueError("file must be wav")

    # Only the samples (first loader output) are needed.
    audio = estd.AudioLoader(filename=fpath)()[0]

    # Flatten column-major, i.e. the channels are placed one after another.
    # Done unconditionally so a single-column input also becomes 1-D.
    audio = np.reshape(audio, -1, order='F').astype("float32")

    # Normalise by the maximum sample; skip it for a silent file, where the
    # division would turn every sample into NaN/inf.
    peak = audio.max() if audio.size else 0
    if peak != 0:
        audio = audio / peak
    audio = esarr(audio.astype("float16"))

    ac_th = 0.6
    ac_arr = []
    frame_nrg = []
    for frame in estd.FrameGenerator(audio, frameSize=frame_size,
                                     hopSize=hop_size, startFromZero=True):
        ac = abs(autocorr(frame, mode="half"))
        ac_total = sum(ac)
        ac_arr.append(ac[0] / ac_total if ac_total > 0 else 0)
        frame_nrg.append(sum(frame**2))

    if not frame_nrg:
        raise ValueError("audio produced no frames to analyse")

    max_nrg = max(frame_nrg)
    nrg_arr = [nrg / max_nrg if max_nrg > 0 else 0 for nrg in frame_nrg]

    # Explicit array conversion: dividing a plain list only worked when the
    # maximum happened to be a numpy scalar.  An all-zero array is left
    # untouched instead of being divided by zero.
    ac_arr = np.asarray(ac_arr)
    ac_peak = ac_arr.max()
    if ac_peak > 0:
        ac_arr = ac_arr / ac_peak

    sig_pwr = 0
    noise_pwr = 0
    sig_cnt = 0
    noise_cnt = 0
    for nrg, ac in zip(nrg_arr, ac_arr):
        if nrg < 0.1:
            noise_pwr += nrg**2
            noise_cnt += 1
        elif ac < ac_th:
            sig_pwr += nrg**2
            sig_cnt += 1
        else:
            noise_pwr += nrg**2
            noise_cnt += 1

    if noise_cnt == 0:
        SNR = np.inf
    elif sig_cnt == 0:
        SNR = 10 * np.log10(eps)
    else:
        sig_pwr /= sig_cnt
        noise_pwr /= noise_cnt
        SNR = 10 * np.log10(sig_pwr / noise_pwr)

    print("SNR: ", SNR)
    print("sig: {}, noise: {}".format(sig_cnt, noise_cnt))
    print("conf: ", 1 - abs(noise_cnt - sig_cnt) / (sig_cnt + noise_cnt))

    _, ax = plt.subplots(3, 1, figsize=(15, 9))
    ax[0].plot(audio)
    ax[1].plot(ac_arr)
    ax[2].plot(nrg_arr)
    plt.show()