def getFbankFeatures(wavFile, destFile, median, variance, numFrames):
    """Extract features for one file and fold them into running statistics.

    ``HCopy`` is run with ``./fbank-48.conf`` to write ``destFile + '.htk'``.
    The loaded matrix is additionally saved as ``destFile + '.npy'``.  For
    each of the first ``nfilt`` rows (``nfilt`` is a module-level name) the
    row sum is added to ``median[i]`` and the sum of squares to
    ``variance[i]``; both containers are updated in place.  Despite their
    names, the two accumulators receive plain sums.

    Returns:
        ``(median, variance, numFrames)``, where ``numFrames`` has been
        increased by the length of the first row of the feature matrix.
    """
    htk_path = destFile + '.htk'
    HCopy('./fbank-48.conf', wavFile, htk_path)

    reader = HTKFile()
    reader.load(htk_path)
    fbank = np.asarray(reader.data)
    np.save(destFile + '.npy', fbank)

    # Augmented assignment is kept so array-like counters are updated in place.
    numFrames += np.size(fbank[0, :])

    for row in range(nfilt):
        values = fbank[row, :].flatten('F')
        median[row] += np.sum(values)
        variance[row] += np.sum(values * values)

    return median, variance, numFrames
def predict_vcm(model, input, mean_var):
    """Classify one HTK feature file with ``model``.

    Args:
        model: callable applied to the normalised feature tensor.
        input: path of the HTK feature file to load.
        mean_var: path of a pickle holding a mapping with ``'mean'`` and
            ``'var'`` entries used for normalisation.

    Returns:
        ``(label, confidence)`` where ``label`` is one of ``'NCS'``,
        ``'CNS'``, ``'CRY'``, ``'OTH'`` chosen by the arg-max of the model
        output, and ``confidence`` is the maximum output value.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted
    # normalisation files here.
    assert os.path.exists(mean_var)
    with open(mean_var, 'rb') as handle:
        stats = pickle.load(handle)
    mean, var = stats['mean'], stats['var']

    reader = HTKFile()
    reader.load(input)
    # Features are shifted by 'mean' and divided by 'var' exactly as stored.
    normalised = (np.array(reader.data) - mean) / var

    tensor = Variable(torch.from_numpy(normalised.astype('float32')))  # .cuda()
    scores = model(tensor).data.data.cpu().numpy()

    confidence = scores.max()  # posterior probability
    class_names = ['NCS', 'CNS', 'CRY', 'OTH']
    label = class_names[np.argmax(scores)]
    return label, confidence
def code_data_to_MFCC(filepath, outputFilePath, configPath, filenameList=None):
    """Code ``.wav`` files to MFCC with HTK and dump each result as CSV.

    A script file ``codetr.scp`` is written into ``filepath`` with one
    ``"<input>.wav <output>.mfc"`` line per file, ``HTK.HCopy`` is run on it
    with ``configPath``, and every produced ``.mfc`` file is loaded and saved
    as a comma-separated ``<name>.out`` file in ``outputFilePath``.

    Args:
        filepath: directory containing the ``.wav`` files (no trailing slash).
        outputFilePath: directory receiving the ``.out`` files.
        configPath: HCopy configuration passed through to ``HTK.HCopy``.
        filenameList: optional iterable of ``.wav`` file names; when ``None``
            the names come from ``find_all_files(filepath, ".wav")``.

    Raises:
        AttributeError: if a name does not contain ``.wav`` (the regex search
            returns ``None``), as before.
    """
    filepath = filepath + "/"
    outputFilePath = outputFilePath + "/"
    # `is None` also works when a numpy array is passed as the list.
    if filenameList is None:
        filenameList = find_all_files(filepath, ".wav")

    script_path = filepath + "codetr.scp"
    allOutputFiles = []
    # The context manager closes (and flushes) the script before HCopy reads
    # it, even when a file name fails to match.
    with open(script_path, "w+") as script_file:
        for filename in sorted(filenameList):
            stem = filepath + re.search(r"(.+)\.wav", filename).group(1)
            inputFileName = stem + ".wav"
            outputFileName = stem + ".mfc"
            allOutputFiles.append(outputFileName)
            script_file.write(inputFileName + " " + outputFileName + "\n")

    HTK.HCopy(configPath, script_path)

    htk_reader = HTKFile()
    for filename in sorted(allOutputFiles):
        htk_reader.load(filename)
        result = numpy.array(htk_reader.data)
        base_name = re.search(r".+/(.+)\.mfc", filename).group(1)
        numpy.savetxt(outputFilePath + base_name + ".out", result,
                      delimiter=",")
def _fit_frames_to_expected(frames, target_shape):
    """Return ``frames`` repeated or folded to ``target_shape[0]`` rows.

    Inputs that already have the expected number of rows are returned
    unchanged.  Shorter inputs are padded with copies of their own rows;
    longer inputs are folded back onto the first ``target_shape[0]`` rows.
    """
    n_target = target_shape[0]
    n_frames = frames.shape[0]
    if n_frames == n_target:
        return frames

    if n_frames < n_target:
        missing = n_target - n_frames
        if missing <= n_target / 2:
            # `<=` (was `<`): an input of exactly half the expected length
            # used to match neither branch and was replaced by zeros.
            return np.vstack(
                (frames, frames[range(n_frames - missing, n_frames)]))
        # Less than half the expected length: tile whole copies, then top up
        # with the trailing rows.
        count = int(np.floor(n_target / n_frames))
        remaining = n_target - n_frames * count
        tiled = np.vstack([frames] * count)
        return np.vstack(
            (tiled, frames[range(n_frames - remaining, n_frames)]))

    fitted = np.zeros(target_shape)
    excess = n_frames - n_target
    if excess <= n_target / 2:
        # Average the head with the tail, copy the rest of the head verbatim.
        fitted[range(0, excess + 1), :] = np.mean(
            np.array([
                frames[range(0, excess + 1), :],
                frames[range(n_frames - excess - 1, n_frames), :]
            ]),
            axis=0)
        fitted[range(excess + 1, n_target), :] = frames[
            range(excess + 1, n_target)]
    else:
        count = int(np.floor(n_frames / n_target))
        remaining = n_frames - n_target * count
        # NOTE(review): the division sits inside the loop, so earlier chunks
        # are scaled down repeatedly; kept unchanged to preserve the features.
        for index in range(0, count):
            fitted[range(0, n_target), :] = np.sum(
                [
                    fitted,
                    frames[range(index * n_target, (index + 1) * n_target)]
                ],
                axis=0) / count
        fitted[range(0, remaining), :] = np.mean(
            np.array([
                frames[range(n_frames - remaining, n_frames), :],
                fitted[range(0, remaining), :]
            ]),
            axis=0)
    return fitted


def datatest_generator(filelistpath, batch_size=32, shuffle=False):
    """Yield batches of test features forever, cycling over the file list.

    Reads the module-level settings ``features`` (``'h5'`` or ``'mfc'``),
    ``SPECTPATH``, ``LABELPATH``, ``spect``, ``expected_shape`` and
    ``domain_adaptation`` (plus the ``transform_for_*`` matrices when domain
    adaptation is on).

    Args:
        filelistpath: text file with one file id per line.
        batch_size: number of items per yielded batch.
        shuffle: reshuffle the file list each time it wraps around.

    Yields:
        ``[spect_batch]`` with ``spect_batch`` of shape
        ``(batch_size, spect.shape[0], spect.shape[1], 1)``.
    """
    batch_index = 0
    image_index = -1
    with open(filelistpath, 'r') as filelist:
        filenames = filelist.readlines()

    dataset = ['Chernobyl.csv', 'PolandNFC.csv', 'warblrb10k-eval.csv']
    # NOTE(review): the labels are parsed but never used below; the read is
    # kept so that missing label files are still reported.
    labels_dict = {}
    for label_file in dataset:
        with open(LABELPATH + label_file, 'r') as handle:
            labels_list = csv.reader(handle)
            next(labels_list)  # skip the header row
            for k, r, v in labels_list:
                labels_dict[r + '/' + k] = v

    while True:
        image_index = (image_index + 1) % len(filenames)
        # Reshuffle whenever the list wraps around.
        if shuffle and image_index == 0:
            random.shuffle(filenames)
        file_id = filenames[image_index].rstrip()

        if batch_index == 0:
            # Start a fresh batch buffer.
            spect_batch = np.zeros(
                [batch_size, spect.shape[0], spect.shape[1], 1])

        if features == 'h5':
            # [:-4] drops the 4-character suffix of the evaluation file ids.
            with h5py.File(SPECTPATH + file_id[:-4] + '.h5', 'r') as hf:
                imagedata = np.array(hf.get('features'))
            # Map intensities from [-15.0966, 2.25745] to [0, 1].
            imagedata = (imagedata + 15.0966) / (15.0966 + 2.25745)
        elif features == 'mfc':
            htk_reader = HTKFile()
            htk_reader.load(SPECTPATH + file_id[:-8] + '.mfc')
            imagedata = np.array(htk_reader.data)
            imagedata = imagedata / 17.0

        # Some files do not have the expected number of frames.
        imagedata = _fit_frames_to_expected(imagedata, expected_shape)

        if domain_adaptation == True:
            filedataset = file_id[:file_id.rfind('/')]
            if filedataset == 'BirdVox-DCASE-20k':
                imagedata = np.matmul(imagedata, transform_for_birdvox)
                # min: 3.4020782, max: 6.9419036
                imagedata = (imagedata - 3.4) / (6.95 - 3.4)
            elif filedataset == 'ff1010bird':
                imagedata = np.matmul(imagedata, transform_for_ff1010bird)
                # min: 1.4374458, max: 7.363845
                imagedata = (imagedata - 1.4) / (7.37 - 1.4)
            elif filedataset == 'Chernobyl':
                imagedata = np.matmul(imagedata, transform_for_chern)
                # min: 3.7511292, max: 7.00125
                imagedata = (imagedata - 3.75) / (7 - 3.75)
            elif filedataset == 'PolandNFC':
                imagedata = np.matmul(imagedata, transform_for_poland)
                # min: -10.796116, max: 7.4045897
                imagedata = (imagedata + 10.8) / (10.8 + 7.40)

        imagedata = np.reshape(
            imagedata, (1, imagedata.shape[0], imagedata.shape[1], 1))
        spect_batch[batch_index, :, :, :] = imagedata
        batch_index += 1

        if batch_index >= batch_size:
            batch_index = 0
            inputs = [spect_batch]
            yield inputs
def _fit_frames_to_expected(frames, target_shape):
    """Return ``frames`` repeated or folded to ``target_shape[0]`` rows.

    Inputs that already have the expected number of rows are returned
    unchanged.  Shorter inputs are padded with copies of their own rows;
    longer inputs are folded back onto the first ``target_shape[0]`` rows.
    """
    n_target = target_shape[0]
    n_frames = frames.shape[0]
    if n_frames == n_target:
        return frames

    if n_frames < n_target:
        missing = n_target - n_frames
        if missing <= n_target / 2:
            # `<=` (was `<`): an input of exactly half the expected length
            # used to match neither branch and was replaced by zeros.
            return np.vstack(
                (frames, frames[range(n_frames - missing, n_frames)]))
        # Less than half the expected length: tile whole copies, then top up
        # with the trailing rows.
        count = int(np.floor(n_target / n_frames))
        remaining = n_target - n_frames * count
        tiled = np.vstack([frames] * count)
        return np.vstack(
            (tiled, frames[range(n_frames - remaining, n_frames)]))

    fitted = np.zeros(target_shape)
    excess = n_frames - n_target
    if excess <= n_target / 2:
        # Average the head with the tail, copy the rest of the head verbatim.
        fitted[range(0, excess + 1), :] = np.mean(
            np.array([
                frames[range(0, excess + 1), :],
                frames[range(n_frames - excess - 1, n_frames), :]
            ]),
            axis=0)
        fitted[range(excess + 1, n_target), :] = frames[
            range(excess + 1, n_target)]
    else:
        count = int(np.floor(n_frames / n_target))
        remaining = n_frames - n_target * count
        # NOTE(review): the division sits inside the loop, so earlier chunks
        # are scaled down repeatedly; kept unchanged to preserve the features.
        for index in range(0, count):
            fitted[range(0, n_target), :] = np.sum(
                [
                    fitted,
                    frames[range(index * n_target, (index + 1) * n_target)]
                ],
                axis=0) / count
        fitted[range(0, remaining), :] = np.mean(
            np.array([
                frames[range(n_frames - remaining, n_frames), :],
                fitted[range(0, remaining), :]
            ]),
            axis=0)
    return fitted


def data_generator(filelistpath, batch_size=16, shuffle=False):
    """Yield augmented training batches forever, cycling over the file list.

    Every file contributes itself plus ``AUGMENT_SIZE - 1`` samples drawn
    from ``datagen.flow``.  Reads the module-level settings ``features``
    (``'h5'`` or ``'mfc'``), ``SPECTPATH``, ``LABELPATH``, ``spect``,
    ``expected_shape``, ``datagen`` and ``AUGMENT_SIZE``.

    Args:
        filelistpath: text file with one file id per line.
        batch_size: rows per yielded batch.  The batch fills in steps of
            ``AUGMENT_SIZE``, so it presumably has to be a multiple of it;
            otherwise the write index runs past the buffer.
        shuffle: shuffle the file list at start and on every wrap-around.

    Yields:
        ``([aug_spect_batch], [aug_label_batch])``.

    Raises:
        KeyError: if a file id has no entry in the label CSV files.
    """
    batch_index = 0
    image_index = -1
    with open(filelistpath, 'r') as filelist:
        filenames = filelist.readlines()
    if shuffle:
        random.shuffle(filenames)

    dataset = ['BirdVox-DCASE-20k.csv', 'ff1010bird.csv', 'warblrb10k.csv']
    labels_dict = {}
    for label_file in dataset:
        with open(LABELPATH + label_file, 'r') as handle:
            labels_list = csv.reader(handle)
            next(labels_list)  # skip the header row
            for k, r, v in labels_list:
                labels_dict[r + '/' + k + '.wav'] = v

    while True:
        image_index = (image_index + 1) % len(filenames)
        # Reshuffle whenever the list wraps around.
        if shuffle and image_index == 0:
            random.shuffle(filenames)
        file_id = filenames[image_index].rstrip()

        if batch_index == 0:
            # Start fresh single-item and augmented batch buffers.
            spect_batch = np.zeros([1, spect.shape[0], spect.shape[1], 1])
            label_batch = np.zeros([1, 1])
            aug_spect_batch = np.zeros(
                [batch_size, spect.shape[0], spect.shape[1], 1])
            aug_label_batch = np.zeros([batch_size, 1])

        if features == 'h5':
            with h5py.File(SPECTPATH + file_id + '.h5', 'r') as hf:
                imagedata = np.array(hf.get('features'))
            # Map intensities from [-15.0966, 2.25745] to [0, 1].
            imagedata = (imagedata + 15.0966) / (15.0966 + 2.25745)
        elif features == 'mfc':
            htk_reader = HTKFile()
            htk_reader.load(SPECTPATH + file_id[:-4] + '.mfc')
            imagedata = np.array(htk_reader.data)
            imagedata = imagedata / 17.0

        # Some files do not have the expected number of frames.
        imagedata = _fit_frames_to_expected(imagedata, expected_shape)

        imagedata = np.reshape(
            imagedata, (1, imagedata.shape[0], imagedata.shape[1], 1))
        spect_batch[0, :, :, :] = imagedata
        label_batch[0, :] = labels_dict[file_id]

        gen_img = datagen.flow(imagedata,
                               label_batch[0, :],
                               batch_size=1,
                               shuffle=False,
                               save_to_dir=None)
        # The unmodified sample goes first, followed by its augmentations.
        aug_spect_batch[batch_index, :, :, :] = imagedata
        aug_label_batch[batch_index, :] = label_batch[0, :]
        batch_index += 1
        for _ in range(AUGMENT_SIZE - 1):
            aug_spect_batch[batch_index, :, :, :], aug_label_batch[
                batch_index, :] = gen_img.next()
            batch_index += 1

        if batch_index >= batch_size:
            batch_index = 0
            inputs = [aug_spect_batch]
            outputs = [aug_label_batch]
            yield inputs, outputs
def main(session_key, config_file, segment_size, step_size):
    """Extract features for an uploaded recording and write 2-D embeddings.

    The first non-hidden file in ``static/uploads/<session_key>/`` is used
    (mp3 input is converted to wav).  Features come either from a scipy
    spectrogram reduced to 39 dimensions (``config_file == "spectrogram"``)
    or from the external extractor ``smilextract`` run with the given config
    file.  Ten consecutive feature vectors are concatenated per data point,
    which is then embedded with t-SNE, PCA, SOM, UMAP and Isomap and
    clustered with K-means.  Results are written to
    ``static/data/<session_key>/data.csv`` and ``metadata.csv``.

    Args:
        session_key: name of the upload/output sub-directory.
        config_file: ``"spectrogram"`` or a file name below ``config_dir``.
        segment_size: segment length; presumably milliseconds, since it is
            divided by 10000 to get one tenth of a segment in seconds.
        step_size: step length in the same unit as ``segment_size``.

    Raises:
        FileNotFoundError: if the upload directory has no non-hidden file.
    """
    # Pick the first non-hidden file of the upload directory.
    audio_dir = "static/uploads/" + session_key + "/"
    audio_name = next(
        (name for name in os.listdir(audio_dir) if not name.startswith(".")),
        None)
    if audio_name is None:
        raise FileNotFoundError("No audio file found in " + audio_dir)
    audio_path = audio_dir + audio_name

    # If mp3, convert to wav and continue with the converted file.
    if audio_path[-3:] == "mp3":
        wav_audio = AudioSegment.from_mp3(audio_path)
        audio_path = audio_path[:-3] + "wav"
        wav_audio.export(audio_path, format="wav")

    loaded_sound = AudioSegment.from_wav(audio_path)
    # Compared against seconds * 1000 below, i.e. treated as milliseconds.
    audio_duration = len(loaded_sound)
    frame_rate = loaded_sound.frame_rate

    # Recordings longer than one hour are cut into one-hour chunks with sox.
    if audio_duration > 3600000:
        chunks = []
        chunk_start_time = 0
        while chunk_start_time * 1000 < audio_duration:
            chunk_path = (audio_dir
                          + str(int((chunk_start_time / 3600) + 1)) + ".wav")
            subprocess.call(["sox", audio_path, chunk_path, "trim",
                             str(chunk_start_time), "3600"])
            chunks.append(chunk_path)
            chunk_start_time += 3600
    else:
        chunks = [audio_path]

    # Create the output directory; an existing directory is fine.
    output_dir = "static/data/" + session_key + "/"
    os.makedirs(output_dir, exist_ok=True)
    output_path = output_dir + audio_name.split(".")[0] + ".mfcc.htk"

    if config_file == "spectrogram":
        waveform = wavfile.read(audio_path)[1]
        print(frame_rate)
        print(segment_size)
        print(int(frame_rate*segment_size))
        _, _, Sxx = signal.spectrogram(
            waveform, fs=frame_rate,
            nperseg=int(frame_rate*(segment_size/10000)), noverlap=0)
        Sxx_transpose = Sxx.transpose()
        print("scipy shape: ", Sxx_transpose.shape)
        # Reduce dimensionality to 39 with SVD.
        svd = TruncatedSVD(n_components=39)
        result = svd.fit_transform(Sxx_transpose)
        print("scipy shape2: ", result.shape)
    else:
        config_file = config_dir + config_file
        # Frame and step lengths are written as one tenth of the segment
        # and step sizes.
        update_config(config_file, str(segment_size/10000),
                      str(step_size/10000))
        subprocess.call([smilextract, "-C", config_file, "-I", audio_path,
                         "-O", output_path])
        htk_reader = HTKFile()
        htk_reader.load(output_path)
        result = np.array(htk_reader.data)

    # Concatenate ten vectors at a time; an incomplete trailing group is
    # dropped.
    group = 10
    result = np.array([
        np.concatenate(tuple(result[start:start + group]), axis=0)
        for start in range(0, len(result) - group + 1, group)
    ])

    tsne = TSNE(n_components=2, perplexity=25)
    Y1 = convert_range(tsne.fit_transform(result))
    print("t-SNE done")

    pca = PCA(n_components=2)
    Y2 = convert_range(pca.fit_transform(result))
    print("PCA done")

    # Square SOM grid with side sqrt(number of points), capped at 100.
    side = int(len(result)**0.5)
    print("SOM-grid-size: ", side)
    mapsize = [side, side]
    if mapsize[0] > 100:
        mapsize = [100, 100]
    som_map = sompy.SOMFactory.build(
        result, mapsize, mask=None, mapshape='planar', lattice='rect',
        normalization='var', initialization='pca', neighborhood='gaussian',
        training='batch', name='sompy')
    som_map.train(n_job=1, verbose='info')
    # Convert each best-matching-unit index into grid coordinates.
    som_output = np.array([
        np.array(np.unravel_index(int(bmu), (mapsize[0], mapsize[0])))
        for bmu in som_map._bmu[0]
    ])
    Y3 = convert_range(som_output)
    print("SOM done")

    Y4 = convert_range(umap.UMAP().fit_transform(result))
    print("UMAP done")

    isomap = Isomap(n_components=2)
    Y5 = convert_range(isomap.fit_transform(result))
    print("Isomap done")

    # K-means on the raw features for several cluster counts.
    cluster_labels = []
    for n_clusters in (2, 3, 4, 5, 6, 7, 8, 20):
        fitted = KMeans(n_clusters=n_clusters, random_state=0).fit(result)
        cluster_labels.append((n_clusters, fitted.labels_))
        print("kmeans" + str(n_clusters) + " done")

    # One output row per data point; key order defines the CSV columns.
    data = []
    embeddings = zip(Y1, Y2, Y3, Y4, Y5)
    for i, (tsne_xy, pca_xy, som_xy, umap_xy, iso_xy) in enumerate(embeddings):
        row = {
            "id": i,
            "tsneX": float(tsne_xy[0]),
            "tsneY": float(tsne_xy[1]),
            "pcaX": float(pca_xy[0]),
            "pcaY": float(pca_xy[1]),
            "somX": float(som_xy[0]),
            "somY": float(som_xy[1]),
            "umapX": float(umap_xy[0]),
            "umapY": float(umap_xy[1]),
            # The "ae" columns carry the Isomap coordinates.
            "aeX": float(iso_xy[0]),
            "aeY": float(iso_xy[1]),
            "start": int(i*step_size),
            "active": 1,
            "color": "black",
        }
        for n_clusters, labels in cluster_labels:
            row["kcolor" + str(n_clusters)] = color_dict[str(labels[i])]
        data.append(row)

    # newline='' is what the csv module requires to avoid blank rows on
    # platforms that translate line endings.
    keys = data[0].keys()
    with open(output_dir + "data.csv", 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

    metadata = [{
        "audio_duration": audio_duration,
        "audio_path": audio_path,
        "segment_size": segment_size,
        "step_size": step_size,
        "chunks": ",".join(chunks),
    }]
    keys = metadata[0].keys()
    with open(output_dir + "metadata.csv", 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(metadata)
def retrain(valid_points, session_key, old_session_key, segment_size, step_size):
    """Recompute the embeddings for a selected subset of feature rows.

    The HTK feature file of ``old_session_key`` is copied into the new
    session's data directory and loaded.  Only rows whose index appears in
    ``valid_points`` are embedded (t-SNE, PCA, SOM, UMAP) and clustered with
    K-means.  Results are written to ``static/data/<session_key>/data.csv``
    and ``metadata.csv``.

    Args:
        valid_points: sequence whose first entry is skipped (presumably a
            header); every other entry provides ``(index, start, color)`` in
            its first three positions.
        session_key: name of the new upload/output sub-directory.
        old_session_key: session whose feature file is reused.
        segment_size: stored in the metadata only.
        step_size: stored in the metadata only.

    Raises:
        FileNotFoundError: if the upload directory has no non-hidden file.
    """
    # Pick the first non-hidden file of the upload directory.
    audio_dir = "static/uploads/" + session_key + "/"
    audio_name = next(
        (name for name in os.listdir(audio_dir) if not name.startswith(".")),
        None)
    if audio_name is None:
        raise FileNotFoundError("No audio file found in " + audio_dir)
    audio_path = audio_dir + audio_name

    # If mp3, convert to wav and continue with the converted file.
    if audio_path[-3:] == "mp3":
        wav_audio = AudioSegment.from_mp3(audio_path)
        audio_path = audio_path[:-3] + "wav"
        wav_audio.export(audio_path, format="wav")

    audio_duration = len(AudioSegment.from_wav(audio_path))

    # Create the output directory; an existing directory is fine.
    output_dir = "static/data/" + session_key + "/"
    os.makedirs(output_dir, exist_ok=True)

    # Reuse the features computed for the old session.
    htk_name = audio_name.split(".")[0] + ".mfcc.htk"
    path_to_old_htk = "static/data/" + old_session_key + "/" + htk_name
    path_to_new_htk = "static/data/" + session_key + "/" + htk_name
    subprocess.call(["cp", path_to_old_htk, path_to_new_htk])

    htk_reader = HTKFile()
    htk_reader.load(path_to_old_htk)
    result = np.array(htk_reader.data)

    selected = valid_points[1:]
    start_times = [point[1] for point in selected]
    colors = [point[2] for point in selected]
    # A set keeps the membership test O(1) per feature row.
    # NOTE(review): rows are kept in file order while start_times/colors
    # follow valid_points order; confirm the caller passes them sorted.
    valid_indexes = {point[0] for point in selected}
    new_result = np.array(
        [line for i, line in enumerate(result) if i in valid_indexes])

    tsne = TSNE(n_components=2, perplexity=25)
    Y1 = convert_range(tsne.fit_transform(new_result))
    print("t-SNE done")

    pca = PCA(n_components=2)
    Y2 = convert_range(pca.fit_transform(new_result))
    print("PCA done")

    # Square SOM grid with side sqrt(number of points), capped at 100.
    side = int(len(new_result)**0.5)
    print("SOM-grid-size: ", side)
    mapsize = [side, side]
    if mapsize[0] > 100:
        mapsize = [100, 100]
    som_map = sompy.SOMFactory.build(
        new_result, mapsize, mask=None, mapshape='planar', lattice='rect',
        normalization='var', initialization='pca', neighborhood='gaussian',
        training='batch', name='sompy')
    som_map.train(n_job=1, verbose='info')
    # Convert each best-matching-unit index into grid coordinates.
    som_output = np.array([
        np.array(np.unravel_index(int(bmu), (mapsize[0], mapsize[0])))
        for bmu in som_map._bmu[0]
    ])
    Y3 = convert_range(som_output)
    print("SOM done")

    Y4 = convert_range(umap.UMAP().fit_transform(new_result))
    print("UMAP done")

    # The autoencoder is disabled; its columns get random placeholders.
    Y5 = convert_range(np.array([
        np.array([random.randint(-50, 50), random.randint(-50, 50)])
        for _ in range(len(Y2))
    ]))
    print("Autoencoder done")

    # K-means on the raw features for several cluster counts.
    cluster_labels = []
    for n_clusters in (2, 3, 4, 5, 6, 7, 8):
        fitted = KMeans(n_clusters=n_clusters, random_state=0).fit(new_result)
        cluster_labels.append((n_clusters, fitted.labels_))
        print("kmeans" + str(n_clusters) + " done")

    # One output row per data point; key order defines the CSV columns.
    data = []
    points = zip(Y1, Y2, Y3, Y4, Y5, start_times, colors)
    for i, point in enumerate(points):
        tsne_xy, pca_xy, som_xy, umap_xy, ae_xy, start_time, color = point
        row = {
            "id": i,
            "tsneX": float(tsne_xy[0]),
            "tsneY": float(tsne_xy[1]),
            "pcaX": float(pca_xy[0]),
            "pcaY": float(pca_xy[1]),
            "somX": float(som_xy[0]),
            "somY": float(som_xy[1]),
            "umapX": float(umap_xy[0]),
            "umapY": float(umap_xy[1]),
            "aeX": float(ae_xy[0]),
            "aeY": float(ae_xy[1]),
            "start": start_time,
            "active": 1,
            "color": color,
        }
        for n_clusters, labels in cluster_labels:
            row["kcolor" + str(n_clusters)] = color_dict[str(labels[i])]
        data.append(row)

    # newline='' is what the csv module requires to avoid blank rows on
    # platforms that translate line endings.
    keys = data[0].keys()
    with open(output_dir + "data.csv", 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

    metadata = [{
        "audio_duration": audio_duration,
        "audio_path": audio_path,
        "segment_size": segment_size,
        "step_size": step_size,
    }]
    keys = metadata[0].keys()
    with open(output_dir + "metadata.csv", 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(metadata)
def _load_network_features(kind, root, file_id, target_shape):
    """Load, scale and reshape the features of one file for one network.

    ``kind`` selects the storage format: ``'h5'`` reads the ``'features'``
    dataset of ``root + file_id[:-4] + '.h5'``; ``'mfc'`` reads the HTK file
    ``root + file_id[:-8] + '.mfc'``.  The result is passed through
    ``correct_dimensions`` and returned with shape ``(1, rows, cols, 1)``.

    Raises:
        ValueError: for any other ``kind``.
    """
    if kind == 'h5':
        with h5py.File(root + file_id[:-4] + '.h5', 'r') as hf:
            data = np.array(hf.get('features'))
        data = (data + 15.0966) / (15.0966 + 2.25745)
    elif kind == 'mfc':
        htk_reader = HTKFile()
        htk_reader.load(root + file_id[:-8] + '.mfc')
        data = np.array(htk_reader.data) / 18.0
    else:
        # Previously this surfaced as an unbound-variable error.
        raise ValueError("Unsupported feature format: {!r}".format(kind))
    data = correct_dimensions(data, target_shape)
    return np.reshape(data, (1, data.shape[0], data.shape[1], 1))


def dataval_generator(filelistpath, batch_size=32, shuffle=False):
    """Yield validation batches for three networks forever.

    Reads the module-level settings ``dataset``, ``LABELPATH`` and, per
    network ``N`` in 1..3, ``featuresN``, ``SPECTPATHN``, ``spectN`` and
    ``expected_shapeN``.

    Args:
        filelistpath: sequence whose first element is the path of a text
            file with one file id per line.
        batch_size: number of items per yielded batch.
        shuffle: reshuffle the file list each time it wraps around.

    Yields:
        ``[batch1, batch2, batch3]``, one array per network, each of shape
        ``(batch_size, spectN.shape[0], spectN.shape[1], 1)``.
    """
    batch_index = 0
    image_index = -1
    with open(filelistpath[0], 'r') as filelist:
        filenames = filelist.readlines()

    # NOTE(review): the labels are parsed but never used below; the read is
    # kept so that missing label files are still reported.
    labels_dict = {}
    for label_file in dataset:
        with open(LABELPATH + label_file, 'r') as handle:
            labels_list = csv.reader(handle)
            next(labels_list)  # skip the header row
            for k, r, v in labels_list:
                labels_dict[r + '/' + k + '.wav'] = v

    while True:
        image_index = (image_index + 1) % len(filenames)
        # Reshuffle whenever the list wraps around.
        if shuffle and image_index == 0:
            random.shuffle(filenames)
        file_id = filenames[image_index].rstrip()

        if batch_index == 0:
            # Start fresh batch buffers, one per network.
            spect_batch1 = np.zeros(
                [batch_size, spect1.shape[0], spect1.shape[1], 1])
            spect_batch2 = np.zeros(
                [batch_size, spect2.shape[0], spect2.shape[1], 1])
            spect_batch3 = np.zeros(
                [batch_size, spect3.shape[0], spect3.shape[1], 1])

        spect_batch1[batch_index, :, :, :] = _load_network_features(
            features1, SPECTPATH1, file_id, expected_shape1)
        spect_batch2[batch_index, :, :, :] = _load_network_features(
            features2, SPECTPATH2, file_id, expected_shape2)
        spect_batch3[batch_index, :, :, :] = _load_network_features(
            features3, SPECTPATH3, file_id, expected_shape3)

        batch_index += 1
        if batch_index >= batch_size:
            batch_index = 0
            inp = [spect_batch1, spect_batch2, spect_batch3]
            yield inp
def _fit_frames_to_expected(frames, target_shape):
    """Return ``frames`` repeated or folded to ``target_shape[0]`` rows.

    Inputs that already have the expected number of rows are returned
    unchanged.  Shorter inputs are padded with copies of their own rows;
    longer inputs are folded back onto the first ``target_shape[0]`` rows.
    """
    n_target = target_shape[0]
    n_frames = frames.shape[0]
    if n_frames == n_target:
        return frames

    if n_frames < n_target:
        missing = n_target - n_frames
        if missing <= n_target / 2:
            # `<=` (was `<`): an input of exactly half the expected length
            # used to match neither branch and was replaced by zeros.
            return np.vstack(
                (frames, frames[range(n_frames - missing, n_frames)]))
        # Less than half the expected length: tile whole copies, then top up
        # with the trailing rows.
        count = int(np.floor(n_target / n_frames))
        remaining = n_target - n_frames * count
        tiled = np.vstack([frames] * count)
        return np.vstack(
            (tiled, frames[range(n_frames - remaining, n_frames)]))

    fitted = np.zeros(target_shape)
    excess = n_frames - n_target
    if excess <= n_target / 2:
        # Average the head with the tail, copy the rest of the head verbatim.
        fitted[range(0, excess + 1), :] = np.mean(
            np.array([
                frames[range(0, excess + 1), :],
                frames[range(n_frames - excess - 1, n_frames), :]
            ]),
            axis=0)
        fitted[range(excess + 1, n_target), :] = frames[
            range(excess + 1, n_target)]
    else:
        count = int(np.floor(n_frames / n_target))
        remaining = n_frames - n_target * count
        # NOTE(review): the division sits inside the loop, so earlier chunks
        # are scaled down repeatedly; kept unchanged to preserve the features.
        for index in range(0, count):
            fitted[range(0, n_target), :] = np.sum(
                [
                    fitted,
                    frames[range(index * n_target, (index + 1) * n_target)]
                ],
                axis=0) / count
        fitted[range(0, remaining), :] = np.mean(
            np.array([
                frames[range(n_frames - remaining, n_frames), :],
                fitted[range(0, remaining), :]
            ]),
            axis=0)
    return fitted


def datatest_generator(filelistpath, batch_size=32, shuffle=False):
    """Yield batches of test features forever, cycling over the file list.

    Reads the module-level settings ``features`` (``'h5'``, ``'npy'`` or
    ``'mfc'``), ``SPECTPATH``, ``LABELPATH``, ``dataset``, ``spect``,
    ``expected_shape`` and, for ``'npy'``, ``min_value`` / ``max_value``.

    Args:
        filelistpath: text file with one file id per line.
        batch_size: number of items per yielded batch.
        shuffle: reshuffle the file list each time it wraps around.

    Yields:
        ``[spect_batch]`` with ``spect_batch`` of shape
        ``(batch_size, spect.shape[0], spect.shape[1], 1)``.
    """
    batch_index = 0
    image_index = -1
    with open(filelistpath, 'r') as filelist:
        filenames = filelist.readlines()

    # Read the labels into a dict.
    # NOTE(review): the labels are parsed but never used below; the read is
    # kept so that missing label files are still reported.
    labels_dict = {}
    for label_file in dataset:
        with open(LABELPATH + label_file, 'r') as handle:
            labels_list = csv.reader(handle)
            next(labels_list)  # skip the header row
            for k, r, v in labels_list:
                labels_dict[r + '/' + k] = v

    while True:
        image_index = (image_index + 1) % len(filenames)
        # Reshuffle whenever the list wraps around.
        if shuffle and image_index == 0:
            random.shuffle(filenames)
        file_id = filenames[image_index].rstrip()

        if batch_index == 0:
            # Start a fresh batch buffer.
            spect_batch = np.zeros(
                [batch_size, spect.shape[0], spect.shape[1], 1])

        # Load the features in the selected format.
        if features == 'h5':
            with h5py.File(SPECTPATH + file_id + '.h5', 'r') as hf:
                imagedata = np.array(hf.get('features'))
            imagedata = (imagedata + 15.0966) / (15.0966 + 2.25745)
        elif features == 'npy':
            imagedata = np.load(SPECTPATH + file_id + '.npy')
            # Min-max scaling is skipped when either bound is 0.
            if max_value != 0 and min_value != 0:
                imagedata = (imagedata - min_value) / (max_value - min_value)
        elif features == 'mfc':
            htk_reader = HTKFile()
            htk_reader.load(SPECTPATH + file_id[:-8] + '.mfc')
            imagedata = np.array(htk_reader.data)
            imagedata = imagedata / 17.0

        # Some files do not have the expected number of frames.
        imagedata = _fit_frames_to_expected(imagedata, expected_shape)

        imagedata = np.reshape(
            imagedata, (1, imagedata.shape[0], imagedata.shape[1], 1))
        spect_batch[batch_index, :, :, :] = imagedata
        batch_index += 1

        # Hand out the batch once it is full.
        if batch_index >= batch_size:
            batch_index = 0
            inputs = [spect_batch]
            yield inputs
win_shift=80) #here we load the raw audio file sig = mfcc.load_raw_signal('file.raw') #here we calculate the MFCC+energy, deltas and acceleration coefficients feat = mfcc.get_feats(sig) delta = mfcc.get_delta(feat, 2) acc = mfcc.get_delta(delta, 2) #here we merge the MFCCs and deltas together to get 39 features feat = np.hstack((feat, delta, acc)) #here we use HTK to calculate the same thing #you can comment this line if you don't have HTK installed HCopy('hcopy8k.conf', 'file.raw', 'file8k.htk') #here we load the features generate by the command above htk = HTKFile() htk.load('file8k.htk') #calculating the difference between features diff = feat - htk.data #computing and dsiplaying the maximum difference between the two methods print("Maximum difference: {}".format(np.max(np.abs(diff)))) #displaying the difference P.pcolormesh(diff.T) P.savefig('diff.png')
def data_generator(filelistpath, batch_size=16, shuffle=False):
    """Yield augmented training batches forever, cycling over the file list.

    Every file contributes itself plus ``AUGMENT_SIZE - 1`` samples drawn
    from ``datagen.flow``.  Reads the module-level settings ``features``
    (``'h5'`` or ``'mfc'``), ``SPECTPATH``, ``LABELPATH``, ``spect``,
    ``datagen`` and ``AUGMENT_SIZE``.

    Args:
        filelistpath: text file with one file id per line.
        batch_size: rows per yielded batch.  The batch fills in steps of
            ``AUGMENT_SIZE``, so it presumably has to be a multiple of it;
            otherwise the write index runs past the buffer.
        shuffle: shuffle the file list at start and on every wrap-around.

    Yields:
        ``([aug_spect_batch], [aug_label_batch])``.

    Raises:
        KeyError: if a file id has no entry in the label CSV files.
    """
    batch_index = 0
    image_index = -1
    with open(filelistpath, 'r') as filelist:
        filenames = filelist.readlines()
    if shuffle:
        random.shuffle(filenames)

    dataset = ['BirdVox-DCASE-20k.csv', 'ff1010bird.csv', 'warblrb10k.csv']
    labels_dict = {}
    for label_file in dataset:
        with open(LABELPATH + label_file, 'r') as handle:
            labels_list = csv.reader(handle)
            next(labels_list)  # skip the header row
            for k, r, v in labels_list:
                labels_dict[r + '/' + k + '.wav'] = v

    while True:
        image_index = (image_index + 1) % len(filenames)
        # Reshuffle whenever the list wraps around.
        if shuffle and image_index == 0:
            random.shuffle(filenames)
        file_id = filenames[image_index].rstrip()

        if batch_index == 0:
            # Start fresh single-item and augmented batch buffers.
            spect_batch = np.zeros([1, spect.shape[0], spect.shape[1], 1])
            label_batch = np.zeros([1, 1])
            aug_spect_batch = np.zeros(
                [batch_size, spect.shape[0], spect.shape[1], 1])
            aug_label_batch = np.zeros([batch_size, 1])

        if features == 'h5':
            with h5py.File(SPECTPATH + file_id + '.h5', 'r') as hf:
                imagedata = np.array(hf.get('features'))
            # Map intensities from [-15.0966, 2.25745] to [0, 1].
            imagedata = (imagedata + 15.0966) / (15.0966 + 2.25745)
        elif features == 'mfc':
            htk_reader = HTKFile()
            htk_reader.load(SPECTPATH + file_id[:-4] + '.mfc')
            imagedata = np.array(htk_reader.data)
            imagedata = imagedata / 17.0

        imagedata = np.reshape(
            imagedata, (1, imagedata.shape[0], imagedata.shape[1], 1))
        spect_batch[0, :, :, :] = imagedata
        label_batch[0, :] = labels_dict[file_id]

        gen_img = datagen.flow(imagedata,
                               label_batch[0, :],
                               batch_size=1,
                               shuffle=False,
                               save_to_dir=None)
        # The unmodified sample goes first, followed by its augmentations.
        aug_spect_batch[batch_index, :, :, :] = imagedata
        aug_label_batch[batch_index, :] = label_batch[0, :]
        batch_index += 1
        for _ in range(AUGMENT_SIZE - 1):
            aug_spect_batch[batch_index, :, :, :], aug_label_batch[
                batch_index, :] = gen_img.next()
            batch_index += 1

        if batch_index >= batch_size:
            batch_index = 0
            inputs = [aug_spect_batch]
            outputs = [aug_label_batch]
            yield inputs, outputs
#setting up the main class mfcc=MFCC_HTK(filter_file='filter.csv') #here we load the raw audio file sig = mfcc.load_raw_signal('file.raw') #here we calculate the MFCC+energy, deltas and acceleration coefficients feat = mfcc.get_feats(sig) delta = mfcc.get_delta(feat,2) acc = mfcc.get_delta(delta,2) #here we merge the MFCCs and deltas together to get 39 features feat = np.hstack((feat,delta,acc)) #here we use HTK to calculate the same thing #you can comment this line if you don't have HTK installed HCopy('hcopy.conf','file.raw','file.htk') #here we load the features generate by the command above htk=HTKFile() htk.load('file.htk') #calculating the difference between features diff=feat-htk.data #computing and dsiplaying the maximum difference between the two methods print("Maximum difference: {}".format(np.max(np.abs(diff)))) #displaying the difference P.pcolormesh(diff.T) P.savefig('diff.png')
import numpy as np import os from HTK import HTKFile from keras.models import load_model from keras.preprocessing.sequence import pad_sequences # same as the labels in answer.mlf label_list = ["ling", "yi", "er", "san", "si", "wu", "liu", "qi", "ba", "jiu"] labels = {} MFCC = HTKFile() # load testing data test_x = [] input_lengths = [] filelist = os.listdir(os.getcwd() + "/MFCC/testing") filelist.sort() for filename in filelist: MFCC.load("MFCC/testing/" + filename) test_x.append(np.array(MFCC.data)) input_lengths.append(len(np.array(MFCC.data))) test_x = pad_sequences(test_x, dtype='float', padding='post') input_lengths = np.array(input_lengths) test_model = load_model("model.h5") pred = test_model.predict(test_x) # decode CTC and output answer with open("result/result_nn.mlf", "w") as f: f.write("#!MLF!#\n") for i in range(len(filelist)):
"liN": 0, "#i": 1, "#er": 2, "san": 3, "sy": 4, "#u": 5, "liou": 6, "qi": 7, "ba": 8, "jiou": 9, "blank": 10 } # load labels labels = {} MFCC = HTKFile() with open("labels/Clean08TR.mlf") as f: f.readline() key = None value = [] for line in f: if line[0] == '\"': key = line[4:-6] elif line[0] == '.': labels[key] = value value = [] elif line != "sil\n": value.append(label_map[line[:-1]]) # load training data train_x = []