def p_predict_cnn_thread(audio_path, name_list):
    # pre-allocate the feature tensor for this thread's workload
    X = np.empty(shape=(len(name_list), N_MFCC, AUDIO_LENGTH, 1))
    # traverse the name list and process this thread's workload
    for i, fname in enumerate(name_list):
        # log a progress message after every 250 files
        if i % 250 == 0:
            utils.write_log_msg("FEATURE_CNN_PREDICT - {0}...".format(i))
        # read the sound file
        sound_clip, _ = librosa.load(audio_path + fname, sr=SAMPLE_RATE, res_type='kaiser_fast')
        # random offset / padding to a fixed input length
        if len(sound_clip) > INPUT_LENGTH:
            max_offset = len(sound_clip) - INPUT_LENGTH
            offset = np.random.randint(max_offset)
            sound_clip = sound_clip[offset:(INPUT_LENGTH + offset)]
        else:
            if INPUT_LENGTH > len(sound_clip):
                max_offset = INPUT_LENGTH - len(sound_clip)
                offset = np.random.randint(max_offset)
            else:
                offset = 0
            sound_clip = np.pad(sound_clip, (offset, INPUT_LENGTH - len(sound_clip) - offset), "constant")
        # extract MFCC features and add a channel dimension for the CNN
        mfcc = librosa.feature.mfcc(sound_clip, sr=SAMPLE_RATE, n_mfcc=N_MFCC)
        mfcc = np.expand_dims(mfcc, axis=-1)
        X[i] = mfcc
    # return the extracted features to the calling program
    return X
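# The crop/pad step above is the least obvious part of the pipeline, so here is a
# minimal, self-contained sketch of the same logic on toy data. The helper name
# and the example values are illustrative only and not part of the pipeline.
def _demo_fix_length(sound_clip, input_length, rng=np.random):
    if len(sound_clip) > input_length:
        # longer than the target: take a random contiguous window of input_length samples
        offset = rng.randint(len(sound_clip) - input_length)
        return sound_clip[offset:offset + input_length]
    # shorter than (or equal to) the target: zero-pad, placing the clip at a random offset
    offset = rng.randint(input_length - len(sound_clip)) if input_length > len(sound_clip) else 0
    return np.pad(sound_clip, (offset, input_length - len(sound_clip) - offset), "constant")

# e.g. _demo_fix_length(np.ones(5), 8) -> a length-8 array with the five 1s at a random offset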
def run_mnn(dataset):
    # dataset == 0 => full dataset, dataset == 1 => confusion-matrix subset
    train_csv = TRAIN_CSV if dataset == 0 else TRAIN_CONF_CSV
    # print a log message for status update
    utils.write_log_msg("creating data dictionary...")
    # create a dictionary from the provided train.csv file
    dictionary = utils.create_dictionary(train_csv)
    # print a log message for status update
    utils.write_log_msg("extracting features of training data...")
    # call the feature extraction module to get audio features
    tr_mnn_features, tr_mnn_labels = features.parse_audio_files_train(TRAIN_AUDIO_PATH, train_csv, dictionary, 0)
    # print a log message for status update
    utils.write_log_msg("extracting features of prediction data...")
    # call the feature extraction module to get audio features
    if dataset == 0:
        ts_mnn_features, ts_mnn_name_list = features.parse_audio_files_predict(TEST_AUDIO_PATH, os.listdir(TEST_AUDIO_PATH), 0)
    else:
        test_csv = pd.read_csv(TEST_CONF_CSV)
        ts_mnn_features, ts_mnn_name_list = features.parse_audio_files_predict(TRAIN_AUDIO_PATH, test_csv["fname"].tolist(), 0)
    # print a log message for status update
    utils.write_log_msg("starting multi-layer neural network training...")
    # use the extracted features to train the model
    mnn_y_pred, mnn_probs, mnn_pred = train.tensor_multilayer_neural_network(
        tr_mnn_features, tr_mnn_labels, ts_mnn_features, len(dictionary), training_epochs=500)
    # accumulate each class's probability mass, then keep the top 3 classes per file
    ensembled_output = np.zeros_like(mnn_probs)
    for row, columns in enumerate(mnn_pred):
        for i, column in enumerate(columns):
            ensembled_output[row, column] += mnn_probs[row, i]
    top3 = ensembled_output.argsort()[:, -3:][:, ::-1]
    # build a reverse lookup (index -> label) once instead of scanning the dictionary per file
    inv_dictionary = {v: k for k, v in dictionary.items()}
    # write the predicted results to a csv file
    with open(OUTPUT_CSV, "w") as file_:
        file_.write("fname,label\n")
        for i, value in enumerate(top3):
            if dataset == 0:
                file_.write("%s,%s %s %s\n" % (ts_mnn_name_list[i],
                                               inv_dictionary[value[0]],
                                               inv_dictionary[value[1]],
                                               inv_dictionary[value[2]]))
            else:
                file_.write("%s,%s\n" % (ts_mnn_name_list[i], inv_dictionary[value[0]]))
        if dataset == 0:
            # hardcoded predictions for three test files that are not processed above
            file_.write("0b0427e2.wav,Harmonica\n6ea0099f.wav,Harmonica\nb39975f5.wav,Harmonica")
    # print a log message for status update
    utils.write_log_msg("done...")
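# A quick sanity check for the top-3 decoding used above, on made-up probabilities.
# The array values and the index -> label map here are illustrative, not project data.
def _demo_top3_decode():
    probs = np.array([[0.10, 0.65, 0.05, 0.20]])           # one file, four classes
    inv = {0: "Bark", 1: "Meow", 2: "Hiss", 3: "Chirp"}    # hypothetical index -> label map
    top3 = probs.argsort()[:, -3:][:, ::-1]                # indices sorted by descending probability
    return [" ".join(inv[c] for c in row) for row in top3]  # -> ["Meow Chirp Bark"]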
def parse_audio_files_train(audio_path, train_csv_path, label_dictionary, nn_type, file_ext="*.wav"):
    # initialize the output arrays according to the network type
    labels = np.empty(0)
    if nn_type == 0:
        features = np.empty((0, FEATURE_SIZE))
    else:
        features = np.empty(shape=(0, N_MFCC, AUDIO_LENGTH, 1))
    # read the train csv with pandas and split it into chunks of CHUNK_SIZE files each
    data = pd.read_csv(train_csv_path, chunksize=CHUNK_SIZE)
    # create a thread pool to process the workload
    thread_pool = []
    # each chunk is the amount of data that will be processed by a single thread
    for chunk in data:
        if nn_type == 0:
            thread_pool.append(utils.ThreadWithReturnValue(target=p_train_thread,
                                                           args=(audio_path, label_dictionary, chunk)))
        else:
            thread_pool.append(utils.ThreadWithReturnValue(target=p_train_cnn_thread,
                                                           args=(audio_path, label_dictionary, chunk)))
    # print a log message for status update
    utils.write_log_msg("TRAIN: creating a total of {0} threads...".format(len(thread_pool)))
    # start the entire thread pool
    for single_thread in thread_pool:
        single_thread.start()
    # wait for the threads to return their results
    for single_thread in thread_pool:
        ft, lbl = single_thread.join()
        features = np.vstack([features, ft])
        labels = np.append(labels, lbl)
    # np.int is deprecated; plain int keeps the same behavior
    labels = np.array(labels, dtype=int)
    # normalize the features (z-score per feature dimension)
    mean = np.mean(features, axis=0)
    std = np.std(features, axis=0)
    features = (features - mean) / std
    # return the extracted features to the calling program
    return features, labels
def p_train_thread(audio_path, label_dictionary, data):
    # initialize variables
    features, labels = np.empty((0, FEATURE_SIZE)), np.empty(0)
    # process this thread's share of the workload
    for i in range(data.shape[0]):
        # log a progress message after every 250 files
        if i % 250 == 0:
            utils.write_log_msg("FEATURE_TRAIN - {0}...".format(i))
        line = data.iloc[i]
        fn = audio_path + line["fname"]
        # extract the five feature groups and stack them into a single row vector
        mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
        ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
        features = np.vstack([features, ext_features])
        labels = np.append(labels, label_dictionary[line["label"]])
    # return the extracted features to the calling program
    return features, labels
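# extract_feature is referenced here but defined elsewhere in the repo. For context,
# a sketch of the classic five-group librosa extractor this code appears to follow
# (the actual implementation and parameter choices may differ):
def extract_feature_sketch(file_name):
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    stft = np.abs(librosa.stft(X))
    # each group is averaged over time, yielding one fixed-size vector per clip
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz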
def p_predict_thread(audio_path, name_list):
    # initialize variables
    features = np.empty((0, FEATURE_SIZE))
    # traverse the name list and process this thread's workload
    for fname in name_list:
        # extract the five feature groups and stack them into a single row vector
        # (the original code also called librosa.load here, decoding every file twice;
        # extract_feature loads the audio itself, so that call was redundant)
        mfccs, chroma, mel, contrast, tonnetz = extract_feature(audio_path + fname)
        ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
        features = np.vstack([features, ext_features])
        # log a progress message after every 250 files
        if len(features) % 250 == 0:
            utils.write_log_msg("FEATURE_PREDICT - {0}...".format(len(features)))
    # return the extracted features to the calling program
    return features
def parse_audio_files_predict(audio_path, name_list, nn_type, file_ext="*.wav"):
    # initialize the feature array according to the network type
    if nn_type == 0:
        features = np.empty((0, FEATURE_SIZE))
    else:
        features = np.empty(shape=(0, N_MFCC, AUDIO_LENGTH, 1))
    # create a thread pool to process the workload
    thread_pool = []
    # split the filename list into chunks of CHUNK_SIZE files each
    data = utils.generate_chunks(name_list, CHUNK_SIZE)
    # each chunk is the amount of data that will be processed by a single thread
    for chunk in data:
        if nn_type == 0:
            thread_pool.append(utils.ThreadWithReturnValue(target=p_predict_thread,
                                                           args=(audio_path, chunk)))
        else:
            thread_pool.append(utils.ThreadWithReturnValue(target=p_predict_cnn_thread,
                                                           args=(audio_path, chunk)))
    # print a log message for status update
    utils.write_log_msg("PREDICT: creating a total of {0} threads...".format(len(thread_pool)))
    # start the entire thread pool
    for single_thread in thread_pool:
        single_thread.start()
    # wait for the threads to return their results
    for single_thread in thread_pool:
        ft = single_thread.join()
        features = np.vstack([features, ft])
    # normalize the features (z-score, using statistics of the prediction set itself)
    mean = np.mean(features, axis=0)
    std = np.std(features, axis=0)
    features = (features - mean) / std
    # return the extracted features to the calling program
    return features, name_list
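# utils.ThreadWithReturnValue is used throughout but not defined in this file. A
# minimal sketch of such a class, assuming it follows the common "Thread whose
# join() returns the target's result" recipe (the actual utils implementation may differ).
# Likewise, utils.generate_chunks presumably yields successive CHUNK_SIZE slices, e.g.
# (name_list[i:i + CHUNK_SIZE] for i in range(0, len(name_list), CHUNK_SIZE)).
from threading import Thread

class ThreadWithReturnValueSketch(Thread):
    def __init__(self, target=None, args=()):
        Thread.__init__(self, target=target, args=args)
        self._return = None

    def run(self):
        # capture the target's return value instead of discarding it
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        Thread.join(self, timeout)
        return self._return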
def p_train_cnn_thread(audio_path, label_dictionary, data):
    # initialize variables
    labels = np.empty(0)
    X = np.empty(shape=(data.shape[0], N_MFCC, AUDIO_LENGTH, 1))
    # process this thread's share of the workload
    for i in range(data.shape[0]):
        # log a progress message after every 250 files
        if i % 250 == 0:
            utils.write_log_msg("FEATURE_CNN_TRAIN - {0}...".format(i))
        line = data.iloc[i]
        fn = audio_path + line["fname"]
        sound_clip, _ = librosa.core.load(fn, sr=SAMPLE_RATE, res_type='kaiser_fast')
        # random offset / padding to a fixed input length (same logic as p_predict_cnn_thread)
        if len(sound_clip) > INPUT_LENGTH:
            max_offset = len(sound_clip) - INPUT_LENGTH
            offset = np.random.randint(max_offset)
            sound_clip = sound_clip[offset:(INPUT_LENGTH + offset)]
        else:
            if INPUT_LENGTH > len(sound_clip):
                max_offset = INPUT_LENGTH - len(sound_clip)
                offset = np.random.randint(max_offset)
            else:
                offset = 0
            sound_clip = np.pad(sound_clip, (offset, INPUT_LENGTH - len(sound_clip) - offset), "constant")
        # extract MFCC features and add a channel dimension for the CNN
        mfcc = librosa.feature.mfcc(sound_clip, sr=SAMPLE_RATE, n_mfcc=N_MFCC)
        mfcc = np.expand_dims(mfcc, axis=-1)
        X[i] = mfcc
        # populate the labels array
        labels = np.append(labels, label_dictionary[line["label"]])
    # return the extracted features to the calling program
    return X, labels
def main(_load=False):
    # initialize the log file for the current run of the code
    utils.initialize_log()
    # either parse the audio files from scratch (_load=True) or load pre-extracted features
    if _load:
        dictionary, tr_mnn_features, tr_mnn_labels, ts_mnn_features, ts_mnn_name_list, \
            tr_cnn_features, tr_cnn_labels, ts_cnn_features, ts_cnn_name_list = read_audio_files()
    else:
        dictionary, tr_mnn_features, tr_mnn_labels, ts_mnn_features, ts_mnn_name_list, \
            tr_cnn_features, tr_cnn_labels, ts_cnn_features, ts_cnn_name_list = features.read_features()
    # print a log message for status update
    utils.write_log_msg("starting multi-layer neural network training...")
    # use the extracted features to train the models and get the top-3 predictions
    predictions_top3 = train.train(tr_mnn_features, tr_mnn_labels, ts_mnn_features,
                                   tr_cnn_features, tr_cnn_labels, ts_cnn_features,
                                   n_classes=len(dictionary))
    # print a log message for status update
    utils.write_log_msg("outputting prediction results to a csv file...")
    # print the predicted results to a csv file
    utils.print_csv_file(predictions_top3, ts_mnn_name_list, dictionary, OUTPUT_CSV)
    # print a log message for status update
    utils.write_log_msg("done...")
def read_audio_files():
    # print a log message for status update
    utils.write_log_msg("creating data dictionary...")
    # create a dictionary from the provided train.csv file
    dictionary = utils.create_dictionary(TRAIN_CSV)
    # print a log message for status update
    utils.write_log_msg("extracting features of training data...")
    # extract training features for the mnn (nn_type=0) and the cnn (nn_type=1)
    tr_mnn_features, tr_mnn_labels = features.parse_audio_files_train(TRAIN_AUDIO_PATH, TRAIN_CSV, dictionary, 0)
    tr_cnn_features, tr_cnn_labels = features.parse_audio_files_train(TRAIN_AUDIO_PATH, TRAIN_CSV, dictionary, 1)
    utils.write_log_msg("processed {0} files of training data for mnn...".format(len(tr_mnn_features)))
    utils.write_log_msg("processed {0} files of training data for cnn...".format(len(tr_cnn_features)))
    # print a log message for status update
    utils.write_log_msg("extracting features of prediction data...")
    # extract prediction features for the mnn (nn_type=0) and the cnn (nn_type=1)
    ts_mnn_features, ts_mnn_name_list = features.parse_audio_files_predict(TEST_AUDIO_PATH, os.listdir(TEST_AUDIO_PATH), 0)
    ts_cnn_features, ts_cnn_name_list = features.parse_audio_files_predict(TEST_AUDIO_PATH, os.listdir(TEST_AUDIO_PATH), 1)
    utils.write_log_msg("processed {0} files of prediction data for mnn...".format(len(ts_mnn_features)))
    utils.write_log_msg("processed {0} files of prediction data for cnn...".format(len(ts_cnn_features)))
    # print a log message for status update
    utils.write_log_msg("storing features for future use...")
    # store the features so they can be reloaded without re-parsing the audio
    features.store_features(dictionary, tr_mnn_features, tr_mnn_labels, ts_mnn_features, ts_mnn_name_list,
                            tr_cnn_features, tr_cnn_labels, ts_cnn_features, ts_cnn_name_list)
    # return the results to the calling program
    return dictionary, tr_mnn_features, tr_mnn_labels, ts_mnn_features, ts_mnn_name_list, \
        tr_cnn_features, tr_cnn_labels, ts_cnn_features, ts_cnn_name_list
def main(): utils.write_log_msg("Run CNN code ...") run_cnn(2)
def main(): utils.write_log_msg("Run MNN code ...") #limitedCsv() #run_mnn(0) create_confmatrix()
def tensor_multilayer_neural_network(tr_features, tr_labels, ts_features, n_classes, training_epochs):
    # initialize the network dimensions
    n_dim = tr_features.shape[1]
    n_hidden_units_1 = 200  # 280
    n_hidden_units_2 = 250  # 300
    n_hidden_units_3 = 300  # 300
    sd = 1 / np.sqrt(n_dim)
    # one-hot encode the training labels
    tr_labels = to_categorical(tr_labels)
    X = tf.placeholder(tf.float32, [None, n_dim])
    Y = tf.placeholder(tf.float32, [None, n_classes])
    # exponentially decaying learning rate
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(0.005, global_step, 500, 0.95, staircase=True)
    # layer 1: tanh
    W_1 = tf.Variable(tf.random_normal([n_dim, n_hidden_units_1], mean=0, stddev=sd))
    b_1 = tf.Variable(tf.random_normal([n_hidden_units_1], mean=0, stddev=sd))
    h_1 = tf.nn.tanh(tf.matmul(X, W_1) + b_1)
    # layer 2: sigmoid
    W_2 = tf.Variable(tf.random_normal([n_hidden_units_1, n_hidden_units_2], mean=0, stddev=sd))
    b_2 = tf.Variable(tf.random_normal([n_hidden_units_2], mean=0, stddev=sd))
    h_2 = tf.nn.sigmoid(tf.matmul(h_1, W_2) + b_2)
    # layer 3: sigmoid
    W_3 = tf.Variable(tf.random_normal([n_hidden_units_2, n_hidden_units_3], mean=0, stddev=sd))
    b_3 = tf.Variable(tf.random_normal([n_hidden_units_3], mean=0, stddev=sd))
    h_3 = tf.nn.sigmoid(tf.matmul(h_2, W_3) + b_3)
    # output layer: softmax
    W = tf.Variable(tf.random_normal([n_hidden_units_3, n_classes], mean=0, stddev=sd))
    b = tf.Variable(tf.random_normal([n_classes], mean=0, stddev=sd))
    y_ = tf.nn.softmax(tf.matmul(h_3, W) + b)
    # cross-entropy cost; clip the softmax output so log(0) cannot produce NaNs
    cost_function = -tf.reduce_sum(Y * tf.log(tf.clip_by_value(y_, 1e-10, 1.0)))
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost_function, global_step=global_step)
    init = tf.global_variables_initializer()
    # start empty (np.empty(shape=[1]) would prepend one uninitialized value)
    cost_history = np.empty(shape=[0], dtype=float)
    y_pred = None
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            # print a log message for status update
            utils.write_log_msg("running the mnn training epoch {0}...".format(epoch + 1))
            # run one full-batch training step
            _, cost = sess.run([optimizer, cost_function], feed_dict={X: tr_features, Y: tr_labels})
            cost_history = np.append(cost_history, cost)
        # predict results based on the trained model
        y_pred = sess.run(tf.argmax(y_, 1), feed_dict={X: ts_features})
        y_k_probs, y_k_pred = sess.run(tf.nn.top_k(y_, k=n_classes), feed_dict={X: ts_features})
    # save the cost history so it can be plotted later
    df = pd.DataFrame(cost_history)
    df.to_csv("../data/cost_history_mnn.csv")
    # return the predicted values to the calling program
    return y_pred, y_k_probs, y_k_pred
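# A minimal smoke test for the training routine above, using random synthetic
# features so it runs without the audio data. The feature width 193 is an assumed
# FEATURE_SIZE (40 MFCC + 12 chroma + 128 mel + 7 contrast + 6 tonnetz); any width
# works, since n_dim is read from the input. Assumes the ../data directory used by
# the cost-history dump exists and a TF1-style tensorflow is installed.
def _demo_mnn_smoke_test():
    rng = np.random.RandomState(0)
    tr_features = rng.randn(32, 193).astype(np.float32)
    tr_labels = rng.randint(0, 4, size=32)   # 4 hypothetical classes
    ts_features = rng.randn(8, 193).astype(np.float32)
    y_pred, y_k_probs, y_k_pred = tensor_multilayer_neural_network(
        tr_features, tr_labels, ts_features, n_classes=4, training_epochs=2)
    # one predicted class per test row; full probability ranking over all 4 classes
    assert y_pred.shape == (8,) and y_k_probs.shape == (8, 4)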