def predict_test_labels(classifier, std_scale, pca_std):
    """Predict knuckle/pad labels for the test data and write them to FINAL_PREDICTIONS.

    Walks the timestamped sample directories under TEST_DATA_DIR, builds a
    feature vector per sample from its touch.csv and audio.wav, projects it
    into the training feature space, and records one [timestamp, label] row
    per sample in the output csv.

    :param classifier: classifier trained on the training data
    :param std_scale: fitted scaler applied to training features (has .transform)
    :param pca_std: fitted PCA applied to the scaled training features (has .transform)
    """
    top_level_test_dirs = [
        d for d in os.listdir(TEST_DATA_DIR) if not d.startswith('.')
    ]

    predictions_list = [['timestamp', 'label']]

    # Walk each "<mode>-..." directory; every timestamped subdirectory is one
    # sample holding a touch.csv and an audio.wav.
    for dir_name in top_level_test_dirs:
        mode = dir_name.split('-')[0]
        timestamped_dirs = [
            d for d in os.listdir(os.path.join(TEST_DATA_DIR, dir_name))
            if not d.startswith('.')
        ]
        for folder_name in timestamped_dirs:
            # The folder name itself is the sample's timestamp.
            timestamp = folder_name
            audio_file = os.path.join(TEST_DATA_DIR, dir_name, folder_name, 'audio.wav')
            touch_file = os.path.join(TEST_DATA_DIR, dir_name, folder_name, 'touch.csv')
            touch_features = pd.read_csv(touch_file, sep=',', skiprows=1, header=None)
            (fs, frame) = sciw.read(audio_file, mmap=False)

            # Build the test feature vector and project it into the same space
            # the classifier was trained in: scale first, then PCA.
            feature_matrix = create_feature_vector(touch_features, frame, fs, mode)
            test_feature = pca_std.transform(std_scale.transform(feature_matrix))

            # Map the numeric prediction (+1 -> knuckle, otherwise pad) to its
            # string label. Indexing [0] takes the scalar prediction for the
            # single sample instead of comparing a whole array against an array.
            label = 'knuckle' if classifier.predict(test_feature)[0] == 1 else 'pad'
            predictions_list.append([timestamp, label])

    # Mode 'w' truncates any existing file, so no pre-delete is needed.
    with open(FINAL_PREDICTIONS, 'w') as pred_fh:
        writer = csv.writer(pred_fh, delimiter=',')
        writer.writerows(predictions_list)
def prepare_training_data():
    """Extract training features and labels and write them to csv files.

    Walks the timestamped sample directories under TRAIN_DATA_DIR, reads each
    sample's touch.csv and audio.wav, builds a feature matrix via
    create_feature_vector, and appends the features to TRAIN_FEATURES_FILE and
    the numeric label (+1 for 'knuckle', -1 otherwise) to TRAIN_LABELS_FILE.

    :return: None; all output is written to the two csv files.
    """
    top_level_train_dirs = [
        d for d in os.listdir(TRAIN_DATA_DIR) if not d.startswith('.')
    ]

    # Remove stale output files *independently*. The original shared try-block
    # stopped at the first OSError, so a missing features file left an old
    # labels file behind — which the append mode below then silently extended,
    # desynchronizing features from labels.
    for stale in (TRAIN_FEATURES_FILE, TRAIN_LABELS_FILE):
        try:
            os.remove(stale)
        except OSError:
            pass

    # Write the header row for the feature matrix csv.
    # NOTE(review): 'wb' is the Python 2 csv idiom; under Python 3 this should
    # be open(..., 'w', newline='') — confirm the target interpreter.
    with open(TRAIN_FEATURES_FILE, 'wb') as write_header:
        writer = csv.writer(write_header)
        writer.writerow(['x', 'y', 'major', 'minor', 'pressure', 'orientation',
                         'signal energy', 'signal energy entropy',
                         'spectral centroid', 'spectral spread',
                         'spectral entropy', 'spectral roll off', 'mode'])

    # Walk the train directory, extracting features from the files and the
    # label from the directory names.
    for dir_name in top_level_train_dirs:
        mode = dir_name.split('-')[0]
        timestamped_dirs = [
            d for d in os.listdir(os.path.join(TRAIN_DATA_DIR, dir_name))
            if not d.startswith('.')
        ]
        for folder_name in timestamped_dirs:
            # Sample folder name encodes "<timestamp>-<label>"; only the
            # label is needed here.
            label = folder_name.split('-')[1]
            audio_file = os.path.join(TRAIN_DATA_DIR, dir_name, folder_name, 'audio.wav')
            touch_file = os.path.join(TRAIN_DATA_DIR, dir_name, folder_name, 'touch.csv')
            touch_features = pd.read_csv(touch_file, sep=',', skiprows=1, header=None)
            (fs, frame) = sciw.read(audio_file, mmap=False)
            feature_matrix = create_feature_vector(touch_features, frame, fs, mode)
            # Append this sample's features and its +1/-1 label row.
            with open(TRAIN_FEATURES_FILE, 'a') as f_handle:
                np.savetxt(f_handle, feature_matrix, delimiter=',')
            with open(TRAIN_LABELS_FILE, 'a') as f_handle:
                np.savetxt(f_handle, [1] if label == 'knuckle' else [-1], delimiter=',')
# Driver script: extract features and exercise the ML pipeline in
# feature_extraction with and without PCA preprocessing/reduction.

# ASSUMING feature_extraction.py is in same directory
import feature_extraction as fe

# IMPORT other necessities
from sklearn import linear_model
from sklearn import tree
from sklearn import svm

# GET entire dataset
data = fe.extract_from_tsv()

# GET feature vectors with supplied extraction functions
extraction_functions = [fe.basic_numerical_feature_extractor,
                        fe.filter_selection,
                        fe.filter_rarity]
x_data, y_data = fe.create_feature_vector(data, extractor_funcs=extraction_functions)

# VISUALIZE pca graph of dataset
fe.apply_machine_learning_algorithm(x_data, y_data, graph_pca=True)

# TOGGLE pca preprocessing
# Single-argument print(...) calls behave identically on Python 2 and 3;
# the original bare "print expr" statements were Python-2-only syntax.
print("|======================================|")
print("Printing w/ PCA PREPROCESS toggled off and on:")
print(fe.apply_machine_learning_algorithm(x_data, y_data))
print(fe.apply_machine_learning_algorithm(x_data, y_data, pca_preprocess=True))
print("|======================================|")

# INPUT pca dimensionality reduction
print("|======================================|")
print("Printing w/ PCA REDUCTION at normal and reduced dimension 2:")
print(fe.apply_machine_learning_algorithm(x_data, y_data))