def predict_test_labels(classifier, std_scale, pca_std):
    """
    Label every sample found under TEST_DATA_DIR and save the results to FINAL_PREDICTIONS.

    Each sample folder contributes one output row: the folder name (used as the timestamp) and
    'knuckle' when the classifier predicts 1, 'pad' otherwise. A 'timestamp,label' header row
    is written first.

    :param classifier: classifier trained on train data; only its predict() method is called
    :param std_scale: object whose transform() is applied to the raw feature matrix first
    :param pca_std: object whose transform() is applied to the scaled features
    :return: None; the predictions are written to FINAL_PREDICTIONS as comma-separated rows
    """

    mode_dirs = [entry for entry in os.listdir(TEST_DATA_DIR) if not entry.startswith('.')]

    # Header row first; one [timestamp, label] row is appended per sample
    rows = [['timestamp', 'label']]

    # Discard the output of an earlier run; an OSError (e.g. no such file) is ignored
    try:
        os.remove(FINAL_PREDICTIONS)
    except OSError:
        pass

    # Layout walked here: TEST_DATA_DIR/<mode>-.../<timestamp>/{touch.csv, audio.wav}.
    # Entries whose names start with '.' are skipped at both levels.
    for mode_dir in mode_dirs:
        mode = mode_dir.split('-')[0]
        visible_samples = [
            entry for entry in os.listdir(os.path.join(TEST_DATA_DIR, mode_dir))
            if not entry.startswith('.')
        ]
        for timestamp in visible_samples:
            sample_path = os.path.join(TEST_DATA_DIR, mode_dir, timestamp)

            # The first line of touch.csv is skipped and the rest is read without a header
            touch_features = pd.read_csv(os.path.join(sample_path, 'touch.csv'),
                                         sep=',', skiprows=1, header=None)
            (fs, frame) = sciw.read(os.path.join(sample_path, 'audio.wav'), mmap=False)

            # Build the raw features, then scale and project them with the supplied transformers
            raw_features = create_feature_vector(touch_features, frame, fs, mode)
            projected = pca_std.transform(std_scale.transform(raw_features))

            # Map the numeric prediction to its string label
            if classifier.predict(projected) == np.array([1]):
                label = 'knuckle'
            else:
                label = 'pad'
            rows.append([timestamp, label])

    # Write all collected rows in one go
    with open(FINAL_PREDICTIONS, 'w') as out_fh:
        csv.writer(out_fh, delimiter=',').writerows(rows)
def prepare_training_data():
    """
    Build the training feature and label files from the samples under TRAIN_DATA_DIR.

    Walks TRAIN_DATA_DIR two levels deep (top-level directory -> sample folder), skipping entries
    whose names start with '.'. For every sample folder it reads touch.csv and audio.wav, invokes
    create_feature_vector and appends the result to TRAIN_FEATURES_FILE, then appends the label
    (1 for 'knuckle', -1 for anything else) to TRAIN_LABELS_FILE.

    The mode is the part of the top-level directory name before the first '-'; the label is the
    second '-'-separated field of the sample folder name.

    :return: None; results are written to TRAIN_FEATURES_FILE and TRAIN_LABELS_FILE
    :raises IndexError: if a sample folder name contains no '-'
    """

    top_level_train_dirs = os.listdir(TRAIN_DATA_DIR)
    top_level_train_dirs = [x for x in top_level_train_dirs if not x.startswith('.')]

    # Remove prep data files if they exist. Each file is removed in its own try block: the labels
    # file is only ever opened in append mode below, so it has to be deleted even when the
    # features file is already missing, otherwise labels from a previous run would stay in front
    # of the new ones and no longer line up with the feature rows.
    for stale_file in (TRAIN_FEATURES_FILE, TRAIN_LABELS_FILE):
        try:
            os.remove(stale_file)
        except OSError:
            pass

    # Write header to the csv file that should contain feature matrix
    # NOTE(review): 'wb' is the mode the Python 2 csv module expects; under Python 3 csv.writer
    # needs a text-mode file instead - confirm the target interpreter before changing it.
    with open(TRAIN_FEATURES_FILE, 'wb') as write_header:
        writer = csv.writer(write_header)
        writer.writerow(['x', 'y', 'major', 'minor', 'pressure', 'orientation', 'signal energy',
                         'signal energy entropy', 'spectral centroid', 'spectral spread', 'spectral entropy',
                         'spectral roll off', 'mode'])

    # Walk through directories in train directory and extract features from files,
    # along with label from directory names
    for dir_name in top_level_train_dirs:
        mode = dir_name.split('-')[0]
        timestamped_dirs = os.listdir(os.path.join(TRAIN_DATA_DIR, dir_name))
        timestamped_dirs = [x for x in timestamped_dirs if not x.startswith('.')]
        for folder_name in timestamped_dirs:
            # Only the second '-'-separated field (the label) of the folder name is used
            label = folder_name.split('-')[1]

            sample_dir = os.path.join(TRAIN_DATA_DIR, dir_name, folder_name)
            # The first line of touch.csv is skipped and the rest is read without a header
            touch_features = pd.read_csv(os.path.join(sample_dir, 'touch.csv'),
                                         sep=',', skiprows=1, header=None)
            (fs, frame) = sciw.read(os.path.join(sample_dir, 'audio.wav'), mmap=False)
            feature_matrix = create_feature_vector(touch_features, frame, fs, mode)

            # Features and labels are appended sample by sample, in the same order
            with open(TRAIN_FEATURES_FILE, 'a') as f_handle:
                np.savetxt(f_handle, feature_matrix, delimiter=',')

            with open(TRAIN_LABELS_FILE, 'a') as f_handle:
                np.savetxt(f_handle, [1] if label == 'knuckle' else [-1], delimiter=',')
# Example no. 3
# 0
# ASSUMING feature_extraction.py is in same directory
import feature_extraction as fe

# IMPORT other necessities
from sklearn import linear_model
from sklearn import tree
from sklearn import svm

# GET entire dataset
data = fe.extract_from_tsv()

# GET feature vectors with supplied extraction functions
extraction_functions = [fe.basic_numerical_feature_extractor,
                        fe.filter_selection,
                        fe.filter_rarity]
x_data, y_data = fe.create_feature_vector(data, extractor_funcs=extraction_functions)

# VISUALIZE pca graph of dataset
fe.apply_machine_learning_algorithm(x_data, y_data, graph_pca=True)

# NOTE: every print below is a single-argument print(...) call, which emits the same output
# on Python 2 and Python 3 (the bare print statement is a syntax error on Python 3).

# TOGGLE pca preprocessing
print("|======================================|")
print("Printing w/ PCA PREPROCESS toggled off and on:")
print(fe.apply_machine_learning_algorithm(x_data, y_data))
print(fe.apply_machine_learning_algorithm(x_data, y_data, pca_preprocess=True))
print("|======================================|")

# INPUT pca dimensionality reduction
print("|======================================|")
print("Printing w/ PCA REDUCTION at normal and reduced dimension 2:")
print(fe.apply_machine_learning_algorithm(x_data, y_data))