Esempio n. 1
0
def get_all_handler():
    """Build one LabelHandler loaded with all four 2015-2016 label files.

    For each category name, the file ``<category>.txt`` under
    ``data/2015-2016/`` is passed to ``LabelHandler.read`` on the same
    handler instance, in the order listed below.

    Returns:
        The LabelHandler instance after every ``read`` call has run.
    """
    handler = LabelHandler()
    label_files = [
        os.path.join('data/2015-2016/', '{}.txt'.format(name))
        for name in ('Questionnaire', 'Demographics', 'Examination',
                     'Laboratory')
    ]
    for label_file in label_files:
        handler.read(label_file)
    return handler
Esempio n. 2
0
def get_2015_Questionnaire_data(target_data,
                                symbol_list=None,
                                csv='data_preprocess/Questionnaire.csv',
                                label='data/2015-2016/Questionnaire.txt'):
    """Load questionnaire features and align them with ``target_data``.

    Args:
        target_data: Target frame handed to ``align_data_with_target``
            together with the filtered features.
        symbol_list: Symbols forwarded to ``select_feature``. ``None``
            (the default) is treated as an empty list; when the list is
            empty the two cigarette columns are dropped as well.
        csv: Path of the CSV file read with ``pd.read_csv``.
        label: Path handed to ``LabelHandler``.

    Returns:
        A ``(train_data, target_data)`` tuple: the processed feature frame
        and the target frame as returned by ``align_data_with_target``.
    """
    # A fresh list per call: a literal ``[]`` default would be one shared
    # object that any callee could mutate for all later calls.
    if symbol_list is None:
        symbol_list = []

    raw_csv = pd.read_csv(csv)
    label_handler = LabelHandler(label)

    train_data = select_feature(raw_csv, symbol_list)
    # Remove features and rows with too much missing data
    train_data = filter_data(train_data)
    # Align train data and target data according to 'SEQN'
    train_data, target_data = align_data_with_target(train_data, target_data)

    # Replace feature names with meaningful contents, and remove unknowns
    contents, noresults = label_handler.symbols_to_contents(
        train_data.columns)
    train_data.columns = contents
    train_data = train_data.drop(noresults, axis=1)

    # Remove the cigarette features: they hold too many empty byte strings,
    # which the missing-data filter above does not catch.
    if not symbol_list:
        cigarette_feature = [
            'Cig 12-digit Universal Product Code-UPC',
            'Cigarette Brand/sub-brand'
        ]
        train_data = train_data.drop(cigarette_feature, axis=1)

    train_data = process_nan(train_data)
    return train_data, target_data
Esempio n. 3
0
def get_2015_sleep_data(
    target,
    csv='data_preprocess/Sleep.csv',
    label='data/2015-2016/Questionnaire.txt',
):
    """Load the sleep CSV and return all columns or one recoded target.

    Args:
        target: ``'all'`` to return every column, renamed through the
            label handler, with the columns at positions 1 and 2 passed
            through ``normalize_time``. Otherwise the symbol of a single
            column, which is returned together with ``'SEQN'``.
        csv: Path of the CSV file read with ``pd.read_csv``.
        label: Path handed to ``LabelHandler``.

    Returns:
        A DataFrame. For a single target the column is post-processed:
        ``SLQ300``/``SLQ310`` go through ``normalize_time``; ``SLQ030``,
        ``SLQ040``, ``SLQ050``, ``SLQ120`` and ``DPQ030`` have the listed
        special codes removed and are recoded to 0/1; any other target
        (e.g. ``SLD012``) is returned as filtered.
    """
    data = pd.read_csv(csv)
    label_handler = LabelHandler(label)
    # ``axis`` must be a keyword: passing it positionally to ``drop`` was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    data = data.drop('Unnamed: 0', axis=1)

    if target == 'all':
        contents, _ = label_handler.symbols_to_contents(data.columns)
        data.columns = contents

        # Convert time : e.g. b'23:00' -> -60
        for time_column in (contents[1], contents[2]):
            data[time_column] = normalize_time(data[time_column])
        return data

    data = data[['SEQN', target]]
    data = filter_data(data, 1, 0)

    # target -> (codes whose rows are removed,
    #            mask of values rewritten to 0,
    #            mask of values rewritten to 1)
    # Values matched by neither mask are left untouched.
    recode_rules = {
        'SLQ030': ((7, 9), lambda s: s < 2, lambda s: s >= 2),
        'SLQ040': ((7, 9), lambda s: s == 0, lambda s: s > 0),
        'SLQ050': ((9,), lambda s: s == 2, lambda s: s == 1),
        'SLQ120': ((9,), lambda s: s < 3, lambda s: s >= 3),
        'DPQ030': ((7, 9), lambda s: s == 0, lambda s: s > 0),
    }

    if target in ('SLQ300', 'SLQ310'):
        data[target] = normalize_time(data[target])

    elif target in recode_rules:
        drop_codes, zero_mask_of, one_mask_of = recode_rules[target]
        for code in drop_codes:
            data = data[data[target] != code]

        # Work on an explicit copy and write through ``.loc``: chained
        # assignment (``data[target][mask] = v``) on a filtered frame
        # triggers SettingWithCopyWarning and does nothing at all under
        # pandas copy-on-write.
        data = data.copy()
        # Both masks are taken from the values before recoding; the
        # written values (0 and 1) never satisfy the other rule's mask, so
        # this matches evaluating them one after the other.
        column = data[target]
        zero_mask = zero_mask_of(column)
        one_mask = one_mask_of(column)
        data.loc[zero_mask, target] = 0
        data.loc[one_mask, target] = 1

    return data
Esempio n. 4
0
def test_get_questionnaire():
    """Run the questionnaire pipeline on the SLQ050 target and print it.

    Exercises the label handler lookups, loads the ``SLQ050`` sleep target,
    builds the questionnaire features with an empty symbol list and prints
    the resulting feature frame. Nothing is asserted or returned.
    """
    handler = LabelHandler('data/2015-2016/Questionnaire.txt')
    second_category = handler.get_categories()[1]
    # The per-category lookup is still executed, but its result is
    # discarded: an empty symbol list is what gets passed on below.
    handler.get_symbols_by_category(second_category)

    sleep_target = get_2015_sleep_data(target='SLQ050')
    features, sleep_target = get_2015_Questionnaire_data(sleep_target, [])
    print(features)
Esempio n. 5
0
def get_2015_Examination_data(target_data,
                              symbol_list=None,
                              csv='data_preprocess/Examination.csv',
                              label='data/2015-2016/Examination.txt'):
    """Load examination features and align them with ``target_data``.

    Args:
        target_data: Target frame handed to ``align_data_with_target``
            together with the filtered features.
        symbol_list: Symbols forwarded to ``select_feature``. ``None``
            (the default) is treated as an empty list.
        csv: Path of the CSV file read with ``pd.read_csv``.
        label: Path handed to ``LabelHandler``.

    Returns:
        A ``(train_data, target_data)`` tuple: the processed feature frame
        and the target frame as returned by ``align_data_with_target``.
    """
    # A fresh list per call: a literal ``[]`` default would be one shared
    # object that any callee could mutate for all later calls.
    if symbol_list is None:
        symbol_list = []

    raw_csv = pd.read_csv(csv)
    label_handler = LabelHandler(label)

    train_data = select_feature(raw_csv, symbol_list)
    train_data = filter_data(train_data)
    train_data, target_data = align_data_with_target(train_data, target_data)

    # Rename the columns through the label handler, then drop the columns
    # it reported in ``noresults`` (if any).
    contents, noresults = label_handler.symbols_to_contents(
        train_data.columns)
    train_data.columns = contents
    if noresults:
        train_data = train_data.drop(noresults, axis=1)

    train_data = process_nan(train_data)
    return train_data, target_data
Esempio n. 6
0
import numpy as np
from keras.models import load_model
from keras import backend as K

from label_handler import LabelHandler
from aps_handler import APSHandler

# Hard-coded absolute path prefix; '<subject id>.aps' is appended to it
# when a scan file is opened.
APS_FOLDER = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_aps/'
# CSV handed to LabelHandler below (the stage-1 sample submission file).
TARGET_LABELS = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_sample_submission.csv'

# Executed at import time: load the labels and collect the subject ids.
label = LabelHandler(TARGET_LABELS)
subject_ids = label.get_subject_ids()
# Number of subject ids; used as the first dimension of the input array.
N = len(subject_ids)
print(N)


def predict(zone):
    """Load the model for ``zone`` and gather every subject's input data.

    The model file ``<zone>.h5`` is loaded from the hard-coded models
    directory, then an ``(N, 16, 25, 25)`` array is filled with the result
    of ``APSHandler(...).get_x(zone)`` for each subject id, in order.

    NOTE(review): in the code shown here neither the model nor the filled
    array is used afterwards, and nothing is returned.

    Args:
        zone: Value formatted into the model file name and passed to
            ``get_x``.
    """
    model_template = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/models/epoch50_99_percent/{0}.h5'
    model = load_model(model_template.format(zone))

    x_test = np.zeros((N, 16, 25, 25))

    for row, subject_id in enumerate(subject_ids):
        scan = APSHandler(APS_FOLDER + subject_id + '.aps')
        x_test[row] = scan.get_x(zone)
Esempio n. 7
0
import numpy as np
import os
from matplotlib import pyplot as plt
import cv2
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from label_handler import LabelHandler
from aps_handler import APSHandler

# NOTE(review): presumably a matplotlib colormap name; not used in the code
# visible here.
COLORMAP = 'gray'
# Hard-coded absolute path prefix; '<subject id>.aps' is appended to it
# when a scan file is opened.
APS_FOLDER = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_aps/'
# Path to a body-zones image; not used in the code visible here.
BODY_ZONES = '/home/ben/Documents/kaggle/passenger_screening/body_zones.png'
# CSV handed to LabelHandler below (the stage-1 labels file).
THREAT_LABELS = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_labels.csv'

# Executed at import time: load the labels and collect the subject ids.
label = LabelHandler(THREAT_LABELS)
subject_ids = label.get_subject_ids()
# Number of subject ids; used as the first dimension of the data array.
N = len(subject_ids)
print(N)


def make_npz(zone):
    """Fill an array with the ``get_x(zone)`` data of every subject.

    NOTE(review): despite the name, the code shown here neither saves nor
    returns the array; confirm whether the function continues beyond this
    point.

    Args:
        zone: Value passed to ``APSHandler.get_x`` for each subject.
    """
    # One (16, 25, 25) slot per subject id.
    X = np.zeros((N, 16, 25, 25))
    print(X.shape)
    i = 0
    for id_ in subject_ids:
        # Scan file path: APS_FOLDER + subject id + '.aps'.
        f = APS_FOLDER + id_ + '.aps'
        image = APSHandler(f)
        x = image.get_x(zone)
        # assumes get_x returns data that fits a (16, 25, 25) slot — TODO confirm
        X[i] = x
        i += 1