def get_all_handler():
    """Return a single LabelHandler that has read every category label file.

    ``LabelHandler.read`` is called once per category, in the order
    Questionnaire, Demographics, Examination, Laboratory, on the matching
    ``<category>.txt`` file under ``data/2015-2016/``.
    """
    handler = LabelHandler()
    label_dir = 'data/2015-2016/'
    for name in ('Questionnaire', 'Demographics', 'Examination',
                 'Laboratory'):
        label_file = os.path.join(label_dir, '{}.txt'.format(name))
        handler.read(label_file)
    return handler
def get_2015_Questionnaire_data(target_data,
                                symbol_list=None,
                                csv='data_preprocess/Questionnaire.csv',
                                label='data/2015-2016/Questionnaire.txt'):
    """Load questionnaire features and align them with ``target_data``.

    The heavy lifting is delegated to helpers defined elsewhere in the
    project (``select_feature``, ``filter_data``, ``align_data_with_target``,
    ``process_nan``); the step descriptions below follow the original inline
    comments rather than the helpers' implementations.

    Args:
        target_data: Target frame handed to ``align_data_with_target``.
        symbol_list: Feature symbols passed to ``select_feature``. ``None``
            (the default) or any empty sequence behaves like the former
            ``[]`` default and additionally triggers removal of the two
            cigarette columns below.
        csv: Path of the questionnaire CSV read with ``pandas.read_csv``.
        label: Path of the label file given to ``LabelHandler``.

    Returns:
        ``(train_data, target_data)`` as returned by the alignment step,
        with ``train_data`` renamed, pruned and passed through
        ``process_nan``.
    """
    # ``None`` replaces the old mutable ``[]`` default; the helpers still
    # receive a real list exactly as before.
    symbols = [] if symbol_list is None else symbol_list

    raw_csv = pd.read_csv(csv)
    label_handler = LabelHandler(label)

    train_data = select_feature(raw_csv, symbols)
    # Remove features and data with too much missing
    train_data = filter_data(train_data)
    # Align train data and target data according to 'SEQN'
    train_data, target_data = align_data_with_target(train_data, target_data)

    # Replace feature names with meaningful contents, and remove unknowns
    contents, noresults = label_handler.symbols_to_contents(train_data.columns)
    train_data.columns = contents
    train_data = train_data.drop(noresults, axis=1)

    # Remove cigarette features: too many empty byte strings that the
    # missing-data filter does not catch. Only done for the "no explicit
    # selection" case, as in the original.
    if len(symbols) == 0:
        cigarette_feature = [
            'Cig 12-digit Universal Product Code-UPC',
            'Cigarette Brand/sub-brand'
        ]
        # errors='ignore': the columns may already be gone after the
        # filtering / renaming steps above; that should not be fatal.
        train_data = train_data.drop(cigarette_feature, axis=1,
                                     errors='ignore')

    train_data = process_nan(train_data)
    return train_data, target_data
# target symbol -> (codes whose rows are dropped, mask for 0, mask for 1).
# The masks are the exact comparisons the original if/elif chain applied.
_BINARY_SLEEP_TARGET_RULES = {
    'SLQ030': ((7, 9), lambda col: col < 2, lambda col: col >= 2),
    'SLQ040': ((7, 9), lambda col: col == 0, lambda col: col > 0),
    'SLQ050': ((9,), lambda col: col == 2, lambda col: col == 1),
    'SLQ120': ((9,), lambda col: col < 3, lambda col: col >= 3),
    'DPQ030': ((7, 9), lambda col: col == 0, lambda col: col > 0),
}


def _recode_binary(data, target, invalid_codes, is_zero, is_one):
    """Drop rows holding ``invalid_codes`` and recode ``target`` to 0/1."""
    for code in invalid_codes:
        data = data[data[target] != code]
    # Work on an explicit copy so the writes below hit this frame rather
    # than a temporary (chained assignment is a no-op under copy-on-write).
    data = data.copy()
    column = data[target]
    # Both masks are taken before any write; they never overlap and the
    # written values never fall into the other mask, so this matches the
    # original sequential assignments.
    zero_mask = is_zero(column).values
    one_mask = is_one(column).values
    data.loc[zero_mask, target] = 0
    data.loc[one_mask, target] = 1
    return data


def get_2015_sleep_data(
        target,
        csv='data_preprocess/Sleep.csv',
        label='data/2015-2016/Questionnaire.txt',
):
    """Load the sleep CSV and return either all columns or one target.

    Args:
        target: ``'all'`` to return every column renamed through the label
            handler, otherwise the symbol of a single column to extract
            together with ``'SEQN'``.
        csv: Path of the sleep CSV read with ``pandas.read_csv``; it must
            contain an ``'Unnamed: 0'`` column, which is dropped.
        label: Path of the label file given to ``LabelHandler``.

    Returns:
        A DataFrame. For ``'all'`` the second and third columns are passed
        through ``normalize_time``. For a single target the frame holds
        ``'SEQN'`` and the target after ``filter_data(data, 1, 0)``, with:
        ``SLQ300``/``SLQ310`` passed through ``normalize_time``; the targets
        in ``_BINARY_SLEEP_TARGET_RULES`` stripped of their 7/9 codes and
        recoded to 0/1; ``SLD012`` and any other symbol left unchanged.
    """
    data = pd.read_csv(csv)
    label_handler = LabelHandler(label)
    # axis must be given by keyword: pandas 2.x no longer accepts it
    # positionally.
    data = data.drop('Unnamed: 0', axis=1)

    if target == 'all':
        contents, _ = label_handler.symbols_to_contents(data.columns)
        data.columns = contents
        # Convert time : e.g. b'23:00' -> -60
        for position in (1, 2):
            name = contents[position]
            data[name] = normalize_time(data[name])
        return data

    data = data[['SEQN', target]]
    data = filter_data(data, 1, 0)

    if target == 'SLQ300' or target == 'SLQ310':
        data[target] = normalize_time(data[target])
    elif target in _BINARY_SLEEP_TARGET_RULES:
        invalid_codes, is_zero, is_one = _BINARY_SLEEP_TARGET_RULES[target]
        data = _recode_binary(data, target, invalid_codes, is_zero, is_one)
    # 'SLD012' and unrecognised symbols are returned without recoding.
    return data
def test_get_questionnaire():
    """Run the questionnaire loader against the 'SLQ050' target and print it.

    Needs the project's data files on disk; nothing is asserted, the
    resulting training frame is only printed.
    """
    handler = LabelHandler('data/2015-2016/Questionnaire.txt')
    second_category = handler.get_categories()[1]
    # The category's symbols are looked up, but the run below deliberately
    # uses an empty selection instead.
    handler.get_symbols_by_category(second_category)
    symbols = []

    sleep_target = get_2015_sleep_data(target='SLQ050')
    features, sleep_target = get_2015_Questionnaire_data(sleep_target, symbols)
    print(features)
def get_2015_Examination_data(target_data,
                              symbol_list=None,
                              csv='data_preprocess/Examination.csv',
                              label='data/2015-2016/Examination.txt'):
    """Load examination features and align them with ``target_data``.

    The individual steps are delegated to helpers defined elsewhere in the
    project (``select_feature``, ``filter_data``, ``align_data_with_target``,
    ``process_nan``).

    Args:
        target_data: Target frame handed to ``align_data_with_target``.
        symbol_list: Feature symbols passed to ``select_feature``. ``None``
            (the default) is forwarded as an empty list, matching the
            former ``[]`` default.
        csv: Path of the examination CSV read with ``pandas.read_csv``.
        label: Path of the label file given to ``LabelHandler``.

    Returns:
        ``(train_data, target_data)`` as returned by the alignment step,
        with ``train_data`` columns renamed via the label handler, columns
        reported in ``noresults`` dropped, and the result passed through
        ``process_nan``.
    """
    # ``None`` replaces the old mutable ``[]`` default so no list object is
    # shared between calls.
    symbols = [] if symbol_list is None else symbol_list

    raw_csv = pd.read_csv(csv)
    label_handler = LabelHandler(label)

    train_data = select_feature(raw_csv, symbols)
    train_data = filter_data(train_data)
    train_data, target_data = align_data_with_target(train_data, target_data)

    contents, noresults = label_handler.symbols_to_contents(train_data.columns)
    train_data.columns = contents
    if noresults:
        train_data = train_data.drop(noresults, axis=1)

    train_data = process_nan(train_data)
    return train_data, target_data
import numpy as np
from keras.models import load_model
from keras import backend as K
from label_handler import LabelHandler
from aps_handler import APSHandler

# Directory holding one ``<subject id>.aps`` scan per subject.
APS_FOLDER = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_aps/'
# CSV handed to LabelHandler to obtain the subject ids to predict for.
TARGET_LABELS = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_sample_submission.csv'

# Module-level setup: runs on import and prints the number of subjects.
label = LabelHandler(TARGET_LABELS)
subject_ids = label.get_subject_ids()
N = len(subject_ids)
print(N)


def predict(zone):
    """Load the saved model for ``zone`` and build its input array.

    The model is read from ``<models dir>/<zone>.h5``. One entry per id in
    ``subject_ids`` is then written into ``x_test`` from
    ``APSHandler(<scan path>).get_x(zone)``.

    NOTE(review): in the visible code neither ``model`` nor ``x_test`` is
    used or returned after the loop; the function may continue beyond this
    excerpt - confirm against the full file.
    """
    model_path = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/models/epoch50_99_percent/{0}.h5'.format(
        zone)
    model = load_model(model_path)
    # One (16, 25, 25) slot per subject; presumably the shape get_x
    # returns - TODO confirm against APSHandler.
    x_test = np.zeros((N, 16, 25, 25))
    i = 0
    for id_ in subject_ids:
        f = APS_FOLDER + id_ + '.aps'
        image = APSHandler(f)
        x = image.get_x(zone)
        x_test[i] = x
        i += 1
import numpy as np
import os
from matplotlib import pyplot as plt
import cv2
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from label_handler import LabelHandler
from aps_handler import APSHandler

COLORMAP = 'gray'
# Directory holding one ``<subject id>.aps`` scan per subject.
APS_FOLDER = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_aps/'
BODY_ZONES = '/home/ben/Documents/kaggle/passenger_screening/body_zones.png'
# CSV handed to LabelHandler to obtain the subject ids.
THREAT_LABELS = '/media/ben/Data/kaggle/passenger_screening_dataset/stage1/stage1_labels.csv'

# Module-level setup: runs on import and prints the number of subjects.
label = LabelHandler(THREAT_LABELS)
subject_ids = label.get_subject_ids()
N = len(subject_ids)
print(N)


def make_npz(zone):
    """Collect the ``zone`` data of every subject into one array ``X``.

    For each id in ``subject_ids`` the matching ``.aps`` file is opened with
    ``APSHandler`` and ``get_x(zone)`` is stored at that subject's position
    in ``X``. The shape of ``X`` is printed before filling.

    NOTE(review): the visible code neither saves nor returns ``X``; given
    the function name it presumably goes on to write an ``.npz`` file past
    this excerpt - confirm against the full file.
    """
    # One (16, 25, 25) slot per subject; presumably the shape get_x
    # returns - TODO confirm against APSHandler.
    X = np.zeros((N, 16, 25, 25))
    print(X.shape)
    i = 0
    for id_ in subject_ids:
        f = APS_FOLDER + id_ + '.aps'
        image = APSHandler(f)
        x = image.get_x(zone)
        X[i] = x
        i += 1