else: ft_row = np.append(ft_row, [np.nan,np.nan,np.nan]) else: ft_row = np.append(ft_row, [0,0,0]) else: ft_row = np.append(ft_row, [0,0,0]) # activity if 'act.csv' in sensors: data = pd.read_csv(sensor_dir+'act.csv', delimiter='\t', header=None) n = float(data[0][:].size) per_still = np.sum(data[1][:]=='STILL')/n per_tilt = np.sum(data[1][:]=='TILTING')/n per_onfoot = np.sum(data[1][:]=='ONFOOT')/n per_unknown = np.sum(data[1][:]=='UNKNOWN')/n n_trans1 = count_transitions(data[1][:],'STILL','ONFOOT')/n n_trans2 = count_transitions(data[1][:],'STILL','TILTING')/n n_trans3 = count_transitions(data[1][:],'STILL','UNKNOWN')/n n_trans4 = count_transitions(data[1][:],'ONFOOT','UNKNOWN')/n ft_row = np.append(ft_row, [per_still, per_tilt, per_onfoot, per_unknown, n_trans1, n_trans2, n_trans3, n_trans4]) else: ft_row = np.append(ft_row, [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]) # apps if 'app.csv' in sensors: data = pd.read_csv(sensor_dir+'app.csv', delimiter='\t', header=None) ft_row = np.append(ft_row, [np.sum(data[2][:]=='Messaging'), np.sum(data[2][:]=='Facebook'), np.sum(data[2][:]=='Chrome'), np.sum(data[2][:]=='Mobilyze'), np.sum(data[2][:]=='Phone'), np.sum(data[2][:]=='Gmail'), np.sum(data[2][:]=='Contacts'), np.sum(data[2][:]=='Internet'), np.sum(data[2][:]=='Gallery'), np.sum(data[2][:]=='Email'), np.sum(data[2][:]=='Settings'), np.sum(data[2][:]=='Messenger'), np.sum(data[2][:]=='Camera'), np.sum(data[2][:]=='Clock'), np.sum(data[2][:]=='Maps'), np.sum(data[2][:]=='Calendar'), np.sum(data[2][:]=='Youtube'), np.sum(data[2][:]=='Calculator'), np.sum(data[2][:]=='Purple Robot'), np.sum(data[2][:]=='System UI')]) else: ft_row = np.append(ft_row, np.zeros([1,20])) # communication
def extract_features(subjects):
    """Extract per-sample sensor features and self-report targets for each subject.

    For every subject name in *subjects*, each sample directory found under
    ``data/<subject>/`` is scanned for tab-separated, header-less sensor files.
    Samples without ``eml.csv`` (the location report) are skipped, as are
    samples whose reported location is exactly ``["Vehicle"]``. For each kept
    sample one row is written to a ``feature`` DataFrame (light, audio,
    screen, activity, communication, wifi, weather, GPS/time and one-hot
    Foursquare category features) and one row to a ``target`` DataFrame
    (location, reason, accomplishment, pleasure, fsq).

    The pair ``[feature, target]`` is pickled to ``features/<subject>.dat``.

    Args:
        subjects: iterable of subject directory names located under ``data/``.

    Returns:
        Always ``0``.
    """
    from preprocess import preprocess_location, preprocess_reason
    import os
    import numpy as np
    import pickle
    import pandas as pd
    import datetime
    from scipy import stats
    from count_transitions import count_transitions
    from sklearn.preprocessing import OneHotEncoder
    from sklearn import preprocessing

    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    save_results = True
    break_locations = False  # generate separate rows for each of the mentioned locations
    remove_vehicle = True

    data_dir = "data/"

    fsq_map = {
        "Nightlife Spot": "Nightlife Spot (Bar, Club)",
        "Outdoors & Recreation": "Outdoors & Recreation",
        "Arts & Entertainment": "Arts & Entertainment (Theater, Music Venue, Etc.)",
        "Professional & Other Places": "Professional or Medical Office",
        "Food": "Food (Restaurant, Cafe)",
        "Residence": "Home",
        "Shop & Service": "Shop or Store",
        "Travel & Transport": "Travel or Transport (Airport, Bus Stop, Train Station, Etc.)",
    }

    light_cols = ("lgt mean", "lgt std", "lgt off", "lgt zcrossing", "lgt skew", "lgt kurt")
    audio_cols = (
        "aud mean", "aud std", "aud skew", "aud kurt",
        "aud frq mean", "aud frq std", "aud frq skew", "aud frq kurt",
    )
    activity_cols = (
        "still", "tilting", "walking", "unknown act",
        "still-walking", "still-tilting", "still-unknown", "walking-unknown",
    )
    comm_cols = ("call in", "call out", "sms in", "sms out", "call missed")
    weather_cols = ("temperature", "dew point", "weather")
    gps_nan_cols = (
        "lat mean", "lng mean", "loc var", "duration",
        "midtime", "midhour", "dow start", "dow end",
    )

    def read_sensor(sensor_dir, name):
        """Read one tab-separated, header-less sensor file into a DataFrame."""
        return pd.read_csv(sensor_dir + name, delimiter="\t", header=None)

    def set_all(frame, row, names, value):
        """Assign the same value to several columns of a single row."""
        for name in names:
            frame.loc[row, name] = value

    # Building a one-hot encoder for foursquare locations (as extra features).
    # list(...) keeps this working where dict.values() is a view, not a list.
    state7 = np.array(list(fsq_map.values()) + ["Unknown"])
    le = preprocessing.LabelEncoder()
    le.fit(state7)
    state7_code = le.transform(state7)
    enc = OneHotEncoder()
    enc.fit(state7_code.reshape(-1, 1))

    for subj in subjects:
        print(subj)
        subject_dir = data_dir + subj + "/"
        samples = os.listdir(subject_dir)

        # One row per kept sample; ind_last is the next row label to write.
        feature = pd.DataFrame()
        target = pd.DataFrame()
        ind_last = 0

        for samp in samples:
            sensor_dir = subject_dir + samp + "/"
            sensors = os.listdir(sensor_dir)

            # Reading semantic location data and skipping if it does not exist.
            if "eml.csv" not in sensors:
                # Was `.format(subject, samp)`: `subject` is undefined, so this
                # branch raised NameError instead of skipping the sample.
                print("subject {} does not have location report data at {}. skipping".format(subj, samp))
                continue

            data = read_sensor(sensor_dir, "eml.csv")
            # Removing the Vehicle category.
            if remove_vehicle and data.loc[0, 6] == '["Vehicle"]':
                print("vehicle category skipped")
                continue
            if break_locations:
                target.loc[ind_last, "location"] = data.loc[0, 6]
            else:
                target.loc[ind_last, "location"] = preprocess_location(data.loc[0, 6], parse=False)
            target.loc[ind_last, "reason"] = preprocess_reason(data.loc[0, 7], parse=False)
            target.loc[ind_last, "accomplishment"] = data.loc[0, 8]
            target.loc[ind_last, "pleasure"] = data.loc[0, 9]

            # Foursquare category (mapped to the standard name) and distance.
            if "fsq2.csv" in sensors:
                data_fsq = read_sensor(sensor_dir, "fsq2.csv")
                loc_fsq = fsq_map.get(data_fsq.loc[10, 1], "Unknown")
                distance_fsq = float(data_fsq.loc[11, 1])
            else:
                loc_fsq = "Unknown"
                distance_fsq = np.nan
            target.loc[ind_last, "fsq"] = loc_fsq

            ## sensor features

            # light
            if "lgt.csv" in sensors:
                lgt = read_sensor(sensor_dir, "lgt.csv")[1]
                lgt_mean = np.nanmean(lgt)
                n_lgt = float(lgt.size)
                feature.loc[ind_last, "lgt mean"] = lgt_mean
                feature.loc[ind_last, "lgt std"] = np.nanstd(lgt)
                feature.loc[ind_last, "lgt off"] = np.sum(lgt == 0) / n_lgt
                # Count sign changes around the mean. Summing the raw diffs
                # telescopes to sign(last) - sign(first), which is not a
                # crossing count. Pairs involving NaN compare False here.
                signs = np.sign((lgt - lgt_mean).values)
                feature.loc[ind_last, "lgt zcrossing"] = np.sum(np.abs(np.diff(signs)) > 0) / n_lgt
                feature.loc[ind_last, "lgt skew"] = stats.skew(lgt)
                feature.loc[ind_last, "lgt kurt"] = stats.kurtosis(lgt)
            else:
                set_all(feature, ind_last, light_cols, np.nan)

            # audio
            if "aud.csv" in sensors:
                data = read_sensor(sensor_dir, "aud.csv")
                power, freq = data[1], data[2]
                feature.loc[ind_last, "aud mean"] = np.nanmean(power)
                feature.loc[ind_last, "aud std"] = np.nanstd(power)
                feature.loc[ind_last, "aud skew"] = stats.skew(power)
                feature.loc[ind_last, "aud kurt"] = stats.kurtosis(power)
                feature.loc[ind_last, "aud frq mean"] = np.nanmean(freq)
                feature.loc[ind_last, "aud frq std"] = np.nanstd(freq)
                feature.loc[ind_last, "aud frq skew"] = stats.skew(freq)
                feature.loc[ind_last, "aud frq kurt"] = stats.kurtosis(freq)
            else:
                set_all(feature, ind_last, audio_cols, np.nan)

            # screen: defaults apply when the file is missing or has < 2 rows
            scr_frq, scr_dur_mean, scr_dur_std = 0, 0, np.nan
            if "scr.csv" in sensors:
                data = read_sensor(sensor_dir, "scr.csv")
                if data[0].size >= 2:
                    times = data[0].values
                    deltat = times[-1] - times[0]
                    if deltat != 0:
                        # read_csv may parse a True/False column as bool, in
                        # which case comparing against the strings never
                        # matches; normalise to str so both forms work.
                        state = data[1].astype(str).values
                        turned_off = (state[:-1] == "True") & (state[1:] == "False")
                        scr_dur = times[1:][turned_off] - times[:-1][turned_off]
                        scr_frq = np.sum(turned_off) / float(deltat)
                        # Mean/std of an empty array is NaN (no on->off pair).
                        scr_dur_mean = np.mean(scr_dur)
                        scr_dur_std = np.std(scr_dur)
                    else:
                        scr_frq, scr_dur_mean, scr_dur_std = np.nan, np.nan, np.nan
            feature.loc[ind_last, "scr frq"] = scr_frq
            feature.loc[ind_last, "scr dur mean"] = scr_dur_mean
            feature.loc[ind_last, "scr dur std"] = scr_dur_std

            # activity: fraction of rows per state and per-row transition rates
            if "act.csv" in sensors:
                activity = read_sensor(sensor_dir, "act.csv")[1]
                n = float(activity.size)
                feature.loc[ind_last, "still"] = np.sum(activity == "STILL") / n
                feature.loc[ind_last, "tilting"] = np.sum(activity == "TILTING") / n
                feature.loc[ind_last, "walking"] = np.sum(activity == "ONFOOT") / n
                feature.loc[ind_last, "unknown act"] = np.sum(activity == "UNKNOWN") / n
                feature.loc[ind_last, "still-walking"] = count_transitions(activity, "STILL", "ONFOOT") / n
                feature.loc[ind_last, "still-tilting"] = count_transitions(activity, "STILL", "TILTING") / n
                feature.loc[ind_last, "still-unknown"] = count_transitions(activity, "STILL", "UNKNOWN") / n
                feature.loc[ind_last, "walking-unknown"] = count_transitions(activity, "ONFOOT", "UNKNOWN") / n
            else:
                set_all(feature, ind_last, activity_cols, np.nan)

            # NOTE: app-usage features (app.csv) are not extracted here; the
            # code that produced them had been commented out.

            # communication
            if "coe.csv" in sensors:
                data = read_sensor(sensor_dir, "coe.csv")
                channel, direction = data[3], data[4]
                feature.loc[ind_last, "call in"] = np.sum(
                    np.logical_and(channel == "PHONE", direction == "INCOMING")
                )
                feature.loc[ind_last, "call out"] = np.sum(
                    np.logical_and(channel == "PHONE", direction == "OUTGOING")
                )
                feature.loc[ind_last, "sms in"] = np.sum(
                    np.logical_and(channel == "SMS", direction == "INCOMING")
                )
                feature.loc[ind_last, "sms out"] = np.sum(
                    np.logical_and(channel == "SMS", direction == "OUTGOING")
                )
                feature.loc[ind_last, "call missed"] = np.sum(direction == "MISSED")
            else:
                set_all(feature, ind_last, comm_cols, 0)

            # wifi
            if "wif.csv" in sensors:
                data = read_sensor(sensor_dir, "wif.csv")
                feature.loc[ind_last, "n wifi"] = np.mean(data[3])
            else:
                feature.loc[ind_last, "n wifi"] = np.nan

            # weather
            if "wtr.csv" in sensors:
                data = read_sensor(sensor_dir, "wtr.csv")
                wtr_cond = stats.mode(data[9][:])[0][0]
                if not isinstance(wtr_cond, string_types):
                    wtr_cond = str(wtr_cond)
                feature.loc[ind_last, "temperature"] = np.mean(data[1])
                feature.loc[ind_last, "dew point"] = np.mean(data[3])
                # The most frequent condition string is reduced to a number by
                # summing its character codes.
                feature.loc[ind_last, "weather"] = sum(ord(c) for c in wtr_cond)
            else:
                set_all(feature, ind_last, weather_cols, np.nan)

            # GPS and time
            if "fus.csv" in sensors:
                data = read_sensor(sensor_dir, "fus.csv")
                t_start = data[0][0]
                t_end = data[0][data[0].size - 1]
                t_mid = (t_end + t_start) / 2.0
                feature.loc[ind_last, "lat mean"] = np.mean(data[1])
                feature.loc[ind_last, "lng mean"] = np.mean(data[2])
                # The small epsilon keeps the log finite when variance is 0.
                feature.loc[ind_last, "loc var"] = np.log(np.var(data[1]) + np.var(data[2]) + 1e-16)
                feature.loc[ind_last, "duration"] = t_end - t_start
                feature.loc[ind_last, "midtime"] = t_mid
                feature.loc[ind_last, "midhour"] = t_mid % 86400
                feature.loc[ind_last, "dow start"] = datetime.datetime.fromtimestamp(t_start).weekday()
                feature.loc[ind_last, "dow end"] = datetime.datetime.fromtimestamp(t_end).weekday()
                feature.loc[ind_last, "n gps"] = data.shape[0]
            else:
                set_all(feature, ind_last, gps_nan_cols, np.nan)
                feature.loc[ind_last, "n gps"] = 0.0

            # Foursquare location in binary (one-hot) form. transform() takes
            # a 1-D array-like, so the single label is wrapped in a list.
            loc_fsq_code = le.transform([loc_fsq])
            loc_fsq_bin = enc.transform(loc_fsq_code.reshape(-1, 1)).toarray()[0]
            for j, bit in enumerate(loc_fsq_bin):
                feature.loc[ind_last, "fsq {}".format(j)] = bit

            # distance to closest foursquare location (m)
            feature.loc[ind_last, "fsq distance"] = distance_fsq

            # Break locations and generate duplicate data for other sensors.
            if break_locations:
                raw_locs = target.loc[ind_last, "location"]
                raw_locs = raw_locs[1:-1]  # remove brackets
                locs = [part.replace('"', "") for part in raw_locs.split('", "')]
                # Drop empty strings; a real list is needed because len() and
                # indexing are used below.
                locs = [loc for loc in locs if loc]
                # First repeat the whole row once per extra location ...
                for k in range(len(locs) - 1):
                    target.loc[ind_last + 1 + k, :] = target.loc[ind_last, :]
                    feature.loc[ind_last + 1 + k, :] = feature.loc[ind_last, :]
                # ... then replace the location in each copy with its own value.
                for k, loc in enumerate(locs):
                    target.loc[ind_last + k, "location"] = loc
                ind_last += len(locs)
            else:
                ind_last += 1

        if save_results:
            # Pickle data is binary, so open in "wb"; the `with` block closes
            # the file, so no explicit close() is needed.
            with open("features/" + subj + ".dat", "wb") as file_out:
                pickle.dump([feature, target], file_out)

    return 0