else:
             ft_row = np.append(ft_row, [np.nan,np.nan,np.nan])
     else:
         ft_row = np.append(ft_row, [0,0,0])
 else:
     ft_row = np.append(ft_row, [0,0,0])
 
 # activity
 if 'act.csv' in sensors:
     data = pd.read_csv(sensor_dir+'act.csv', delimiter='\t', header=None)
     n = float(data[0][:].size)
     per_still = np.sum(data[1][:]=='STILL')/n
     per_tilt = np.sum(data[1][:]=='TILTING')/n
     per_onfoot = np.sum(data[1][:]=='ONFOOT')/n
     per_unknown = np.sum(data[1][:]=='UNKNOWN')/n
     n_trans1 = count_transitions(data[1][:],'STILL','ONFOOT')/n
     n_trans2 = count_transitions(data[1][:],'STILL','TILTING')/n
     n_trans3 = count_transitions(data[1][:],'STILL','UNKNOWN')/n
     n_trans4 = count_transitions(data[1][:],'ONFOOT','UNKNOWN')/n
     ft_row = np.append(ft_row, [per_still, per_tilt, per_onfoot, per_unknown, n_trans1, n_trans2,                                       n_trans3, n_trans4])
 else:
     ft_row = np.append(ft_row, [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
 
 # apps
 if 'app.csv' in sensors:
     data = pd.read_csv(sensor_dir+'app.csv', delimiter='\t', header=None)
     ft_row = np.append(ft_row, [np.sum(data[2][:]=='Messaging'),                                        np.sum(data[2][:]=='Facebook'),                                        np.sum(data[2][:]=='Chrome'),                                        np.sum(data[2][:]=='Mobilyze'),                                        np.sum(data[2][:]=='Phone'),                                        np.sum(data[2][:]=='Gmail'),                                        np.sum(data[2][:]=='Contacts'),                                        np.sum(data[2][:]=='Internet'),                                        np.sum(data[2][:]=='Gallery'),                                        np.sum(data[2][:]=='Email'),                                        np.sum(data[2][:]=='Settings'),                                        np.sum(data[2][:]=='Messenger'),                                        np.sum(data[2][:]=='Camera'),                                        np.sum(data[2][:]=='Clock'),                                        np.sum(data[2][:]=='Maps'),                                        np.sum(data[2][:]=='Calendar'),                                        np.sum(data[2][:]=='Youtube'),                                        np.sum(data[2][:]=='Calculator'),                                        np.sum(data[2][:]=='Purple Robot'),                                        np.sum(data[2][:]=='System UI')])
 else:
     ft_row = np.append(ft_row, np.zeros([1,20]))
     
 # communication
def extract_features(subjects):
    """Extract per-sample sensor features and self-report targets per subject.

    For each subject, every sample directory under ``data/<subject>/`` is
    scanned. Samples without ``eml.csv`` (the location self-report) and, when
    ``remove_vehicle`` is set, samples whose reported location is exactly
    ``'["Vehicle"]'`` are skipped. For the remaining samples one row of
    targets (location, reason, accomplishment, pleasure, foursquare category)
    and one row of features (light, audio, screen, activity, communication,
    wifi, weather, GPS/time, one-hot foursquare category, foursquare distance)
    is produced. All sensor files are read as tab-separated, header-less CSV.

    When ``save_results`` is set, ``[feature, target]`` (two pandas
    DataFrames) is pickled to ``features/<subject>.dat`` for each subject.

    Args:
        subjects: iterable of subject directory names located under ``data/``.

    Returns:
        Always ``0``.
    """
    import datetime
    import os
    import pickle

    import numpy as np
    import pandas as pd
    from scipy import stats
    from sklearn import preprocessing
    from sklearn.preprocessing import OneHotEncoder

    from count_transitions import count_transitions
    from preprocess import preprocess_location, preprocess_reason

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    save_results = True
    break_locations = False  # generate a separate row for each of the mentioned locations
    remove_vehicle = True

    data_dir = "data/"

    fsq_map = {
        "Nightlife Spot": "Nightlife Spot (Bar, Club)",
        "Outdoors & Recreation": "Outdoors & Recreation",
        "Arts & Entertainment": "Arts & Entertainment (Theater, Music Venue, Etc.)",
        "Professional & Other Places": "Professional or Medical Office",
        "Food": "Food (Restaurant, Cafe)",
        "Residence": "Home",
        "Shop & Service": "Shop or Store",
        "Travel & Transport": "Travel or Transport (Airport, Bus Stop, Train Station, Etc.)",
    }

    # Column names, in the order they are written to the feature frame.
    lgt_cols = ["lgt mean", "lgt std", "lgt off", "lgt zcrossing", "lgt skew", "lgt kurt"]
    aud_cols = [
        "aud mean",
        "aud std",
        "aud skew",
        "aud kurt",
        "aud frq mean",
        "aud frq std",
        "aud frq skew",
        "aud frq kurt",
    ]
    scr_cols = ["scr frq", "scr dur mean", "scr dur std"]
    act_cols = [
        "still",
        "tilting",
        "walking",
        "unknown act",
        "still-walking",
        "still-tilting",
        "still-unknown",
        "walking-unknown",
    ]
    coe_cols = ["call in", "call out", "sms in", "sms out", "call missed"]
    wtr_cols = ["temperature", "dew point", "weather"]
    gps_cols = [
        "lat mean",
        "lng mean",
        "loc var",
        "duration",
        "midtime",
        "midhour",
        "dow start",
        "dow end",
        "n gps",
    ]

    def read_sensor(directory, name):
        """Load one tab-separated, header-less sensor file as a DataFrame."""
        return pd.read_csv(directory + name, delimiter="\t", header=None)

    def assign(frame, row, names, values):
        """Write *values* into *frame* at *row*, one column per name, in order."""
        for name, value in zip(names, values):
            frame.loc[row, name] = value

    # building one hot encoder for foursquare locations (as extra features)
    state7 = np.array(list(fsq_map.values()) + ["Unknown"])
    le = preprocessing.LabelEncoder()
    le.fit(state7)
    state7_code = le.transform(state7)
    enc = OneHotEncoder()
    enc.fit(state7_code.reshape(-1, 1))

    for subj in subjects:

        print(subj)

        subject_dir = data_dir + subj + "/"
        samples = os.listdir(subject_dir)

        # initialization
        feature = pd.DataFrame()
        target = pd.DataFrame()

        # index of the next row to write in both frames
        ind_last = 0

        for samp in samples:

            sensor_dir = subject_dir + samp + "/"
            sensors = os.listdir(sensor_dir)

            # reading semantic location data and skipping if it does not exist
            if "eml.csv" not in sensors:
                print("subject {} does not have location report data at {}. skipping".format(subj, samp))
                continue

            data = read_sensor(sensor_dir, "eml.csv")
            # removing Vehicle category
            if remove_vehicle and data.loc[0, 6] == '["Vehicle"]':
                print("vehicle category skipped")
                continue
            if break_locations:
                target.loc[ind_last, "location"] = data.loc[0, 6]
            else:
                target.loc[ind_last, "location"] = preprocess_location(data.loc[0, 6], parse=False)
            target.loc[ind_last, "reason"] = preprocess_reason(data.loc[0, 7], parse=False)
            target.loc[ind_last, "accomplishment"] = data.loc[0, 8]
            target.loc[ind_last, "pleasure"] = data.loc[0, 9]

            # foursquare category (mapped to the standard name) and distance
            loc_fsq = "Unknown"
            distance_fsq = np.nan
            if "fsq2.csv" in sensors:
                data_fsq = read_sensor(sensor_dir, "fsq2.csv")
                loc_fsq = fsq_map.get(data_fsq.loc[10, 1], "Unknown")
                distance_fsq = float(data_fsq.loc[11, 1])

            target.loc[ind_last, "fsq"] = loc_fsq

            ## sensor features
            # light
            if "lgt.csv" in sensors:
                lgt = read_sensor(sensor_dir, "lgt.csv")[1]
                n_lgt = float(lgt.size)
                # NOTE(review): summing diff(sign(...)) telescopes to
                # sign(last) - sign(first); if a count of zero crossings was
                # intended this needs `!= 0` before the sum. Left unchanged so
                # existing feature values stay the same.
                lgt_values = [
                    np.nanmean(lgt),
                    np.nanstd(lgt),
                    np.sum(lgt == 0) / n_lgt,
                    np.sum(np.diff(np.sign(lgt - np.nanmean(lgt)))) / n_lgt,
                    stats.skew(lgt),
                    stats.kurtosis(lgt),
                ]
            else:
                lgt_values = [np.nan] * len(lgt_cols)
            assign(feature, ind_last, lgt_cols, lgt_values)

            # audio (column 1: level statistics, column 2: "frq" statistics)
            if "aud.csv" in sensors:
                data = read_sensor(sensor_dir, "aud.csv")
                aud_values = [
                    np.nanmean(data[1]),
                    np.nanstd(data[1]),
                    stats.skew(data[1]),
                    stats.kurtosis(data[1]),
                    np.nanmean(data[2]),
                    np.nanstd(data[2]),
                    stats.skew(data[2]),
                    stats.kurtosis(data[2]),
                ]
            else:
                aud_values = [np.nan] * len(aud_cols)
            assign(feature, ind_last, aud_cols, aud_values)

            # screen: rate and duration of "True" -> "False" state pairs
            scr_values = [0, 0, np.nan]  # no file, or fewer than two rows
            if "scr.csv" in sensors:
                data = read_sensor(sensor_dir, "scr.csv")
                if data[0].size >= 2:
                    deltat = data[0][data[0].size - 1] - data[0][0]
                    if deltat != 0:
                        # NOTE(review): states are compared against the strings
                        # "True"/"False"; if pandas parses this column as
                        # booleans the comparison never matches - confirm
                        # against the data.
                        durations = []
                        for j in range(data[1].size - 1):
                            if data[1][j] == "True" and data[1][j + 1] == "False":
                                durations.append(data[0][j + 1] - data[0][j])
                        scr_dur = np.array(durations, dtype=float)
                        scr_values = [len(durations) / float(deltat), np.mean(scr_dur), np.std(scr_dur)]
                    else:
                        # all rows share one timestamp: a rate is undefined
                        scr_values = [np.nan, np.nan, np.nan]
            assign(feature, ind_last, scr_cols, scr_values)

            # activity: share of each label and transition counts, per row
            if "act.csv" in sensors:
                data = read_sensor(sensor_dir, "act.csv")
                n = float(data[0].size)
                activity = data[1]
                act_values = [
                    np.sum(activity == label) / n for label in ("STILL", "TILTING", "ONFOOT", "UNKNOWN")
                ]
                act_values += [
                    count_transitions(activity[:], first, second) / n
                    for first, second in (
                        ("STILL", "ONFOOT"),
                        ("STILL", "TILTING"),
                        ("STILL", "UNKNOWN"),
                        ("ONFOOT", "UNKNOWN"),
                    )
                ]
            else:
                act_values = [np.nan] * len(act_cols)
            assign(feature, ind_last, act_cols, act_values)

            # apps: per-app counts from app.csv (column 2) are currently
            # disabled; no app features are written.

            # communication (a missing file is treated as zero events)
            if "coe.csv" in sensors:
                data = read_sensor(sensor_dir, "coe.csv")
                channel = data[3]
                direction = data[4]
                coe_values = [
                    np.sum(np.logical_and(channel == "PHONE", direction == "INCOMING")),
                    np.sum(np.logical_and(channel == "PHONE", direction == "OUTGOING")),
                    np.sum(np.logical_and(channel == "SMS", direction == "INCOMING")),
                    np.sum(np.logical_and(channel == "SMS", direction == "OUTGOING")),
                    np.sum(direction == "MISSED"),
                ]
            else:
                coe_values = [0] * len(coe_cols)
            assign(feature, ind_last, coe_cols, coe_values)

            # wifi
            if "wif.csv" in sensors:
                data = read_sensor(sensor_dir, "wif.csv")
                feature.loc[ind_last, "n wifi"] = np.mean(data[3])
            else:
                feature.loc[ind_last, "n wifi"] = np.nan

            # weather
            if "wtr.csv" in sensors:
                data = read_sensor(sensor_dir, "wtr.csv")
                wtr_cond = stats.mode(data[9][:])[0][0]
                if not isinstance(wtr_cond, string_types):
                    wtr_cond = str(wtr_cond)
                # the most frequent condition string is encoded as the sum of
                # its character codes
                wtr_values = [np.mean(data[1]), np.mean(data[3]), sum(ord(c) for c in wtr_cond)]
            else:
                wtr_values = [np.nan] * len(wtr_cols)
            assign(feature, ind_last, wtr_cols, wtr_values)

            # GPS and time
            if "fus.csv" in sensors:
                data = read_sensor(sensor_dir, "fus.csv")
                t_start = data[0][0]
                t_end = data[0][data[0].size - 1]
                t_mid = (t_end + t_start) / 2.0
                gps_values = [
                    np.mean(data[1]),
                    np.mean(data[2]),
                    # small constant keeps the log finite when both variances are zero
                    np.log(np.var(data[1]) + np.var(data[2]) + 1e-16),
                    t_end - t_start,
                    t_mid,
                    t_mid % 86400,
                    # fromtimestamp() without a tz converts to the local timezone
                    datetime.datetime.fromtimestamp(t_start).weekday(),
                    datetime.datetime.fromtimestamp(t_end).weekday(),
                    data.shape[0],
                ]
            else:
                gps_values = [np.nan] * (len(gps_cols) - 1) + [0.0]
            assign(feature, ind_last, gps_cols, gps_values)

            # foursquare location in binary form; transform() expects a 1-D
            # array-like, so the single label is wrapped in a list
            loc_fsq_code = le.transform([loc_fsq])
            loc_fsq_bin = enc.transform(loc_fsq_code.reshape(-1, 1)).toarray()[0]
            for j in range(loc_fsq_bin.size):
                feature.loc[ind_last, "fsq {}".format(j)] = loc_fsq_bin[j]

            # distance to closest foursquare location (m)
            feature.loc[ind_last, "fsq distance"] = distance_fsq

            # break locations and generate duplicate data for other sensors
            if break_locations:
                locs = target.loc[ind_last, "location"]
                locs = locs[1:-1]  # remove brackets
                locs = [loc.replace('"', "") for loc in locs.split('", "')]
                locs = [loc for loc in locs if loc]  # remove any empty strings
                # first repeating everything
                for k in range(1, len(locs)):
                    target.loc[ind_last + k, :] = target.loc[ind_last, :]
                    feature.loc[ind_last + k, :] = feature.loc[ind_last, :]
                # now replacing locations with new values
                for k, loc in enumerate(locs):
                    target.loc[ind_last + k, "location"] = loc
                # last index
                ind_last += len(locs)

            else:
                ind_last += 1

        if save_results:
            # pickle data is binary, so the file is opened in binary mode;
            # the with-block closes it
            with open("features/" + subj + ".dat", "wb") as file_out:
                pickle.dump([feature, target], file_out)

    return 0