def test_great_circle_dist():
    """Check great_circle_dist on a zero-length pair and a known pair of places."""
    # A point is at distance zero from itself.
    origin = (1.3, 103.0)
    assert_almost_equal(0.0, great_circle_dist(origin, origin))

    # Distance between CREATE and SUTD, rounded to one decimal of the
    # default unit (presumably kilometres, i.e. rounded to 100 m). The
    # result must not depend on the order of the two points.
    create_latlon = (1.303826, 103.773890)
    sutd_latlon = (1.341221, 103.963234)
    for start, end in ((create_latlon, sutd_latlon),
                       (sutd_latlon, create_latlon)):
        rounded = round(great_circle_dist(start, end), ndigits=1)
        assert_almost_equal(21.5, rounded)
def find_nearest_station(lat, lon, rtree, threshold=100):
    """Find the nearest station(s) to a lat/lon position.

    lat, lon : the query location.
    rtree : the Rtree spatial index of stations.
    threshold : maximum distance in meters; stations at this distance or
        farther are not returned.

    Return a list of tuples (station id, distance to the station in
    meters). If several nearest stations are equidistant the list holds
    more than one tuple, otherwise just one. The list is empty when the
    nearest station is not closer than threshold.
    """
    # rtree_entry.object is the string station id
    # NOTE: bounds coordinates are 'interleaved' by default
    # insert as [x, y, x, y] and returned as [xmin, xmax, ymin, ymax]
    # See http://toblerity.org/rtree/tutorial.html
    position = (lat, lon)
    candidates = rtree.nearest((lat, lon, lat, lon), num_results=1,
                               objects=True)
    nearest_stations = []
    for rtree_entry in candidates:
        bounds = rtree_entry.bounds
        # compute the distance once and reuse it for both the filter and
        # the returned tuple
        distance = great_circle_dist(position, (bounds[0], bounds[2]),
                                     unit="meters")
        if distance < threshold:
            nearest_stations.append((rtree_entry.object, distance))
    return nearest_stations
def notWalkSegProcess(hw_mode_refined, ave_vel, delta_t, timestamp, lat, lon,
                      dist, num_pt_total, NOT_STOP_V, IS_STOP_V,
                      TIME_SET_STOPPED, VC_MIN_DIST):
    """Refine the modes of every non-walking segment of a trip.

    Picks up all runs of consecutive points whose mode is neither
    MODE_WALK_IN nor MODE_WALK_OUT and passes each run to
    notWalkSegRefine() together with its velocities, time deltas, summed
    travel distance and start-to-end jump distance (in meters).

    Input: the modes of the whole device trace (hw_mode_refined) and the
    per-point vectors ave_vel, delta_t, timestamp, lat, lon and dist;
    num_pt_total is the total number of points. The remaining arguments
    are thresholds forwarded unchanged to notWalkSegRefine().

    Output: hw_mode_refined, updated in place and also returned.

    Raises Exception if the point adjacent to a non-walking segment is
    not a walking point.
    """
    idx_nonwalking = np.where((hw_mode_refined != MODE_WALK_IN) &
                              (hw_mode_refined != MODE_WALK_OUT))[0]
    if len(idx_nonwalking) == 0:
        # nothing to refine
        return hw_mode_refined

    # get the start and end idx of each non-walking segment
    start_idx_nonwalking, end_idx_nonwalking, num_nonwalking_seg = \
        getStartEndIdx(idx_nonwalking)

    for i_seg in range(num_nonwalking_seg):
        seg_start = start_idx_nonwalking[i_seg]
        seg_end = end_idx_nonwalking[i_seg]
        seg = slice(seg_start, seg_end + 1)

        # decide indoor/outdoor from the neighbouring walking point
        if seg_start > 0:
            # use the state of the point before the segment
            neighbour_mode = hw_mode_refined[seg_start - 1]
            if neighbour_mode == MODE_WALK_IN:
                indoor_seg = 1
            elif neighbour_mode == MODE_WALK_OUT:
                indoor_seg = 0
            else:
                raise Exception("The segment before this none walking segment is not walking segment!")
        elif seg_end < num_pt_total - 1:
            # the segment starts the trace: use the state of the next point
            neighbour_mode = hw_mode_refined[seg_end + 1]
            if neighbour_mode == MODE_WALK_IN:
                indoor_seg = 1
            elif neighbour_mode == MODE_WALK_OUT:
                indoor_seg = 0
            else:
                raise Exception("The segment after this none walking segment is not walking segment!")
        else:
            # the segment covers the whole trace: default to indoor
            indoor_seg = 1

        # collect the per-segment inputs
        timestamp_seg = timestamp[seg]
        hw_mode_refined_seg = hw_mode_refined[seg]
        v_pt_filtered_seg = ave_vel[seg]
        delta_t_seg = delta_t[seg]
        travel_dist = np.nansum(dist[seg])
        jump_dist = great_circle_dist([lat[seg_start], lon[seg_start]],
                                      [lat[seg_end], lon[seg_end]],
                                      'meters')

        # refine the non-walking modes and write them back into the trip
        hw_mode_refined[seg] = notWalkSegRefine(
            timestamp_seg, hw_mode_refined_seg, v_pt_filtered_seg,
            delta_t_seg, travel_dist, jump_dist, NOT_STOP_V, IS_STOP_V,
            indoor_seg, TIME_SET_STOPPED, VC_MIN_DIST)

    return hw_mode_refined
def distanceFcn(df, lat, lon, tripStart=True, homeIdx=True):
    """Find the row of df that best matches the location (lat, lon).

    df : data frame with 'WLATITUDE' and 'WLONGITUDE' columns.
    lat, lon : the reference location (home or school).
    tripStart : if True prefer the last matching row, otherwise the first.
    homeIdx : if True the reference location is home and the (smaller)
        home search radius is used, otherwise the school search radius.

    Rows within the search radius are candidates. If there are none, the
    rows at the minimum (NaN-ignoring) distance are the candidates.

    Return a dict {'dist': distance in meters, 'idx': positional index}.
    Raises ValueError if no row has a valid (non-NaN) distance.
    """
    search_range_school = 50  # m, search radius for a trip start/end point (school)
    search_range_home = 0  # m, search radius for a trip start/end point (home)
    search_range = search_range_home if homeIdx else search_range_school

    latlon = df[['WLATITUDE', 'WLONGITUDE']].values.tolist()
    dist_array = np.array([great_circle_dist(point, [lat, lon], unit="meters")
                           for point in latlon])

    idx_in_range = np.where(dist_array <= search_range)[0]
    if len(idx_in_range) == 0:
        # no point inside the radius: fall back to the closest point(s)
        dist = np.nanmin(dist_array)
        idx_at_min = np.where(dist_array == dist)[0]
        if len(idx_at_min) == 0:
            # only reachable when every distance is NaN
            raise ValueError("no valid distance to the reference location")
        idx = idx_at_min[-1] if tripStart else int(idx_at_min[0])
    else:
        idx = idx_in_range[-1] if tripStart else idx_in_range[0]
        dist = dist_array[idx]

    return {'dist': dist, 'idx': idx}
def trip_segment(data_frame, stopped_thresh=0.5, stopped_dwell=480):
    """Find the points of interest (POIs) in the data frame.

    A stop is a run of consecutive rows whose 'AVE_VELOCITY' is below
    stopped_thresh. A stop whose summed 'TIME_DELTA' exceeds stopped_dwell
    becomes a POI located at its most frequent rounded lat/lon.
    Consecutive POIs closer than 30 m to each other are merged.

    Return a list of POI [lat, lon] locations and a list holding, for each
    POI, the data frame indices that belong to it. Implements logic from
    Yuren's Matlab code.
    """
    round_decimal = 4
    dist_2comb = 30  # m, distance below which two pois are combined
    round_values = partial(pd.Series.round, decimals=round_decimal)
    pois = defaultdict(list)

    def store_poi(data_frame, idx_buffer, stop_time):
        """Record a POI for the stop covering idx_buffer if it lasted long enough."""
        # stops that are not longer than stopped_dwell are not POIs
        if stop_time <= stopped_dwell:
            return
        # lat/lon locations of this stop
        df_stop_all = data_frame.loc[idx_buffer][['WLATITUDE', 'WLONGITUDE']]
        # nothing to locate if every latitude is NaN
        if np.all(np.isnan(df_stop_all['WLATITUDE'].values)):
            return
        # the most frequent rounded location represents the stop
        rounded = df_stop_all.apply(round_values)
        location_counts = rounded.groupby(['WLATITUDE', 'WLONGITUDE']).size()
        poi_lat, poi_lon = location_counts.idxmax()
        pois['poi_latlon'].append([poi_lat, poi_lon])
        pois['poi_idx'].append(idx_buffer)
        pois['last_time'].append(stop_time)

    # walk through the rows and collect the stops
    stop_time = 0
    idx_buffer = []
    for index, row in data_frame.iterrows():
        if row['AVE_VELOCITY'] < stopped_thresh:
            # a zero stop time marks the beginning of a new stop
            if stop_time == 0:
                idx_buffer = []
            idx_buffer.append(index)
            stop_time += row['TIME_DELTA']
            continue
        # moving again: close the stop that just ended, if any
        if stop_time > 0:
            store_poi(data_frame, idx_buffer, stop_time)
        stop_time = 0
    # close a stop that lasts until the final row
    if stop_time > 0:
        store_poi(data_frame, idx_buffer, stop_time)

    pois_latlon = pois['poi_latlon']
    if len(pois_latlon) <= 1:
        return pois['poi_latlon'], pois['poi_idx']

    # combine consecutive pois that are close to each other
    merged_latlon = [pois_latlon[0]]
    merged_idx = [pois['poi_idx'][0]]
    cur_poi_latlon = pois_latlon[0]
    # NOTE(review): this starts at 0 rather than at the dwell time of the
    # first poi, so a poi merged into the first one always takes over the
    # location on the first comparison -- confirm this is intended.
    last_time_prev = 0
    later_pois = zip(pois_latlon[1:], pois['poi_idx'][1:],
                     pois['last_time'][1:])
    for latlon, idx_list, dwell in later_pois:
        if great_circle_dist(cur_poi_latlon, latlon, 'meters') < dist_2comb:
            # close pois: merge the indices, keep the location of the
            # poi with the longer dwell time
            merged_idx[-1].extend(idx_list)
            if dwell > last_time_prev:
                merged_latlon[-1] = latlon
                cur_poi_latlon = latlon
                last_time_prev = dwell
        else:
            # far pois: start a new unique poi
            merged_latlon.append(latlon)
            merged_idx.append(idx_list)
            cur_poi_latlon = latlon
            last_time_prev = dwell
    return merged_latlon, merged_idx
def identify_home_school(pois, idx_of_pois, data_frame, school_start=9,
                         school_end=13, home_start=22, home_end=5,
                         max_school_thresh=100, round_decimals=4,
                         poi_cover_range = 30):
    """Identify home and school locations.

    For every POI the time spent there during school hours and during
    home hours is summed up. The POI with the most school time becomes
    the school and the POI with the most home time becomes home. If the
    same POI wins both, it is assigned to whichever has more time.

    Return two tuples representing home (lat, lon) and school (lat, lon)
    locations. A location that could not be identified is (None, None).

    pois is a list of all the unique pois generated by trip_segment,
        each indexable as (lat, lon).
    idx_of_pois is a list of all the indices of the corresponding pois.
        They are used as positional indices into the data frame columns.
    data_frame is the pandas data frame with the original data. The
        columns 'TIMESTAMP', 'TIME_DELTA', 'WLATITUDE' and 'WLONGITUDE'
        are read.
    school_start is the hour of the day when school starts. Default 9am.
    school_end is the hour of the day when school ends. Default 1pm.
    home_start is the first hour of the day when students are assumed to
        be home at night. Default 10pm.
    home_end is the last hour of the day when students are assumed to be
        home at night. Default 5am.
    max_school_thresh is the school/home distance in meters below which
        both locations are rejected - to negate the possibility of
        creating home/school links for sensors left at school by mistake.
    round_decimals is not used by the current implementation.
    poi_cover_range is the distance in meters within which a measurement
        counts as being at a poi.
    """
    ########## identify home/school completely same as MATLAB version ############
    no_location = (None, None)

    # check the length of pois
    if len(pois) < 1:
        logging.info("No poi is passed in")
        return no_location, no_location

    # get out time and location data; none of it depends on the poi, so
    # it is extracted once
    all_timestamp = data_frame[['TIMESTAMP']].values
    all_delta_time = data_frame[['TIME_DELTA']].values
    latlon = data_frame[['WLATITUDE', 'WLONGITUDE']].values

    time_at_school = []
    time_at_home = []
    lat_pois = []
    lon_pois = []

    # go through each poi
    for i_poi, poi in enumerate(pois):
        idx_of_poi = idx_of_pois[i_poi]
        # lat/lon values to return
        lat_pois.append(poi[0])
        lon_pois.append(poi[1])

        # find the indices of points that are near to this poi
        dist_to_poi = [great_circle_dist(x, poi, unit="meters")
                       for x in latlon]
        idx_near_poi = [i_pt for i_pt, d in enumerate(dist_to_poi)
                        if d <= poi_cover_range]
        # combine the indices found with the earlier poi indices
        idx_of_pois_new = np.unique(idx_of_poi + idx_near_poi).tolist()

        poi_timestamp = all_timestamp[idx_of_pois_new]
        poi_delta_time = all_delta_time[idx_of_pois_new]
        poi_hourtime = np.array([get_hour_SGT(x) for x in poi_timestamp])

        # time spent at this poi within the school time range
        idx_at_school = np.where((poi_hourtime >= school_start) &
                                 (poi_hourtime < school_end))[0]
        time_at_school.append(np.nansum(poi_delta_time[idx_at_school]))
        # time spent at this poi within the home time range, which wraps
        # around midnight
        idx_at_home = np.where((poi_hourtime < home_end) |
                               (poi_hourtime >= home_start))[0]
        time_at_home.append(np.nansum(poi_delta_time[idx_at_home]))

    if len(time_at_school) == 0 and len(time_at_home) == 0:
        raise Exception("No poi is passed in!")

    # get school
    max_sch_cnt = max(time_at_school)
    idx_max_sch_cnt = np.argmax(time_at_school)
    # get home
    max_home_cnt = max(time_at_home)
    idx_max_home_cnt = np.argmax(time_at_home)

    # NaN marks a location that has not been identified
    school_lat = school_lon = np.nan
    home_lat = home_lon = np.nan

    if max_sch_cnt == 0 and max_home_cnt == 0:
        # no time spent anywhere during school or home hours
        return no_location, no_location
    elif idx_max_sch_cnt == idx_max_home_cnt:
        # the same poi is detected as home and school: decide by the time
        if max_sch_cnt > max_home_cnt:
            school_lat = lat_pois[idx_max_sch_cnt]
            school_lon = lon_pois[idx_max_sch_cnt]
        elif max_sch_cnt < max_home_cnt:
            home_lat = lat_pois[idx_max_home_cnt]
            home_lon = lon_pois[idx_max_home_cnt]
        else:
            return no_location, no_location
    else:
        # only if there are hits for school time, assign school
        if max_sch_cnt > 0:
            school_lat = lat_pois[idx_max_sch_cnt]
            school_lon = lon_pois[idx_max_sch_cnt]
        # only if there are hits for home time, assign home
        if max_home_cnt > 0:
            home_lat = lat_pois[idx_max_home_cnt]
            home_lon = lon_pois[idx_max_home_cnt]

    # sort out home/school pairings that are too close - these are
    # anomalies. The comparison is False when either location is NaN.
    school_home_dist = great_circle_dist([school_lat, school_lon],
                                         [home_lat, home_lon],
                                         unit="meters")
    if school_home_dist < max_school_thresh:
        logging.info("home school distance:" + str(school_home_dist))
        return no_location, no_location

    home_loc = no_location if np.isnan(home_lat) else (home_lat, home_lon)
    school_loc = (no_location if np.isnan(school_lat)
                  else (school_lat, school_lon))
    return home_loc, school_loc
def segFind(df, trip_return, mode_thresh=120, isAM = True, dist_lim = 45.3,
            max_mode=6, max_walk=4.0):
    """Extract the mode segments of a trip and append them to trip_return.

    df : data frame of the trip with the columns 'PRED_MODE',
        'TIME_DELTA', 'DISTANCE_DELTA', 'WLATITUDE' and 'WLONGITUDE'.
    trip_return : dict of lists; the mode code and the distance of each
        segment are appended under 'am_mode'/'am_distance' or
        'pm_mode'/'pm_distance'.
    mode_thresh : segments with a total time below this are skipped.
    isAM : selects the am (True) or pm (False) keys of trip_return.
    dist_lim : limit passed to checkDist for the stored segment distance
        (the summed distance divided by 1000).
    max_mode : maximum number of valid segments that are processed.
    max_walk : limit passed to checkDist (multiplied by 1000) for the
        distance of a long walking segment.

    Stop and walk modes are all treated as one walking mode when the
    trace is split into segments. A segment with the same mode as the
    previously stored one is merged into it by adding up the distances.

    Return the number of valid mode segments found.
    """
    # definition of mode code
    MODE_WALK_IN = 3
    MODE_WALK_OUT = 2
    MODE_STOP_IN = 1
    MODE_STOP_OUT = 0

    # thresholds for calculating distance of mode segs
    ALL_WALK_TIME = 15*60  # time shorter than which the distance of a walk segment is calculated using all points
    real_to_jump_dist = 2  # for a short walking seg, limit the distance by 2 times the jump distance

    if isAM:
        mode_key = 'am_mode'
        dist_key = 'am_distance'
    else:
        mode_key = 'pm_mode'
        dist_key = 'pm_distance'

    pred_modes = df[['PRED_MODE']].values[:, 0]  # take out the predicted modes
    # change all STOP_IN, STOP_OUT, WALK_OUT to WALK_IN
    pred_modes[(pred_modes == MODE_STOP_IN) |
               (pred_modes == MODE_STOP_OUT) |
               (pred_modes == MODE_WALK_OUT)] = MODE_WALK_IN
    mode_segs = list(chunks(pred_modes, True))  # take the mode chunks
    logging.debug("Mode Segs: " + str(mode_segs))

    # per-point columns used inside the loop
    time_delta_all = df['TIME_DELTA'].values
    dist_delta_all = df['DISTANCE_DELTA'].values
    modes_all = df['PRED_MODE'].values
    lat_all = df['WLATITUDE'].values
    lon_all = df['WLONGITUDE'].values

    num_valid_mode_seg = 0
    prev_mode = 0
    # go through each mode chunk: (start index, end index, mode)
    for mode_seg in mode_segs:
        seg_start, seg_end, seg_mode = mode_seg[0], mode_seg[1], mode_seg[2]
        time_span = np.sum(time_delta_all[seg_start:seg_end])
        # abandon if the total segment time is less than the threshold,
        # or once max_mode valid segments have been found
        if time_span < mode_thresh or num_valid_mode_seg > max_mode-1:
            continue

        latlon_start = [lat_all[seg_start], lon_all[seg_start]]
        latlon_end = [lat_all[seg_end-1], lon_all[seg_end-1]]
        jump_dist = great_circle_dist(latlon_start, latlon_end, 'meters')
        num_valid_mode_seg += 1

        # calculate the distance of this mode segment
        dist_cur_seg = dist_delta_all[seg_start:seg_end]
        if int(seg_mode) == MODE_WALK_IN:
            if time_span < ALL_WALK_TIME:
                # if the time span is small, consider all 0-3 modes as walking
                dist_seg = checkDist(np.nansum(dist_cur_seg),
                                     jump_dist*real_to_jump_dist)
            else:
                # otherwise only consider the points with modes 2 and 3
                modes_cur_seg = modes_all[seg_start:seg_end]
                is_walking = ((modes_cur_seg == MODE_WALK_IN) |
                              (modes_cur_seg == MODE_WALK_OUT))
                dist_seg = checkDist(np.nansum(dist_cur_seg[is_walking]),
                                     max_walk*1000)
        else:
            dist_seg = np.nansum(dist_cur_seg)

        # filter out the zero or nan values of dist_seg
        if dist_seg == 0 or np.isnan(dist_seg):
            continue

        if seg_mode == prev_mode:
            # same mode as the previous stored segment: combine the two
            # distances (stored distance * 1000 matches the unit of dist_seg)
            prev_dist = trip_return[dist_key][-1]
            trip_return[dist_key][-1] = checkDist(
                (prev_dist*1000+dist_seg) / 1000, dist_lim)
            continue

        trip_return[mode_key].append(int(seg_mode))  # append the mode
        trip_return[dist_key].append(checkDist(dist_seg / 1000, dist_lim))
        prev_mode = seg_mode

    return num_valid_mode_seg
def pointDist(row, lat,lon):
    """Return the great circle distance, in the default unit of
    great_circle_dist, from the location of a data row to (lat, lon).

    row is indexable by 'WLATITUDE' and 'WLONGITUDE'.
    """
    row_latlon = [row['WLATITUDE'], row['WLONGITUDE']]
    return great_circle_dist(row_latlon, [lat, lon])
def modeSmooth(hw_mode,timestamp,delta_t,lat,lon,vel,ave_vel,delta_steps,dist):
    """Smooth the hardware mode codes of a trip.

    Output: the smoothed copy of hw_mode.
    Input:
    - hw_mode: a vector of hw_code
    - timestamp: a vector of timestamp
    - delta_t: vector of time deltas, sec
    - lat,lon: vectors of lat and lon representing location
    - vel: vector of geographical velocity, m/s
    - ave_vel: moving average of geographical velocity, m/s
    - delta_steps: vector of step deltas
    - dist: vector of distances, passed on to notWalkSegProcess()

    NOTE: vel and ave_vel are modified in place; the entries next to a
    long sleeping point are set to 0. hw_mode itself is not modified.

    The mode codes MODE_WALK_IN, MODE_WALK_OUT, MODE_STOP_IN,
    MODE_STOP_OUT and MODE_TBD_VC are defined outside this function.
    """
    TIME_SET_STOPPED = 60*1  # sec, time shorter than which the not walking seg is set as stopped
    NOT_STOP_V = 5.0  # m/s, mean velocity above which the not walking seg is considered as not stopped
    IS_STOP_V = 1.0  # m/s, mean velocity below which the not walking seg is considered as stopped
    WALK_MAX_V_AVE = 7.0  # m/s, moving average velocity above which a walking point is set as TBD_VC
    WALK_MAX_V_PT = 7.0  # m/s, single point velocity above which a walking point is set as TBD_VC
    SINGLE_WALK_MAX_V = 1.5  # m/s, velocity above which a single walking point is set as TBD_VC
    SLEEPING_TIME = 60  # s, time delta larger than which the mode is checked and reset
    LONG_SLEEP_TIME = 500  # s, time delta larger than which the neighbours of a stopped point are reset too
    SLEEP_MAX_V = 2.0  # m/s, vel_ave or vel above which the sleeping point will be assigned as TBD_VC
    SLEEP_TO_WALK_STEPS = 1  # steps to time ratio below which the sleeping point will be assigned as stopped
    TBD_VC_TIME = 200  # s, time above which the point will be considered as vehicle mode
    TBD_VC_DIST = 300  # m, distance above which the point will be considered as vehicle mode
    SHORT_WALK = 200  # s, time below which the walking segment between two vehicle seg will be considered as invalid
    FEW_STEPS = 50  # steps below which the walking segment between two vehicle seg will be considered as invalid
    SHORT_WALK_MAX_V = 1.5  # m/s, mean velocity above which a short walking segment between two vehicle seg is invalid
    VC_MIN_DIST = 100  # m, distance smaller than which the vehicle mode segment is needed to reprocess
    WALK_IN_MAX_DIST = 150  # m, jump distance larger than which the walking mode segment is considered as outdoor

    # NOTE(review): the nesting of the branches below was reconstructed
    # from a source whose indentation was lost -- verify against the
    # reference implementation.

    # initialization
    num_pt_total = len(hw_mode)  # total number of points in this trip
    hw_mode_refined = hw_mode.copy()  # initialize the refined mode vector

    # check the points with a long delta timestamp ("sleeping" points)
    idx_sleep = np.where(delta_t > SLEEPING_TIME)[0].tolist()
    prev_i_sp = 0
    for i_sp in idx_sleep:
        has_next = i_sp < num_pt_total-1
        if (ave_vel[i_sp] > SLEEP_MAX_V or vel[i_sp] > SLEEP_MAX_V or
                (delta_t[i_sp] > TBD_VC_TIME and
                 delta_t[i_sp]*vel[i_sp] > TBD_VC_DIST)):
            # too fast or too far for a stop: mark this point and the
            # next one as vehicle candidates
            hw_mode_refined[i_sp] = MODE_TBD_VC
            if has_next:
                hw_mode_refined[i_sp+1] = hw_mode_refined[i_sp]
        elif (delta_steps[i_sp]/delta_t[i_sp]) < SLEEP_TO_WALK_STEPS:
            # few steps: the point is a stop, outdoor if it was an
            # outdoor walk and indoor otherwise
            if hw_mode_refined[i_sp] == MODE_WALK_OUT:
                stop_mode, walk_mode = MODE_STOP_OUT, MODE_WALK_OUT
            else:
                stop_mode, walk_mode = MODE_STOP_IN, MODE_WALK_IN
            hw_mode_refined[i_sp] = stop_mode
            if delta_t[i_sp] > LONG_SLEEP_TIME:
                # surround a long stop with zero-velocity walking points
                if i_sp > 0 and i_sp-1 != prev_i_sp:
                    hw_mode_refined[i_sp-1] = walk_mode
                    vel[i_sp-1] = 0
                    ave_vel[i_sp-1] = 0
                if has_next and delta_t[i_sp+1] < SLEEPING_TIME:
                    hw_mode_refined[i_sp+1] = walk_mode
                    vel[i_sp+1] = 0
                    ave_vel[i_sp+1] = 0
        prev_i_sp = i_sp

    # refine the walking mode points by checking the velocities
    idx_walking = np.where((hw_mode_refined == MODE_WALK_IN) |
                           (hw_mode_refined == MODE_WALK_OUT))[0].tolist()
    for i_walk in idx_walking:
        if ave_vel[i_walk] > WALK_MAX_V_AVE or vel[i_walk] > WALK_MAX_V_PT:
            hw_mode_refined[i_walk] = MODE_TBD_VC
    idx_walking = np.where((hw_mode_refined == MODE_WALK_IN) |
                           (hw_mode_refined == MODE_WALK_OUT))[0].tolist()

    # the segment start/end lists only exist when there are walking points
    if len(idx_walking) > 0:
        # get the start and end idx of each walking segment
        start_idx_walking, end_idx_walking, num_walking_seg = \
            getStartEndIdx(idx_walking)

        # check the single walking points (segments that start and end
        # at the same index); if too fast, set as TBD_VC and drop the
        # segment
        idx_single_walking = list(
            set(start_idx_walking).intersection(end_idx_walking))
        for i_sw in idx_single_walking:
            if (ave_vel[i_sw] > SINGLE_WALK_MAX_V or
                    vel[i_sw] > SINGLE_WALK_MAX_V):
                hw_mode_refined[i_sw] = MODE_TBD_VC
                start_idx_walking.remove(i_sw)
                end_idx_walking.remove(i_sw)
                num_walking_seg = num_walking_seg-1

        # go through each walking segment and change indoor to outdoor
        # if the jump distance is larger than a threshold
        for i_walk_seg in range(num_walking_seg):
            seg_start = start_idx_walking[i_walk_seg]
            seg_end = end_idx_walking[i_walk_seg]
            jump_dist = great_circle_dist([lat[seg_start], lon[seg_start]],
                                          [lat[seg_end], lon[seg_end]],
                                          'meters')
            if jump_dist > WALK_IN_MAX_DIST:
                hw_mode_refined[seg_start:seg_end+1] = MODE_WALK_OUT

    hw_mode_refined = notWalkSegProcess(
        hw_mode_refined, ave_vel, delta_t, timestamp, lat, lon, dist,
        num_pt_total, NOT_STOP_V, IS_STOP_V, TIME_SET_STOPPED, VC_MIN_DIST)

    # try to combine mode segments like: vehicle + stop/walk + vehicle
    temp_modes = np.array(hw_mode_refined.copy())
    temp_modes[(temp_modes == MODE_STOP_IN) |
               (temp_modes == MODE_STOP_OUT) |
               (temp_modes == MODE_WALK_OUT)] = MODE_WALK_IN
    mode_segs = list(chunks(temp_modes, True))  # take the mode chunks
    num_mode_segs = len(mode_segs)
    # go through each mode chunk that has a chunk before and after it
    for i_seg in range(1, num_mode_segs-1):
        mode_seg = mode_segs[i_seg]
        mode_seg_prev = mode_segs[i_seg-1]
        mode_seg_aft = mode_segs[i_seg+1]
        # only a walking seg between two not walking segs is checked
        if mode_seg[2] != MODE_WALK_IN:
            continue
        if mode_seg_prev[2] == MODE_WALK_IN or mode_seg_aft[2] == MODE_WALK_IN:
            continue
        seg = slice(mode_seg[0], mode_seg[1])
        time_span = np.sum(delta_t[seg])
        tot_steps = np.nansum(delta_steps[seg])
        v_mean_mode_seg = aveVelCalc(ave_vel[seg], delta_t[seg])
        # a short walk with few steps or a high velocity is invalid
        if time_span < SHORT_WALK and (tot_steps < FEW_STEPS or
                                       v_mean_mode_seg > SHORT_WALK_MAX_V):
            hw_mode_refined[seg] = MODE_TBD_VC

    hw_mode_refined = notWalkSegProcess(
        hw_mode_refined, ave_vel, delta_t, timestamp, lat, lon, dist,
        num_pt_total, NOT_STOP_V, IS_STOP_V, TIME_SET_STOPPED, VC_MIN_DIST)

    return hw_mode_refined
def process(nid, analysis_date):
    """Process device nid for the given date (%Y-%m-%d).

    Fetches the device data, cleans it, calculates features, predicts the
    travel modes and identifies trips, home and school locations. The
    results are appended to the module-level record lists; saving to the
    backend API is currently disabled.

    Return a tuple of the pandas data frame with the device data, the
    predicted travel modes, the identified trips, the home location and
    the school location. Return None if no data or fewer than 10 rows
    were returned for the device.
    """
    # convert the analysis date into a unix timestamp in UTC time
    # NOTE(review): this reads `analysis_date_tuple`, which is not a
    # parameter of this function; the `analysis_date` argument is only
    # used for logging -- confirm the name is defined at module level.
    analysis_unix = calendar.timegm(analysis_date_tuple.timetuple())

    # query window: 24 hours starting 12 hours after the first second of
    # the analysis day
    start_get = int(getFirstSecondOfDay(analysis_unix))+12*3600
    end_get = int(start_get+24*3600-1)

    # retrieve unprocessed device data from the backend
    logging.info("Get data for device %d on the day %s" % (nid, analysis_date))
    data_frame = getData(url, nid, start_get, end_get)
    if data_frame is None:
        logging.info("No data returned for device %d, skip." % nid)
        return
    if len(data_frame)<10:
        # abandon data frames below a minimum size
        logging.warning("Too little data returned for device %d, skip." % nid)
        return

    # clean data to reduce noise
    logging.info("Clean data for device %d" % nid)
    clean_data(data_frame,
               valid_lat_low=valid_lat_low,
               valid_lat_up=valid_lat_up,
               valid_lon_low=valid_lon_low,
               valid_lon_up=valid_lon_up,
               location_accuracy_thresh=location_accuracy_thresh)

    # calculate additional features
    logging.info("Calculate features for device %d" % nid)
    calculate_features(data_frame, high_velocity_thresh=high_velocity_thresh)

    # predict the travel mode for each measurement: smooth the hardware
    # modes, then apply the train and the bus heuristic in turn
    logging.info("Predict modes for device %d" % nid)
    hw_modes = data_frame['MODE'].values
    smooth_modes = smooth_heuristic.predict(data_frame, hw_modes)
    predicted_modes = train_heuristic.predict(data_frame, smooth_modes)
    predicted_modes = bus_heuristic.predict(data_frame, predicted_modes)

    # identify trips from the data
    trips, home_loc, school_loc = tripParse.process(
        predicted_modes, data_frame,
        stopped_thresh=stopped_thresh,
        poi_dwell_time=poi_dwell_time,
        school_start=school_start,
        school_end=school_end,
        home_start=home_start,
        home_end=home_end,
        max_school_thresh = max_school_thresh,
        home_school_round_decimals=home_school_round_decimals,
        mode_thresh=mode_thresh,
        poi_cover_range = poi_cover_range)
    nid_label = "NID: " + str(nid)
    logging.warning(nid_label + "; HOME: " + str(home_loc))
    logging.warning(nid_label + "; SCHOOL: " + str(school_loc))
    logging.warning(nid_label + "; TRIPS: " + str(trips))
    logging.info("Save modes for device %d" % nid)

    # remember the devices for which both locations were identified
    if home_loc!=(None,None) and school_loc!=(None,None):
        school_home_dist = great_circle_dist([school_loc[0],school_loc[1]],
                                             [home_loc[0],home_loc[1]],
                                             unit="meters")
        valid_loc_nid.append(nid)
        valid_loc_info.append({'home loc':home_loc,
                               'school loc':school_loc,
                               'distance':school_home_dist})

    # NOTE(review): assumed to run for every processed device, not only
    # for those with valid locations -- the original nesting is ambiguous.
    nids_record.append(nid)
    am_modes_record.append(trips['am_mode'])
    pm_modes_record.append(trips['pm_mode'])

    return data_frame, predicted_modes, trips, home_loc, school_loc
def _trailing_average(values, window_size):
    """Average, with aveWithNan, the values preceding each position.

    The window of position idx is values[idx-window_size+1:idx], clipped
    at the start of the vector. It therefore holds at most window_size-1
    values and does not include position idx itself.
    """
    return [aveWithNan(values[max(0, idx-window_size+1):idx])
            for idx in range(len(values))]


def calculate_features(data_frame, high_velocity_thresh=40):
    """Calculate additional features and attributes from the raw hardware data.

    New attributes are added as new columns in the data frame in place:
    TIME_DELTA, STEPS_DELTA, DISTANCE_DELTA, DISTANCE_DELTA2, VELOCITY,
    VELOCITY2, AVE_VELOCITY, AVE_VELOCITY2 and AVE_STEPS. The columns
    without the suffix 2 are based on lat/lon values rounded to 4
    decimals, the columns with the suffix 2 on the unrounded values.

    Rows whose VELOCITY is too high, and the rows directly before them,
    get WLATITUDE, WLONGITUDE, DISTANCE_DELTA and VELOCITY set to NaN.

    high_velocity_thresh : maximum threshold for velocities in m/s, higher
    values are rejected. Default 40m/s (= 144 km/h)

    NOTE(review): the new columns are built from default-indexed Series
    and the row before a label idx is addressed as idx-1, so the data
    frame presumably has a default integer index -- confirm with callers.
    """
    LARGE_TIME_JUMP = 60
    window_size = 5
    location_cols = ['WLATITUDE', 'WLONGITUDE']

    # calculate time delta since the last measurement, in seconds, with a
    # zero value for the first measurement where no delta is available
    timestamps = data_frame['TIMESTAMP'].values
    delta_timestamps = [0] + [later - earlier for earlier, later
                              in zip(timestamps[:-1], timestamps[1:])]
    data_frame['TIME_DELTA'] = pd.Series(delta_timestamps)

    # calculate steps delta since the last measurement, same convention
    steps = data_frame['STEPS'].values
    delta_steps = [later - earlier for earlier, later
                   in zip(steps[:-1], steps[1:])]
    data_frame['STEPS_DELTA'] = pd.Series([0] + delta_steps)

    # select rows in data frame that have valid locations
    has_location = (~np.isnan(data_frame['WLATITUDE']) &
                    ~np.isnan(data_frame['WLONGITUDE']))
    df_validloc = data_frame.loc[has_location]

    # calculate distance delta from pairs of valid lat/lon locations that
    # follow each other, once rounded and once unrounded
    valid_latlon = df_validloc[location_cols].values
    rounded_latlon = np.round(valid_latlon, 4)
    dist_delta = [great_circle_dist(loc_from, loc_to, unit="meters")
                  for loc_from, loc_to
                  in zip(rounded_latlon[:-1], rounded_latlon[1:])]
    dist_delta2 = [great_circle_dist(loc_from, loc_to, unit="meters")
                   for loc_from, loc_to
                   in zip(valid_latlon[:-1], valid_latlon[1:])]

    # calculate time delta from pairs of valid timestamps
    valid_times = df_validloc['TIMESTAMP'].values
    time_delta = valid_times[1:] - valid_times[:-1]

    # calculate velocity, m/s
    velocity = np.asarray(dist_delta) / time_delta
    velocity2 = np.asarray(dist_delta2) / time_delta

    # create new columns for delta distance and velocity; rows without a
    # value (no valid location, or the first valid location) become NaN
    idx_with_delta = df_validloc.index[1:]
    data_frame['DISTANCE_DELTA'] = pd.Series(dist_delta, idx_with_delta)  # distance in m
    data_frame['DISTANCE_DELTA2'] = pd.Series(dist_delta2, idx_with_delta)  # distance in m
    data_frame['VELOCITY'] = pd.Series(velocity, idx_with_delta)  # velocity in m/s
    data_frame['VELOCITY2'] = pd.Series(velocity2, idx_with_delta)  # velocity in m/s

    # replace very high velocity values which are due to wifi
    # localization errors with NaN, for the row itself and the row before
    label_too_high_vel = data_frame['VELOCITY'] > high_velocity_thresh
    idx_too_high = label_too_high_vel[label_too_high_vel].index.tolist()
    idx_bef_too_high = [idx - 1 for idx in idx_too_high]
    rejected_cols = location_cols + ['DISTANCE_DELTA', 'VELOCITY']
    data_frame.loc[idx_too_high, rejected_cols] = np.nan
    data_frame.loc[idx_bef_too_high, rejected_cols] = np.nan

    # calculate the moving average of both velocities, m/s; the points
    # after a large time jump keep their point velocity instead
    idx_large_jump = np.where(
        np.array(delta_timestamps) > LARGE_TIME_JUMP)[0].tolist()
    for velocity_col, average_col in (('VELOCITY', 'AVE_VELOCITY'),
                                      ('VELOCITY2', 'AVE_VELOCITY2')):
        velocity_all = data_frame[velocity_col].values
        ave_velocity_all = np.array(
            _trailing_average(velocity_all, window_size))
        ave_velocity_all[idx_large_jump] = velocity_all[idx_large_jump]
        data_frame[average_col] = pd.Series(ave_velocity_all.tolist())  # velocity in m/s

    # calculate the moving average of steps
    delta_steps_all = data_frame['STEPS_DELTA'].values
    data_frame['AVE_STEPS'] = pd.Series(
        _trailing_average(delta_steps_all, window_size))