def assign_target(crime_data, grid_2d, cellsize_3d, mask=None, num_chunks=None, labeling=True, class_label=(0, 1)):
    """Return the target vector for the 3D (x, y, t) grid of examples.

    Each element corresponds to one grid cell. For binary classification the
    label is determined by the crime count in the cell
    (#crime > 0 -> class_label[1]; #crime == 0 -> class_label[0]).
    For regression, use the raw counts returned in ``target``.

    Returns (target, label); ``label`` is None when ``labeling`` is False.
    """
    crimepts = crime_data[['X_COORD', 'Y_COORD', 'GROUP']]
    # Build a gap-free time axis spanning the observed GROUP range.
    groups = crimepts['GROUP'].values
    grd_t = np.arange(np.min(groups), np.max(groups) + 1)
    grd_x, grd_y = grid_2d
    grid_3d = (grd_x, grd_y, grd_t)
    binned_pts = ks.bin_point_data_3d(crimepts.values, grid_3d, cellsize_3d,
                                      stat='count', geoIm=False)
    if num_chunks is None:
        num_chunks = len(np.unique(crime_data['GROUP']))
    if mask is None:
        # Default: keep every spatial cell.
        mask = np.ones(len(grd_x) * len(grd_y)).astype('bool')
    # Column-major flatten, then keep only masked cells in every time chunk.
    target = binned_pts.ravel(order='F')[np.tile(mask, reps=num_chunks)]
    if labeling:
        label = np.zeros(len(target))
        label[target == 0] = class_label[0]
        label[target != 0] = class_label[1]
    else:
        label = None
    return target, label
def incident_feature(data, grid_2d, cellsize_3d, group_seq, mask=None, num_chunks=None, binary=True):
    """Crime-incident feature for each grid cell over the groups in ``group_seq``.

    Bins incidents into the 3D (x, y, t) grid. If ``binary`` is True the
    feature is a presence/absence indicator (1/0), otherwise the raw count.
    Returns a column vector of shape (n_masked_cells * num_chunks, 1).
    """
    # .ix was removed in pandas 1.0; .loc is the supported equivalent for a
    # boolean row mask combined with an explicit column list.
    IncPts = data.loc[(data['GROUP'] >= group_seq[0]) & (data['GROUP'] <= group_seq[-1]),
                      ['X_COORD', 'Y_COORD', 'GROUP']]
    grd_t = group_seq
    IncPts = IncPts.values
    grd_x, grd_y = grid_2d
    grid_3d = (grd_x, grd_y, grd_t)
    if num_chunks is None:
        num_chunks = len(group_seq)
    if mask is None:
        # Default: keep every spatial cell.
        mask = np.ones(len(grd_x) * len(grd_y)).astype('bool')
    binned_pts = ks.bin_point_data_3d(IncPts, grid_3d, cellsize_3d, stat='count', geoIm=False)
    if binary:  # crime present/absent instead of raw counts
        binned_pts = (binned_pts > 0).astype(int)
    # Column-major flatten; np.newaxis makes the result an (n, 1) column.
    inc_cnt = binned_pts.ravel(order='F')[np.tile(mask, reps=num_chunks), np.newaxis]
    return inc_cnt
def long_term_intensity_feature_subgroup(timeIdx, crime_data, group_seq, period, grid_2d, filter_2d, mask=None, density=True):
    """Long-term crime-intensity feature (2D kernel smooth) for one time group.

    Smooths incidents from groups (group - period[0]) .. (group - period[1])
    with ``filter_2d`` and returns an (n_masked_cells, 1) column vector,
    normalized to sum to 1 when ``density`` is True.
    """
    group = group_seq[timeIdx]
    # .ix was removed in pandas 1.0; .loc is the supported equivalent.
    crimepts = crime_data.loc[(crime_data['GROUP'] >= group - period[0]) &
                              (crime_data['GROUP'] <= group - period[1]),
                              ['X_COORD', 'Y_COORD']].values
    if mask is None:
        # Previously an unset mask would index with None (adding axes) rather
        # than masking; default to all cells, consistent with the other
        # feature builders in this module.
        grd_x, grd_y = grid_2d
        mask = np.ones(len(grd_x) * len(grd_y)).astype('bool')
    KS_LT = ks.kernel_smooth_2d_conv(crimepts, grid_2d, filter_2d, flatten=False)
    # flatten; np.newaxis changes the shape from (n,) to (n, 1).
    KS_LT = KS_LT.ravel(order='F')[mask, np.newaxis]
    if density:
        KS_LT = KS_LT / np.sum(KS_LT)
    return KS_LT
def short_term_intensity_feature_subgroup(timeIdx, crime_data, group_seq, period, grid_2d, filter_3d, mask=None, density=True):
    """Short-term crime-intensity feature (separable 3D kernel smooth) for one time group.

    Smooths incidents from groups (group - period[0]) .. (group - period[1])
    with the space-time ``filter_3d``, keeps the most recent time slice, and
    returns an (n_masked_cells, 1) column vector, normalized to sum to 1 when
    ``density`` is True.
    """
    group = group_seq[timeIdx]
    # .ix was removed in pandas 1.0; .loc is the supported equivalent.
    crimepts = crime_data.loc[(crime_data['GROUP'] >= group - period[0]) &
                              (crime_data['GROUP'] <= group - period[1]),
                              ['X_COORD', 'Y_COORD', 'GROUP']]
    # Gap-free time axis covering the look-back window.
    grd_t = np.arange(group - period[0], group - period[1] + 1)
    crimepts = crimepts.values
    grd_x, grd_y = grid_2d
    grid_3d = (grd_x, grd_y, grd_t)
    if mask is None:
        # Previously an unset mask would index with None (adding axes) rather
        # than masking; default to all cells, consistent with the other
        # feature builders in this module.
        mask = np.ones(len(grd_x) * len(grd_y)).astype('bool')
    KS_ST = ks.kernel_smooth_separable_3d_conv(crimepts, grid_3d, filter_3d, flatten=False)
    KS_ST = KS_ST[:, :, -1]  # take out the last (most recent) time slice
    # flatten; np.newaxis changes the shape from (n,) to (n, 1).
    KS_ST = KS_ST.ravel(order='F')[mask, np.newaxis]
    if density:
        KS_ST = KS_ST / np.sum(KS_ST)
    return KS_ST
def intensity_model_subgroup(timeIdx, crime_data, group_seq, period, grid_2d, filter_2d, mask=None, density=True):
    """Kernel-smoothed crime-intensity surface for one time group.

    Smooths incidents from groups (group - period[0]) .. (group - period[1])
    with ``filter_2d`` and returns an (n_masked_cells, 1) column vector,
    normalized to sum to 1 when ``density`` is True.
    """
    group = group_seq[timeIdx]
    # .ix was removed in pandas 1.0; .loc is the supported equivalent.
    CrimePts = crime_data.loc[(crime_data['GROUP'] >= group - period[0]) &
                              (crime_data['GROUP'] <= group - period[1]),
                              ['X_COORD', 'Y_COORD']].values
    if mask is None:
        # Previously an unset mask would index with None (adding axes) rather
        # than masking; default to all cells, consistent with the other
        # feature builders in this module.
        grd_x, grd_y = grid_2d
        mask = np.ones(len(grd_x) * len(grd_y)).astype('bool')
    KS = ks.kernel_smooth_2d_conv(CrimePts, grid_2d, filter_2d, flatten=False)
    # flatten; np.newaxis changes the shape from (n,) to (n, 1).
    KS = KS.ravel(order='F')[mask, np.newaxis]
    if density:
        KS = KS / np.sum(KS)
    return KS
def consec_presence_feature(data, grid_2d, cellsize_3d, group_seq, buffer_period=0, mask=None, num_chunks=None, presence=True, truncate=True):
    """Count the time groups (e.g. weeks) of a consecutive presence/absence.

    For example, the 11 groups of crime count for a certain cell is as follows
    [1, 0, 0, 1, 0, 0, 0, 2, 0, 1, 1]. The corresponding consecutive zeros are
    [0, 0, 1, 2, 0, 1, 2, 3, 0, 1, 0].

    ``buffer_period`` extra groups are binned before ``group_seq`` so the
    running counts have history, then trimmed off; when ``truncate`` is True
    counts are capped at ``buffer_period`` since longer runs cannot be
    distinguished from runs that started inside the buffer.
    """
    # .ix was removed in pandas 1.0; .loc is the supported equivalent for a
    # boolean row mask combined with an explicit column list.
    IncPts = data.loc[(data['GROUP'] >= group_seq[0] - buffer_period) &
                      (data['GROUP'] <= group_seq[-1]),
                      ['X_COORD', 'Y_COORD', 'GROUP']]
    # Gap-free time axis including the leading buffer groups.
    grd_t = np.arange(group_seq[0] - buffer_period, group_seq[-1] + 1)
    IncPts = IncPts.values
    grd_x, grd_y = grid_2d
    grid_3d = (grd_x, grd_y, grd_t)
    if num_chunks is None:
        num_chunks = len(group_seq)
    if mask is None:
        # Default: keep every spatial cell.
        mask = np.ones(len(grd_x) * len(grd_y)).astype('bool')
    binned_pts = ks.bin_point_data_3d(IncPts, grid_3d, cellsize_3d, stat='count', geoIm=False)
    if presence:
        # count consecutive presence
        binned_pts = (binned_pts > 0).astype(int)
    else:
        # count consecutive absence
        binned_pts = (binned_pts == 0).astype(int)
    # Run-length counts of 1s along the time axis for every (x, y) cell.
    consec_cnt = np.apply_along_axis(count_consec_val, 2, binned_pts, val=1)
    consec_cnt = consec_cnt[:, :, -len(group_seq):]  # truncate buffer groups
    consec_cnt = consec_cnt.ravel(order='F')[np.tile(mask, reps=num_chunks), np.newaxis]
    if truncate:
        # Since consecutive numbers are unbounded, some will be affected by
        # the buffer_period.
        consec_cnt[consec_cnt > buffer_period] = buffer_period
    return consec_cnt
# Drop the (non-numeric) geometry column if it is still present.
if 'geometry' in SpatialFeature.columns:
    SpatialFeature.drop('geometry', axis=1, inplace=True)

# Load precomputed POD proximity distances.
pod_dist_pkl = filePath_spfeature + 'PODdist_dataframe.pkl'
with open(pod_dist_pkl, 'rb') as input_file:
    POD_data['dist'] = pickle.load(input_file)

# Grid geometry and cell sizes (time step fixed at one group).
_, grd_x, grd_y, _, mask_grdInCity, _ = load_grid(grid_pkl)
grid_2d = (grd_x, grd_y)
cellsize_2d = (grd_x[1] - grd_x[0], grd_y[1] - grd_y[0])
cellsize_3d = cellsize_2d + (1,)  # (size_x, size_y, size_t)

# Smoothing kernels: 2D Gaussian (long-term) and separable Gaussian-
# exponential space-time filter (short-term).
# NOTE(review): the 2D window uses sigma[0] for both axes while the 3D
# window uses sigma[0] and sigma[1] -- confirm this is intentional.
gauss_filter = ks.gaussian_filter_2d(
    bandwidth=sigma,
    window_size=(4 * 2 * sigma[0] + 1, 4 * 2 * sigma[0] + 1))
gauss_exp_filter = ks.gaussian_exponential_filter_3d(
    bandwidth=(sigma[0], sigma[1], lam),
    window_size=(4 * 2 * sigma[0] + 1, 4 * 2 * sigma[1] + 1,
                 period_ST[0] - period_ST[1]))['space-time']

# Full feature-name vector and boolean index masks per feature family.
varName_space = SpatialFeature.columns.values.tolist()
varName = np.array(varName_space + varName_time + varName_pod + varName_weather +
                   varName_STcrime + varName_LTcrime + varName_311 +
                   varName_crime_pres + varName_crime_abs)
keys = ['space', 'time', 'POD', 'weather', 'LT', 'ST', '311', 'pres', 'abs']
subnames = [varName_space, varName_time, varName_pod, varName_weather,
            varName_LTcrime, varName_STcrime, varName_311,
            varName_crime_pres, varName_crime_abs]
var_type_idx = {key: np.in1d(varName, subname) for key, subname in zip(keys, subnames)}
#------------------------------
# Drop the geometry column only if it is present: an unconditional drop
# raises KeyError when the column was already removed (the other setup
# block in this file already guards this way).
if 'geometry' in SpatialFeature.columns:
    SpatialFeature.drop('geometry', axis=1, inplace=True)

# load POD proximity data
pod_dist_pkl = filePath_spfeature + 'PODdist_dataframe.pkl'
with open(pod_dist_pkl, 'rb') as input_file:
    POD_data['dist'] = pickle.load(input_file)

# Set up parameters: grid geometry, cell sizes (time step fixed at one
# group), and the 2D Gaussian smoothing kernel.
_, grd_x, grd_y, _, mask_grdInCity, _ = load_grid(grid_pkl)
grid_2d = (grd_x, grd_y)
cellsize_2d = (grd_x[1] - grd_x[0], grd_y[1] - grd_y[0])
cellsize_3d = cellsize_2d + (1, )  # (size_x, size_y,size_t)
gauss_filter = ks.gaussian_filter_2d(
    bandwidth=sigma,
    window_size=(4 * 2 * sigma[0] + 1, 4 * 2 * sigma[0] + 1))

# Full feature-name vector and boolean index masks per feature family.
varName_space = SpatialFeature.columns.values.tolist()
varName = np.array(varName_space + varName_time + varName_pod + varName_weather + varName_LTcrime +\
                   varName_311 + varName_crime_inc)
var_type_idx = {}
keys = ['space', 'time', 'POD', 'weather', 'LT', '311', 'crimeInc']
subnames = [
    varName_space, varName_time, varName_pod, varName_weather, varName_LTcrime, varName_311,
    varName_crime_inc
]
for key, subname in zip(keys, subnames):
    var_type_idx[key] = np.in1d(varName, subname)