def get_eval_region_mask(region, region_num, nsamples=None,
                         regionlabel_file=None, regionmask_file=None):
    # create evaluation region mask (with respect to the city vector grid)
    if region == 'city':
        region_mask = np.ones(nsamples).astype(bool)
        mask_grdInRegion = None
    elif region == 'cluster':
        cluster_label, _ = ld.load_clusters(regionlabel_file)
        region_mask = np.in1d(cluster_label, region_num)
        mask_grdInRegion = ld.load_cluster_mask(regionmask_file, region_num)
    elif region == 'district':
        district_label = ld.load_districts(regionlabel_file)
        region_mask = np.in1d(district_label, region_num)
        mask_grdInRegion = ld.load_district_mask(regionmask_file, region_num)
    else:
        raise ValueError("region must be 'city', 'cluster' or 'district'")
    return region_mask, mask_grdInRegion
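# A minimal usage sketch of get_eval_region_mask. The file names and the
# cluster number below are hypothetical placeholders, not the project's actual
# inputs; `ld` is the project's data-loading module.
region_mask, mask_grdInRegion = get_eval_region_mask(
    'cluster', 3,
    regionlabel_file='clusters.pkl',      # hypothetical label file
    regionmask_file='cluster_masks.pkl')  # hypothetical grid-mask file

# For the whole city only the sample count is needed; no sub-grid mask exists.
city_mask, _ = get_eval_region_mask('city', None, nsamples=10000)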
eval_tuplist = re.findall(
    r'([a-zA-Z]+ )((?:\d*\.\d+\s*|\d+\s*){3})',
    eval_str)  # e.g. [('PAI ', '0 1 100'), ('PEI ', '0.1 0.5 50')]
evalIdx_names = []
areaPct = {}
for eval_tup in eval_tuplist:
    # convert the "lo hi num" string to a list of numbers
    interval_params = list(map(float, eval_tup[1].split()))
    intervals = np.linspace(interval_params[0], interval_params[1],
                            int(interval_params[2]))  # num must be an int
    ev_idx = eval_tup[0].rstrip()
    evalIdx_names.append(ev_idx)
    areaPct[ev_idx] = intervals

# ------------------ Load data ----------------------------------------
_, grd_x, grd_y, _, mask_grdInCity, _ = ld.load_grid(grid_pkl)
grid_2d = (grd_x, grd_y)
groups_test = ld.load_train_test_group(group_pkl)['test_groups']
CrimeData = ld.load_crime_data(crime_pkl)
baseline_test = ld.load_baseline_data(baseline_pkl, target_crime)

score_list = []
for p_file, m_file in zip(predscore_filelist, samplemask_filelist):
    with open(m_file, 'rb') as input_file:
        samplemask = pickle.load(input_file)
    predscores = np.loadtxt(p_file, delimiter=',')
    predscores_city = np.tile(np.zeros(np.sum(mask_grdInCity)),
                              len(groups_test))
    # assign prediction scores to the entire city
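# A self-contained check of the eval-string parse at the top of this block.
# The example string is hypothetical but follows the documented
# "NAME lo hi num" format; the regex is copied from the code above.
import re
import numpy as np

demo_eval_str = 'PAI 0 1 100 PEI 0.1 0.5 50'
demo_pairs = re.findall(r'([a-zA-Z]+ )((?:\d*\.\d+\s*|\d+\s*){3})',
                        demo_eval_str)
# -> [('PAI ', '0 1 100 '), ('PEI ', '0.1 0.5 50')]
for name, nums in demo_pairs:
    lo, hi, num = map(float, nums.split())
    print(name.rstrip(), np.linspace(lo, hi, int(num))[:3])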
sample_mask = {}
if train_region == 'city':
    filename_dict['train'] = train_data
    X_train, y_train = ld.load_struct_data_h5(filename_dict['train'],
                                              target_crime, 'Label',
                                              split='train')
    clf = get_model_params(clf_name, rand_seed=r_seed, X=X_train)['model']
    tuning_params = get_model_params(clf_name, rand_seed=r_seed,
                                     X=X_train)['tuning_params']
    tuning_param_dicts = {clf_name + '__' + key: val
                          for key, val in tuning_params.items()}

    # train models
    fitting = model_fit(X_train, y_train, clf, clf_name, tuning_param_dicts,
                        CV_skf, scaling)
    model, cv_results, best_params = (fitting['model'],
                                      fitting['CV_result'],
                                      fitting['best_param'])
    print('CV results (' + train_region + train_region_num_str + '):')
    print(cv_results)
    print('Best_parameters (' + train_region + train_region_num_str + '):')
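# The clf_name + '__' + key prefixing follows scikit-learn's convention for
# addressing a Pipeline step's parameters inside a grid search. A minimal,
# self-contained sketch of that mechanism (the step name 'logit' and the grid
# values are illustrative, not the project's actual settings):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

demo_clf_name = 'logit'  # must match the Pipeline step name
demo_pipe = Pipeline([('scaler', MinMaxScaler()),
                      (demo_clf_name, LogisticRegression())])
demo_grid = {'C': [0.01, 0.1, 1, 10]}  # illustrative tuning grid
# Prefix each key with '<step>__' so GridSearchCV routes it to the right step.
demo_param_dicts = {demo_clf_name + '__' + k: v for k, v in demo_grid.items()}
demo_search = GridSearchCV(demo_pipe, demo_param_dicts, cv=5)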
def pair_train_test_sample(filenames,
                           mask_sample_region={'train': None, 'test': None},
                           cluster_Nos={'train': None, 'test': None},
                           district_Nos={'train': None, 'test': None},
                           chunk_size={'train': None, 'test': None}):
    """
    filenames: a dict with keys 'district'/'cluster' and 'group', with their
        filenames as the corresponding values
    """
    # load cluster info
    if 'cluster' in filenames.keys():
        cluster_label, _ = ld.load_clusters(filenames['cluster'])
        cluster_mask = {
            'train': np.in1d(cluster_label, cluster_Nos['train'])
                     if cluster_Nos['train'] is not None else None,
            'test': np.in1d(cluster_label, cluster_Nos['test'])
                    if cluster_Nos['test'] is not None else None
        }
    # load district info
    if 'district' in filenames.keys():
        district_label = ld.load_districts(filenames['district'])
        district_mask = {
            'train': np.in1d(district_label, district_Nos['train'])
                     if district_Nos['train'] is not None else None,
            'test': np.in1d(district_label, district_Nos['test'])
                    if district_Nos['test'] is not None else None
        }
    # load time interval info
    group_info = ld.load_train_test_group(filenames['group'])
    groups_train, groups_test = (group_info['train_groups'],
                                 group_info['test_groups'])

    sample_mask = {'train': [], 'test': []}

    M = None
    if mask_sample_region['train'] == 'cluster':
        M = cluster_mask['train']
    elif mask_sample_region['train'] == 'district':
        M = district_mask['train']
    if M is not None:  # skip when no region mask applies (e.g. whole city)
        for i in range(groups_train[0], groups_train[-1],
                       chunk_size['train']):
            if (i < groups_train[-1]
                    and i + chunk_size['train'] > groups_train[-1]):
                # the end chunk may have a smaller size
                sample_mask['train'].append(
                    np.tile(M, groups_train[-1] + 1 - i))
            else:
                sample_mask['train'].append(np.tile(M, chunk_size['train']))

    M = None
    if mask_sample_region['test'] == 'cluster':
        M = cluster_mask['test']
    elif mask_sample_region['test'] == 'district':
        M = district_mask['test']
    if M is not None:
        for i in range(groups_test[0], groups_test[-1], chunk_size['test']):
            if (i < groups_test[-1]
                    and i + chunk_size['test'] > groups_test[-1]):
                # the end chunk may have a smaller size
                sample_mask['test'].append(
                    np.tile(M, groups_test[-1] + 1 - i))
            else:
                sample_mask['test'].append(np.tile(M, chunk_size['test']))

    return sample_mask
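# A hypothetical call to pair_train_test_sample: restrict training samples to
# clusters 1 and 2 and test samples to district 7, in chunks of 10 time
# groups. The file names are placeholders.
demo_filenames = {'cluster': 'clusters.pkl',
                  'district': 'districts.pkl',
                  'group': 'train_test_groups.pkl'}
demo_masks = pair_train_test_sample(
    demo_filenames,
    mask_sample_region={'train': 'cluster', 'test': 'district'},
    cluster_Nos={'train': [1, 2], 'test': None},
    district_Nos={'train': None, 'test': [7]},
    chunk_size={'train': 10, 'test': 10})
# demo_masks['train'] is a list of boolean arrays, one per chunk, each tiling
# the per-grid region mask across the time groups in that chunk.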
if cluster_pkl != 'NA':
    filename_dict['cluster'] = cluster_pkl
if district_pkl != 'NA':
    filename_dict['district'] = district_pkl
# ----------------------------------------#
mask_sample_region = {'train': train_region, 'test': test_region}

start = time.time()
sample_mask = pair_train_test_sample(filename_dict, mask_sample_region,
                                     cluster_Nos, district_Nos, chunk_size)
if train_region == 'city':
    X_train, y_train = ld.load_struct_data_h5(train_data_list[0],
                                              target_crime, 'Label',
                                              split='train_city')
    sample_mask['train'] = np.ones(len(X_train)).astype(bool)
else:
    X_train_stacked = []
    y_train_stacked = []
    for i, fn in enumerate(train_data_list):
        X_train, y_train = ld.load_struct_data_h5(fn, target_crime, 'Label',
                                                  split='train_chunk')
        X_train_stacked.append(X_train[sample_mask['train'][i], :])
        y_train_stacked.append(y_train[sample_mask['train'][i]])
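# Presumed continuation (not shown in this excerpt): the per-chunk subsets
# would be merged into single training arrays. A standalone sketch of that
# merge with dummy data:
import numpy as np

demo_chunks_X = [np.zeros((4, 3)), np.ones((2, 3))]
demo_chunks_y = [np.zeros(4), np.ones(2)]
demo_X = np.vstack(demo_chunks_X)       # shape (6, 3)
demo_y = np.concatenate(demo_chunks_y)  # shape (6,)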
filePath_save = outpath if outpath is not None else "../SharedData/FeatureData/"
# number of features parsed from the params string ("Nfeatures=<int>")
prop_threshold = int(re.search(r'(?<=Nfeatures=)(\d+)', params).group(1))
methods = ['univar (f-score)', 'RFE (logit)', 'rand-L1 (logit)', 'RF', 'GBM']
weights = np.ones(len(methods))
Kbest = None
colidx = ['avg rank', 'frac']
rank_voting_avgrank = collections.defaultdict(dict)  # a nested dict
rank_voting_prop = collections.defaultdict(dict)

feature_names, feature_ranks = ld.load_feature_ranks(featureRank_pkl)

# select by average ranks
sel_features, sel_idx, avg_ranks = select_by_avg_ranks(feature_ranks,
                                                       feature_names,
                                                       K_best=Kbest,
                                                       weights=weights)
rank_voting_avgrank = dict(selected_features=sel_features,
                           selected_idx=sel_idx,
                           feature_names=feature_names,
                           feature_scores=avg_ranks)

# select by the proportion of times each feature is ranked among the best
sel_features, sel_idx, frac = select_by_proportion(
    feature_ranks, feature_names, K_best=Kbest,
    prop_threshold=prop_threshold, weights=weights)
rank_voting_prop = dict(selected_features=sel_features, selected_idx=sel_idx,
                        feature_names=feature_names, feature_scores=frac)
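# select_by_avg_ranks is a project helper; a minimal sketch of the weighted
# average-rank voting it presumably implements (the exact weighting and tie
# handling here are assumptions):
import numpy as np

def select_by_avg_ranks_sketch(feature_ranks, feature_names, K_best=None,
                               weights=None):
    """feature_ranks: (n_methods, n_features) ranks, smaller = better."""
    ranks = np.asarray(feature_ranks, dtype=float)
    w = np.ones(ranks.shape[0]) if weights is None else np.asarray(weights)
    avg_ranks = np.average(ranks, axis=0, weights=w)  # weighted mean per feature
    order = np.argsort(avg_ranks)  # best (lowest) average rank first
    sel_idx = order if K_best is None else order[:K_best]
    sel_features = [feature_names[i] for i in sel_idx]
    return sel_features, sel_idx, avg_ranks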
RF_params = dict(bootstrap=True, oob_score=True, random_state=r_seed,
                 n_jobs=-1)
GBM_params = dict(n_estimators=200, max_depth=5, learning_rate=0.05,
                  subsample=0.9, max_features=0.5, min_samples_leaf=1,
                  random_state=r_seed)
params = dict(CVobj=CV_skf, pct_best=100,
              percentiles=np.linspace(10, 100, 10),
              Cs_l1=np.logspace(-2, 2, 9), C_l2=100,
              rand_L1_params=rand_L1_params, RF_params=RF_params,
              GBM_params=GBM_params, scaling='minmax', rand_seed=r_seed,
              plot=True, save_fig=True, show=False)

fig_categories = ['univar_rank', 'univar_CV', 'L1_path', 'L1_CV', 'L1_rank',
                  'feature_imp']
fig_savefile = {}
ranked_features = collections.defaultdict(dict)  # a nested dict
feature_ranks = collections.defaultdict(dict)  # a nested dict

# ************************ load data *****************************#
X_train, y_train, _, _, feature_names = ld.load_train_test_data(
    feature_pkl, 'h5', target_crime, 'Label')

# ****************** bagged feature selection ************************#
for fig_cat in fig_categories:
    fig_savefile[fig_cat] = (filePath_save + 'Figures/FeatureSelection/' +
                             target_crime + '/' + fig_cat + '.png')
params.update(dict(X=X_train, y=y_train, feature_names=feature_names,
                   fig_names=fig_savefile))

start = time.time()
ranked_features, feature_ranks = bag_feature_selection(**params)
end = time.time()
print(end - start)

# Save ranked features
# For each crime type, arrange ranked features in a dataframe where each
# feature name is a row index,
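# A self-contained sketch of the ensemble-ranking idea behind
# bag_feature_selection, reduced to two of the five methods listed earlier
# (univariate F-score and RF importance). The scikit-learn calls are real;
# the data and the reduction to two methods are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestClassifier

demo_X, demo_y = make_classification(n_samples=500, n_features=10,
                                     random_state=0)

# univariate F-score: higher score -> better -> lower rank
F, _ = f_classif(demo_X, demo_y)
rank_univar = np.argsort(np.argsort(-F))

# random forest feature importance, ranked the same way
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rank_rf = np.argsort(np.argsort(-rf.fit(demo_X, demo_y).feature_importances_))

# One row of ranks per method; the voting schemes above (average rank,
# proportion of times selected) consume a matrix like this.
demo_feature_ranks = np.vstack([rank_univar, rank_rf])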