Example #1
import numpy as np

def get_eval_region_mask(region, region_num, nsamples=None,
                         regionlabel_file=None, regionmask_file=None):
    # Build the evaluation-region mask (with respect to the city vector grid).
    # `ld` is the project's data-loading module, imported elsewhere in the source.
    if region == 'city':
        # every grid cell in the city belongs to the evaluation region
        region_mask = np.ones(nsamples).astype(bool)
        mask_grdInRegion = None
    elif region == 'cluster':
        cluster_label, _ = ld.load_clusters(regionlabel_file)
        region_mask = np.in1d(cluster_label, region_num)
        mask_grdInRegion = ld.load_cluster_mask(regionmask_file, region_num)
    elif region == 'district':
        district_label = ld.load_districts(regionlabel_file)
        region_mask = np.in1d(district_label, region_num)
        mask_grdInRegion = ld.load_district_mask(regionmask_file, region_num)
    else:
        raise ValueError("region must be 'city', 'cluster' or 'district'")
    return region_mask, mask_grdInRegion
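A minimal usage sketch; the region numbers and pickle paths below are hypothetical placeholders:

# Hypothetical call: build the mask for police districts 1 and 7.
region_mask, mask_grdInRegion = get_eval_region_mask(
    'district', [1, 7],
    regionlabel_file='district_labels.pkl',  # placeholder path
    regionmask_file='district_masks.pkl')    # placeholder path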
Example #2
    eval_tuplist = re.findall(
        r'([a-zA-Z]+ )((?:\d*\.\d+\s*|\d+\s*){3})',
        eval_str)  # e.g. [('PAI ','0 1 100'), ('PEI ','0.1 0.5 50')]

    evalIdx_names = []
    areaPct = {}
    for eval_tup in eval_tuplist:
        interval_params = list(map(
            float, eval_tup[1].split()))  # convert string to list of numbers
        intervals = np.linspace(interval_params[0], interval_params[1],
                                int(interval_params[2]))  # num must be an int
        ev_idx = eval_tup[0].rstrip()
        evalIdx_names.append(ev_idx)
        areaPct[ev_idx] = intervals
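    # Illustrative note (not in the original source): for
    # eval_str = 'PAI 0 1 100 PEI 0.1 0.5 50', the loop above yields
    # evalIdx_names == ['PAI', 'PEI'], areaPct['PAI'] = np.linspace(0, 1, 100)
    # and areaPct['PEI'] = np.linspace(0.1, 0.5, 50).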

    #------------------ Load data ----------------------------------------
    _, grd_x, grd_y, _, mask_grdInCity, _ = ld.load_grid(grid_pkl)
    grid_2d = (grd_x, grd_y)

    groups_test = ld.load_train_test_group(group_pkl)['test_groups']

    CrimeData = ld.load_crime_data(crime_pkl)
    baseline_test = ld.load_baseline_data(baseline_pkl, target_crime)

    score_list = []
    for p_file, m_file in zip(predscore_filelist, samplemask_filelist):
        with open(m_file, 'rb') as input_file:
            samplemask = pickle.load(input_file)
        predscores = np.loadtxt(p_file, delimiter=',')
        predscores_city = np.tile(np.zeros(np.sum(mask_grdInCity)),
                                  len(groups_test))
        # assign prediction scores to the entire city
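        # Hypothetical continuation (the source is truncated here): scatter the
        # masked scores into the city-wide vector, then collect the result.
        predscores_city[samplemask] = predscores
        score_list.append(predscores_city)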
Example #3

    sample_mask = {}
    if train_region == 'city':
        filename_dict['train'] = train_data
        X_train, y_train = ld.load_struct_data_h5(filename_dict['train'],
                                                  target_crime,
                                                  'Label',
                                                  split='train')

        clf = get_model_params(clf_name, rand_seed=r_seed, X=X_train)['model']
        tuning_params = get_model_params(clf_name, rand_seed=r_seed,
                                         X=X_train)['tuning_params']
        tuning_param_dicts = dict([(clf_name + '__' + key, val)
                                   for key, val in tuning_params.items()])
        # train models
        fitting = model_fit(X_train, y_train, clf, clf_name,
                            tuning_param_dicts, CV_skf, scaling)
        model = fitting['model']
        cv_results = fitting['CV_result']
        best_params = fitting['best_param']
        print('CV results (' + train_region + train_region_num_str + '):')
        print(cv_results)
        print('Best_parameters (' + train_region + train_region_num_str + '):')
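The `model_fit` helper is not included in this excerpt. A minimal sketch of what such a wrapper could look like, assuming scikit-learn; the argument order, return keys, and the '<clf_name>__<param>' pipeline naming follow the call above, while the scaler choice and search settings are assumptions:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

def model_fit(X, y, clf, clf_name, tuning_param_dicts, cv, scaling):
    # Scale features (if requested), then grid-search the classifier's
    # tuning parameters over the supplied CV splitter.
    steps = ([('scaler', MinMaxScaler())] if scaling else []) + [(clf_name, clf)]
    search = GridSearchCV(Pipeline(steps), tuning_param_dicts, cv=cv, n_jobs=-1)
    search.fit(X, y)
    return {'model': search.best_estimator_,
            'CV_result': search.cv_results_,
            'best_param': search.best_params_}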
Example #4
def pair_train_test_sample(filenames,
                           mask_sample_region={'train': None, 'test': None},
                           cluster_Nos={'train': None, 'test': None},
                           district_Nos={'train': None, 'test': None},
                           chunk_size={'train': None, 'test': None}):
    """
    filenames: a dict with keys 'district'/'cluster' and 'group' with their filenames being the corresponding values
    """

    # load cluster info
    if 'cluster' in filenames:
        cluster_label, _ = ld.load_clusters(filenames['cluster'])
        cluster_mask = {
            'train': (np.in1d(cluster_label, cluster_Nos['train'])
                      if cluster_Nos['train'] is not None else None),
            'test': (np.in1d(cluster_label, cluster_Nos['test'])
                     if cluster_Nos['test'] is not None else None),
        }
    # load district info
    if 'district' in filenames:
        district_label = ld.load_districts(filenames['district'])
        district_mask = {
            'train': (np.in1d(district_label, district_Nos['train'])
                      if district_Nos['train'] is not None else None),
            'test': (np.in1d(district_label, district_Nos['test'])
                     if district_Nos['test'] is not None else None),
        }

    # load time interval info
    group_info = ld.load_train_test_group(filenames['group'])
    groups_train = group_info['train_groups']
    groups_test = group_info['test_groups']

    sample_mask = {'train': [], 'test': []}
    M = None
    if mask_sample_region['train'] == 'cluster':
        M = cluster_mask['train']
    elif mask_sample_region['train'] == 'district':
        M = district_mask['train']
    if M is not None:  # no per-region mask is needed for city-wide training
        for i in range(groups_train[0], groups_train[-1], chunk_size['train']):
            if i < groups_train[-1] and i + chunk_size['train'] > groups_train[-1]:
                # the final chunk may be smaller than chunk_size
                sample_mask['train'].append(np.tile(M, groups_train[-1] + 1 - i))
            else:
                sample_mask['train'].append(np.tile(M, chunk_size['train']))

    M = None
    if mask_sample_region['test'] == 'cluster':
        M = cluster_mask['test']
    elif mask_sample_region['test'] == 'district':
        M = district_mask['test']
    if M is not None:
        for i in range(groups_test[0], groups_test[-1], chunk_size['test']):
            if i < groups_test[-1] and i + chunk_size['test'] > groups_test[-1]:
                # the final chunk may be smaller than chunk_size
                sample_mask['test'].append(np.tile(M, groups_test[-1] + 1 - i))
            else:
                sample_mask['test'].append(np.tile(M, chunk_size['test']))

    return sample_mask
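A usage sketch with hypothetical file paths: train on clusters 2 and 3, test on district 7, building masks in chunks of 50 time groups:

# Hypothetical usage (file paths are placeholders).
filenames = {'cluster': 'clusters.pkl', 'district': 'districts.pkl',
             'group': 'train_test_groups.pkl'}
masks = pair_train_test_sample(
    filenames,
    mask_sample_region={'train': 'cluster', 'test': 'district'},
    cluster_Nos={'train': [2, 3], 'test': None},
    district_Nos={'train': None, 'test': [7]},
    chunk_size={'train': 50, 'test': 50})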
Example #5
    if cluster_pkl != 'NA':
        filename_dict['cluster'] = cluster_pkl
    if district_pkl != 'NA':
        filename_dict['district'] = district_pkl

    #----------------------------------------#
    mask_sample_region = {'train': train_region, 'test': test_region}

    start = time.time()

    sample_mask = pair_train_test_sample(filename_dict, mask_sample_region,
                                         cluster_Nos, district_Nos, chunk_size)

    if train_region == 'city':
        X_train, y_train = ld.load_struct_data_h5(train_data_list[0],
                                                  target_crime,
                                                  'Label',
                                                  split='train_city')
        sample_mask['train'] = np.ones(len(X_train)).astype(bool)

    else:
        X_train_stacked = []
        y_train_stacked = []

        for i, fn in enumerate(train_data_list):
            X_train, y_train = ld.load_struct_data_h5(fn,
                                                      target_crime,
                                                      'Label',
                                                      split='train_chunk')

            X_train_stacked.append(X_train[sample_mask['train'][i], :])
            y_train_stacked.append(y_train[sample_mask['train'][i]])

    filePath_save = outpath if outpath is not None else "../SharedData/FeatureData/"

    # `params` is still a settings string here (it is rebound to a dict below).
    prop_threshold = int(re.search(r'(?<=Nfeatures=)(\d+)', params).group(1))

    methods = [
        'univar (f-score)', 'RFE (logit)', 'rand-L1 (logit)', 'RF', 'GBM'
    ]
    weights = np.ones(len(methods))
    Kbest = None

    colidx = ['avg rank', 'frac']

    rank_voting_avgrank = collections.defaultdict(dict)  # a nested dict
    rank_voting_prop = collections.defaultdict(dict)

    feature_names, feature_ranks = ld.load_feature_ranks(featureRank_pkl)

    # select by average ranks
    sel_features, sel_idx, avg_ranks = select_by_avg_ranks(feature_ranks,
                                                           feature_names,
                                                           K_best=Kbest,
                                                           weights=weights)
    rank_voting_avgrank = dict(selected_features=sel_features,
                               selected_idx=sel_idx,
                               feature_names=feature_names,
                               feature_scores=avg_ranks)

    # select by the proportion of times a feature appears among the top ranks
    sel_features, sel_idx, frac = select_by_proportion(feature_ranks,
                                                       feature_names,
                                                       K_best=Kbest,
                                                       prop_threshold=prop_threshold,
                                                       weights=weights)
    rank_voting_prop = dict(selected_features=sel_features,
                            selected_idx=sel_idx,
                            feature_names=feature_names,
                            feature_scores=frac)
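    # Neither select_by_avg_ranks nor select_by_proportion is defined in this
    # excerpt. A minimal sketch of the average-rank vote, assuming
    # `feature_ranks` is an (n_methods, n_features) array of per-method ranks
    # with lower = better (signature inferred from the call above; the
    # internals are an assumption):
    def select_by_avg_ranks(feature_ranks, feature_names, K_best=None,
                            weights=None):
        ranks = np.asarray(feature_ranks, dtype=float)
        # weighted mean rank of each feature across the selection methods
        avg_ranks = np.average(ranks, axis=0, weights=weights)
        order = np.argsort(avg_ranks)  # best (lowest) average rank first
        sel_idx = order if K_best is None else order[:K_best]
        sel_features = [feature_names[i] for i in sel_idx]
        return sel_features, sel_idx, avg_ranks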
    RF_params = dict(  # leading arguments lost in this excerpt
                     bootstrap=True, oob_score=True, random_state=r_seed, n_jobs=-1)
    GBM_params = dict(n_estimators=200, max_depth=5, learning_rate=0.05, subsample=0.9,
                      max_features=0.5, min_samples_leaf=1, random_state=r_seed)
    params = dict(CVobj=CV_skf, pct_best=100, percentiles=np.linspace(10, 100, 10),
                  Cs_l1=np.logspace(-2, 2, 9), C_l2=100, rand_L1_params=rand_L1_params,
                  RF_params=RF_params, GBM_params=GBM_params, scaling='minmax',
                  rand_seed=r_seed, plot=True, save_fig=True, show=False)

    fig_categories = ['univar_rank', 'univar_CV', 'L1_path', 'L1_CV',
                      'L1_rank', 'feature_imp']
    fig_savefile = {}

    ranked_features = collections.defaultdict(dict) # a nested dict
    feature_ranks = collections.defaultdict(dict) # a nested dict
    
    # ************************ load data *****************************#
    X_train, y_train, _, _, feature_names = ld.load_train_test_data(
        feature_pkl, 'h5', target_crime, 'Label')

    
    # ****************** bagged feature selection ************************#
    for fig_cat in fig_categories:
        fig_savefile[fig_cat] = (filePath_save + 'Figures/FeatureSelection/' +
                                 target_crime + '/' + fig_cat + '.png')
    
    params.update(dict(X=X_train, y=y_train, feature_names=feature_names,
                       fig_names=fig_savefile))
    start = time.time()
    ranked_features, feature_ranks = bag_feature_selection(**params)
    end = time.time()
    print(end - start)

    # Save ranked features.

    # For each crime type, arrange the ranked features in a dataframe whose
    # row index is the feature name,
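    # The snippet is cut off above; a sketch of the arrangement the comment
    # describes, assuming `feature_ranks` maps each selection method to a rank
    # vector aligned with `feature_names` (the structure and the CSV filename
    # are hypothetical):
    import pandas as pd
    rank_df = pd.DataFrame({m: pd.Series(r, index=feature_names)
                            for m, r in feature_ranks.items()})
    rank_df.to_csv(filePath_save + target_crime + '_feature_ranks.csv')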