Example #1
def get_multiplier_table():
    """
    By mkiang

    for aggregating acs-features to station-level
    """
    import pandas as pd
    import utils.pg_tools as pg

    pgw = pg.PGWrangler(dbitems=pg.get_pgdict_from_cfg())
    conn = pgw.get_conn()

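    # For each census block group, compute the share of its area that falls
    # inside each station's response polygon (a value in [0, 1]).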
    multiplier = pd.read_sql_query(
        'select a.battalion as station_name, b.geoid, '
        'st_area(st_intersection(a.geom, b.geom)) / '
        'st_area(b.geom) as multiplier '
        'from tiger_shp.tl_2013_39_bg as b '
        'join luigi_raw_shp.cad_station_areas as a on '
        'st_intersects(a.geom, b.geom)', conn)

    conn.close()

    multiplier.set_index('geoid', inplace=True, drop=True)

    return multiplier
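A minimal sketch of how the returned table can be used to weight block-group ACS features up to station level. The toy multiplier and acs frames below are hypothetical stand-ins for the query result and real ACS data:

import pandas as pd

# Hypothetical stand-in for get_multiplier_table(): the fraction of each
# block group's area that falls inside each station's response area.
multiplier = pd.DataFrame(
    {'station_name': ['B01', 'B02'], 'multiplier': [0.75, 0.25]},
    index=pd.Index(['390010001001', '390010001001'], name='geoid'))

# Hypothetical block-group-level ACS feature, indexed by geoid.
acs = pd.DataFrame({'population': [1200]},
                   index=pd.Index(['390010001001'], name='geoid'))

# Weight the feature by area overlap, then sum up to station level.
joined = multiplier.join(acs, how='inner')
joined['population'] = joined['population'] * joined['multiplier']
station_totals = joined.groupby('station_name')[['population']].sum()
print(station_totals)  # population: B01 -> 900.0, B02 -> 300.0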
Example #2
def make_default_profile(filename, state, state_abbrev):
    """Write the Postgres connection settings plus state info to a profile
    of KEY=value lines."""
    postgres_dict = pg_tools.get_pgdict_from_cfg()
    with open(filename, 'w') as f:
        for k, v in postgres_dict.items():
            f.write("{}={}\n".format(k, v))
        f.write("STATE={}\n".format(state))
        f.write("STATE_ABBREV={}\n".format(state_abbrev))
Example #3
def make_json_df(m_table='model.lookup',
                 k_values=[5, 30, 60, 90],
                 return_raw=False,
                 subset='2016-08-17'):
    """Just a wrapper to get the json info out of postgres    """
    ## Make a connection
    pgw = pg.PGWrangler(dbitems=pg.get_pgdict_from_cfg())
    conn = pgw.get_conn()

    ## Download the raw data -- clean up json column
    raw_data = pd.read_sql_query('select * from {}'.format(m_table), conn)
    conn.close()
    raw_data.columns = raw_data.columns.str.lower()
    raw_data.json = raw_data.json.apply(json.loads)

    if subset:
        raw_data = raw_data[raw_data.timestamp >= subset].copy()
        raw_data.reset_index(inplace=True, drop=True)

    # Helpful for debugging
    if return_raw:
        return raw_data

    ## Extract and normalize the json from raw data -- reindex
    json_df = pd.concat(
        [pd.json_normalize(blob) for blob in raw_data.json])
    json_df.reset_index(inplace=True, drop=True)

    ## Clean up column names
    json_df.columns = json_df.columns.str.lower()
    json_df.columns = json_df.columns.str.replace(' ', '_')
    json_df.columns = json_df.columns.str.replace('&_', '')

    ## Make 2 columns for feature and importance, listed alphabetically
    json_df['features_list_alpha'] = json_df.model_feats_imps.apply(
        lambda x: [y[0] for y in x])
    json_df['importance_list_alpha'] = json_df.model_feats_imps.apply(
        lambda x: [y[1] for y in x])

    ## Convert this list of lists into a dictionary -- easier handling
    json_df.model_feats_imps = json_df.model_feats_imps.apply(
        lambda x: {y[0]: y[1]
                   for y in x})

    ## Now create a new features/importance column, sorted by abs(importance)
    json_df['features_sorted'] = json_df.model_feats_imps.apply(
        lambda x: sorted(x.items(), key=lambda kv: abs(kv[1]), reverse=True))

    ## Break out the precision, recall, AUC, and weighted score at each k
    ## (model_metrics is assumed to hold one (k, (p, r, auc, wsc)) entry per
    ## value in k_values, in the same order)
    for i, k in enumerate(k_values):
        json_df['p_' + str(k)] = json_df.model_metrics.apply(
            lambda x: x[i][1][0])
        json_df['r_' + str(k)] = json_df.model_metrics.apply(
            lambda x: x[i][1][1])
        json_df['auc_' + str(k)] = json_df.model_metrics.apply(
            lambda x: x[i][1][2])
        json_df['wsc_' + str(k)] = json_df.model_metrics.apply(
            lambda x: x[i][1][3])

    ## Hash the model and feature strings to make grouping easier
    json_df['feature_hash'] = json_df.features_list_alpha.apply(
        lambda x: hashlib.md5(' '.join(x).encode()).hexdigest())
    json_df['model_hash'] = json_df.model_spec.apply(
        lambda x: hashlib.md5(' '.join(x).encode()).hexdigest())

    ## Change the weights and model spec to strings to allow for grouping
    json_df['model_str'] = json_df.model_spec.apply(str)
    json_df['weights_str'] = json_df.weights.apply(str)

    ## Abbreviate classifier names and drop noisy default arguments
    for old, new in [('RandomForestClassifier', 'RF'),
                     ('LogisticRegression', 'LR'),
                     ('KNeighborsClassifier', 'KNN'),
                     ('ExtraTreesClassifier', 'ET'),
                     ('bootstrap=False, ', ''),
                     ('class_weight=None, ', '')]:
        json_df.model_str = json_df.model_str.str.replace(old, new,
                                                          regex=False)

    return json_df
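A hypothetical usage sketch, assuming model.lookup is populated; the column names come from the frame built above:

json_df = make_json_df(subset='2016-08-17')

# Top ten models by AUC at k=30
top_models = json_df.sort_values('auc_30', ascending=False).head(10)

# Mean precision at k=30 per model/feature-set combination
summary = json_df.groupby(['model_hash', 'feature_hash'])['p_30'].mean()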
Example #4
    name = model_name
    plt.title(name)
    plt.savefig(dir_name + name)
    # plt.show()


#### START REAL STUFF
## THIS IS FOR THE TEST DATA
if os.path.isfile('./all_new_model_scores.csv'):
    model_scores = pd.read_csv('./all_new_model_scores.csv', index_col=0)
    model_scores.index.rename('incident', inplace=True)
else:
    ## Get the truth
    pgw = pg.PGWrangler(dbitems=pg.get_pgdict_from_cfg())
    conn = pgw.get_conn()
    model_scores = pd.read_sql_query(
        'select incident, trns_to_hosp, code_type '
        'from semantic.master where time_year = 2015', conn)
    ## convert bool to int
    model_scores.trns_to_hosp = model_scores.trns_to_hosp.astype(int)

    ## create index and remap buckets
    model_scores.set_index('incident', inplace=True, drop=True)
    model_scores['bucket'] = model_scores.code_type.map(bucket_map).fillna(
        'other')

## THIS IS FOR THE TRAIN DATA
if os.path.isfile('./training_model_scores.csv'):
    train_scores = pd.read_csv('./training_model_scores.csv')