Ejemplo n.º 1
0
def process(input_file, output_name, var_map, calc=None, agg_areas=True):
    """Clean an input table, optionally aggregate it, attach percentage
    columns, and export every result to CSV.

    Parameters
    ----------
    input_file : passed straight through to ``cd.clean_data``.
    output_name : str
        Prefix used for every exported CSV file name.
    var_map : dict
        Raw column names to keep (sorted) mapped to their output names.
    calc : callable, optional
        Extra transformation applied to the cleaned frame before
        aggregation.
    agg_areas : bool
        When True, also build area-level and whole-state aggregates.

    Returns
    -------
    tuple of DataFrame
        ``(munis, areas, state)`` with percentages when ``agg_areas``,
        otherwise the one-element tuple ``(munis,)``.
    """

    def _add_pct(frame):
        # Percentage columns cover every column except those listed in
        # GEO_COLUMNS plus 'area'; the first remaining column is passed
        # as the reference to pct.add_percentages.
        value_cols = [col for col in frame.columns.tolist()
                      if col not in GEO_COLUMNS + ['area']]
        return pct.add_percentages(frame, value_cols, value_cols[0])

    def _export(frame, suffix, include_index=False):
        # File name pattern: <output_name>_<suffix>.csv
        path = '%s_%s.csv' % (output_name, suffix)
        frame.to_csv(path, index=include_index)
        print('Saved file: ' + path)

    # Clean the raw table and keep only the geographic + mapped columns,
    # renaming the latter to their output names.
    cleaned = cd.clean_data(input_file)
    munis = cleaned[GEO_COLUMNS + sorted(var_map.keys())]
    munis = munis.rename(columns=var_map)

    # Optional caller-supplied calculation step.
    if calc:
        munis = calc(munis)

    # Simple path: no aggregation requested.
    if not agg_areas:
        munis_pct = _add_pct(munis)
        _export(munis_pct, 'munis')
        return (munis_pct,)

    # Area-level aggregate plus a single whole-state row (agg_var that
    # maps everything to True collapses all rows into one group).
    areas = agg.aggregate(munis)
    state = agg.aggregate(munis, agg_var=(lambda x: True))

    # Attach percentage columns to each frame; the state frame drops its
    # 'area' column first.
    munis_pct = _add_pct(munis)
    areas_pct = _add_pct(areas)
    state_pct = _add_pct(state.drop('area', axis=1))

    _export(munis_pct, 'munis')
    _export(areas_pct, 'areas', include_index=True)
    _export(state_pct, 'state')
    return (munis_pct, areas_pct, state_pct)
import csv
import pandas as pd
import numpy as np
import kanonymize as ka
import cleandata as cd

# Load the raw Houston crime extract; 'utf_8_sig' tolerates a UTF-8 BOM
# at the start of the file.
df = pd.read_csv('HoustonCrimeData.csv', encoding='utf_8_sig', engine='python')
# Columns removed before cleaning/anonymization.
df = df.drop([
    'Occurrence Date', 'Occurrence Hour', 'NIBRS Class', 'Beat',
    'Offense Count', 'Suffix'
],
             axis=1)
df = cd.clean_data(df)
#print(df.head())
# Incident counts per offense description, sorted ascending by 'Incident'.
print(df.groupby('NIBRS Description').count().sort_values(['Incident']))
# In the algorithm, manually create the address field yourself.
#print(df.groupby('Address').count())
# NOTE(review): this prints the literal string 'df' — possibly meant
# print(df); confirm intent.
print('df')
# NOTE(review): k_anonymize's return value is discarded — presumably it
# mutates df in place; verify before relying on the exported CSV being
# anonymized.
ka.k_anonymize(df, 5)
df.to_csv("anonymized_data.csv")
Ejemplo n.º 3
0
import pandas as pd
import sys
sys.path.append('../')

import cleandata as cd

# Export municipality data: load the ACS B19013 table, keep the income
# estimate column (renamed to 'med_hh_inc'), drop 'HD02_VD01', and write
# the result to CSV without the index.
data = (cd.clean_data('ACS_14_5YR_B19013_with_ann.csv')
        .drop('HD02_VD01', axis=1)
        .rename(columns={'HD01_VD01': 'med_hh_inc'}))
data.to_csv('income_munis.csv', index=False)
Ejemplo n.º 4
0
import pandas as pd
import sys
sys.path.append('../')

import cleandata as cd


# Export municipality data from the ACS B19013 file: the estimate column
# HD01_VD01 becomes 'med_hh_inc' and HD02_VD01 is discarded.
data = cd.clean_data('ACS_14_5YR_B19013_with_ann.csv')
data = data.rename(columns={'HD01_VD01': 'med_hh_inc'})
data = data.drop('HD02_VD01', axis=1)
data.to_csv('income_munis.csv', index=False)
Ejemplo n.º 5
0
# NOTE(review): Run, ExplanationClient, TabularDatasetFactory, clean_data
# and train_test_split are imported outside this chunk — presumably the
# azureml SDK and scikit-learn; confirm against the file header.
run = Run.get_context()
client = ExplanationClient.from_run(run)

# Bank-marketing training data loaded directly from the public sample
# blob as a tabular dataset; column types are inferred from the CSV.
ds = TabularDatasetFactory.from_delimited_files(
    "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv",
    validate=True,
    include_path=False,
    infer_column_types=True,
    set_column_types=None,
    separator=',',
    header=True,
    partition_format=None,
    support_multi_line=False,
    empty_as_string=False)

# clean_data presumably returns (features, labels) — confirm against its
# definition.
x, y = clean_data(ds)

feature_names = list(x.columns)

# Hold out a third of the rows for evaluation; fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',