# Expects local helper modules: `cleandata` (cd), plus `pct` (percentage
# helpers), `agg` (aggregation helpers), and the GEO_COLUMNS constant,
# which are assumed to be imported/defined elsewhere in this module.
import cleandata as cd


def process(input_file, output_name, var_map, calc=None, agg_areas=True):

    def _add_pct(data_frame):
        # Compute percentages over every column except the geographic
        # identifiers and the 'area' column.
        var_list = data_frame.columns.tolist()
        for var in GEO_COLUMNS + ['area']:
            if var in var_list:
                var_list.remove(var)
        return pct.add_percentages(data_frame, var_list, var_list[0])

    def _export(data_frame, suffix, include_index=False):
        full_name = output_name + '_' + suffix + '.csv'
        data_frame.to_csv(full_name, index=include_index)
        print('Saved file: ' + full_name)

    # Clean municipality data
    data = cd.clean_data(input_file)
    data_new = data[GEO_COLUMNS + sorted(var_map.keys())]
    data_new = data_new.rename(columns=var_map)

    # Perform any extra necessary calculations
    if calc:
        data_new = calc(data_new)

    # Aggregate municipalities into areas, and into a single statewide row
    if agg_areas:
        data_agg = agg.aggregate(data_new)
        data_ri = agg.aggregate(data_new, agg_var=(lambda x: True))

    # Calculate percentages
    data_new_w_pct = _add_pct(data_new)
    if agg_areas:
        data_agg_w_pct = _add_pct(data_agg)
        data_ri_w_pct = _add_pct(data_ri.drop('area', axis=1))

    # Export to CSV
    _export(data_new_w_pct, 'munis')
    if agg_areas:
        _export(data_agg_w_pct, 'areas', include_index=True)
        _export(data_ri_w_pct, 'state')
        return (data_new_w_pct, data_agg_w_pct, data_ri_w_pct)
    else:
        return (data_new_w_pct,)
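# --- Usage sketch for process() (illustrative only) ---
# The ACS file name, variable codes, and the calc callback below are
# hypothetical assumptions chosen to show the call shape, not values
# from this project.

def _calc_total(df):
    # Example extra calculation passed via the `calc` hook: add a
    # derived total column from the two renamed variables.
    df['total_units'] = df['owner_occ'] + df['renter_occ']
    return df

frames = process(
    'ACS_14_5YR_B25003_with_ann.csv',   # hypothetical input file
    'tenure',                           # output file-name prefix
    {'HD01_VD02': 'owner_occ',          # hypothetical var_map
     'HD01_VD03': 'renter_occ'},
    calc=_calc_total,
)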
import pandas as pd
import kanonymize as ka   # local k-anonymization helper module
import cleandata as cd    # local data-cleaning helper module

df = pd.read_csv('HoustonCrimeData.csv', encoding='utf_8_sig', engine='python')

# Drop columns that are not needed for anonymization
df = df.drop([
    'Occurrence Date', 'Occurrence Hour', 'NIBRS Class',
    'Beat', 'Offense Count', 'Suffix'
], axis=1)

df = cd.clean_data(df)

# print(df.head())
print(df.groupby('NIBRS Description').count().sort_values(['Incident']))

# In the algorithm, manually create the address field yourself.
# print(df.groupby('Address').count())

ka.k_anonymize(df, 5)
df.to_csv("anonymized_data.csv")
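# --- Sanity-check sketch (not part of the original pipeline) ---
# k-anonymity requires that every combination of quasi-identifier
# values appears at least k times. The column names in the example
# call are assumptions; substitute the real quasi-identifiers.
import pandas as pd

def is_k_anonymous(frame, quasi_identifiers, k):
    """Return True if every quasi-identifier group has at least k rows."""
    group_sizes = frame.groupby(quasi_identifiers).size()
    return bool((group_sizes >= k).all())

# Example (hypothetical quasi-identifier columns):
# print(is_k_anonymous(df, ['Block Range', 'Street Name'], 5))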
import pandas as pd
import sys
sys.path.append('../')
import cleandata as cd

# Export municipality data
data = cd.clean_data('ACS_14_5YR_B19013_with_ann.csv')
data = data.drop('HD02_VD01', axis=1)
data = data.rename(columns={'HD01_VD01': 'med_hh_inc'})
data.to_csv('income_munis.csv', index=False)
# Imports reconstructed for this fragment; the exact import paths are
# assumptions based on the azureml-core, azureml-interpret, and
# scikit-learn packages.
import argparse
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.interpret import ExplanationClient
from sklearn.model_selection import train_test_split

run = Run.get_context()
client = ExplanationClient.from_run(run)

# Load the bank-marketing training data directly from blob storage
ds = TabularDatasetFactory.from_delimited_files(
    "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv",
    validate=True,
    include_path=False,
    infer_column_types=True,
    set_column_types=None,
    separator=',',
    header=True,
    partition_format=None,
    support_multi_line=False,
    empty_as_string=False)

# clean_data is defined elsewhere in this script
x, y = clean_data(ds)
feature_names = list(x.columns)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42)


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--C',
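# --- Hedged sketch only: the fragment above cuts off mid-argument. ---
# In scripts of this shape, '--C' is typically the inverse
# regularization strength for a scikit-learn LogisticRegression.
# Everything below (the defaults, the '--max_iter' flag, and the
# accuracy metric) is an assumption, not the original script.
import argparse
from sklearn.linear_model import LogisticRegression

def sketch_main(x_train, x_test, y_train, y_test):
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help='Inverse of regularization strength')
    parser.add_argument('--max_iter', type=int, default=100,
                        help='Maximum iterations for the solver')
    args = parser.parse_args()

    # Fit and score a logistic regression with the parsed hyperparameters
    model = LogisticRegression(C=args.C, max_iter=args.max_iter)
    model.fit(x_train, y_train)
    print('accuracy:', model.score(x_test, y_test))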