def get_traffic_violations_df(save=True):
    """Load the traffic violations CSV, clean it and return it as a DataFrame.

    Cleaning steps, in order:

    * ``Year`` is passed through ``float_to_int``.
    * The literal ``'NONE'`` in ``Make`` and ``Model`` becomes ``NaN``.
    * Newline characters are removed from every string cell.
    * ``VehicleType``, ``Arrest Type``, ``Race`` and ``Violation Type`` are
      converted to the ``category`` dtype.
    * Column names are lower-cased and their spaces replaced by underscores.

    Parameters
    ----------
    save : bool, default True
        Forwarded as the first argument of ``write_df`` (presumably controls
        whether the cleaned frame is persisted -- confirm in ``write_df``).

    Returns
    -------
    pandas.DataFrame
        The cleaned data.
    """
    # NOTE(review): fetch() appears to return a pair whose first item is the
    # directory holding the raw file and whose second item is handed to
    # write_df() -- confirm against fetch().
    data_dir = fetch(TRAFFIC_VIOLATIONS_CONFIG)
    # Only the first entry of the directory listing is read.
    file_name = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file_name)
    df = pd.read_csv(csv_path)
    df['Year'] = float_to_int(df['Year'], df.index)

    # 'NONE' is a placeholder for a missing value in these columns.
    # The builtin ``object`` is used because the ``np.object`` alias was
    # removed from NumPy.
    for c in ('Make', 'Model'):
        cleaned = [np.nan if elt == 'NONE' else elt for elt in df[c]]
        df[c] = pd.Series(cleaned, dtype=object, index=df.index)

    # Remove embedded newlines from string cells; non-string cells are kept
    # untouched and each column keeps its dtype.
    for c in df:
        stripped = [
            elt.replace('\n', '') if isinstance(elt, str) else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(stripped, dtype=df[c].dtype, index=df.index)

    for c in ('VehicleType', 'Arrest Type', 'Race', 'Violation Type'):
        df[c] = df[c].astype('category')

    df.rename(
        columns={col: col.replace(' ', '_').lower() for col in df.columns},
        inplace=True)
    write_df(save, df, data_dir[1], TRAFFIC_VIOLATIONS_CONFIG.main_file)
    return df
def get_crime_df(save=True):
    """Load the crime data CSV, clean it and return it as a DataFrame.

    A fixed set of numeric columns is passed through ``float_to_int`` and a
    fixed set of descriptive columns is converted to the ``category`` dtype
    (after ``float_to_int`` when the column is stored as float).

    Parameters
    ----------
    save : bool, default True
        Forwarded as the first argument of ``write_df`` (presumably controls
        whether the cleaned frame is persisted -- confirm in ``write_df``).

    Returns
    -------
    pandas.DataFrame
        The cleaned data.
    """
    # FIXME dead link :s
    # NOTE(review): fetch() appears to return a pair whose first item is the
    # directory holding the raw file and whose second item is handed to
    # write_df() -- confirm against fetch().
    data_dir = fetch(CRIME_DATA_CONFIG)
    # Only the first entry of the directory listing is read.
    file_name = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file_name)
    df = pd.read_csv(csv_path)

    int_cols = ['Victim Age', 'Premise Code', 'Weapon Used Code',
                'Crime Code 1', 'Crime Code 2', 'Crime Code 3',
                'Crime Code 4']
    category_cols = ['Area Name', 'Victim Sex', 'Victim Descent',
                     'Premise Description', 'Weapon Description',
                     'Status Description', 'Crime Code Description']

    for c in int_cols:
        df[c] = float_to_int(df[c], df.index)

    for c in category_cols:
        # Float-typed columns go through float_to_int before becoming
        # categories.
        if df[c].dtype == float:
            df[c] = float_to_int(df[c], df.index)
        df[c] = df[c].astype('category')

    write_df(save, df, data_dir[1], CRIME_DATA_CONFIG.main_file)
    return df
def get_road_safety_df(save=True):
    """Build, clean and return the road safety DataFrame.

    Cleaning steps, in order:

    * Every cell matching a known "missing" marker (case-insensitively for
      strings, or the number ``-1``) becomes ``NaN``; all columns are rebuilt
      with ``object`` dtype.
    * A fixed list of numeric columns is passed through ``float_to_int``.
    * String cells of the vehicle-reference columns are parsed with ``int``.
    * Columns whose only distinct value is ``NaN`` are dropped.

    Parameters
    ----------
    save : bool, default True
        Forwarded as the first argument of ``write_df`` (presumably controls
        whether the cleaned frame is persisted -- confirm in ``write_df``).

    Returns
    -------
    pandas.DataFrame
        The cleaned data.

    Raises
    ------
    ValueError
        If a string cell of a vehicle-reference column is not a valid
        integer literal (raised by ``int``).
    """
    # NOTE(review): fetch() appears to return a pair whose first item is the
    # raw-data location and whose second item is handed to write_df() --
    # confirm against fetch().
    data_dir = fetch(ROAD_SAFETY_CONFIG)
    files = _get_file_paths(data_dir[0])
    df = _process_df(files)

    f_to_i = ['1st_Road_Number', '2nd_Road_Number', 'Location_Easting_OSGR',
              'Location_Northing_OSGR', 'Number_of_Vehicles',
              'Number_of_Casualties', 'Speed_limit', 'accyr',
              'Engine_Capacity_(CC)_df', 'Age_of_Vehicle_df']
    str_to_i = ['Vehicle_Reference', 'Vehicle_Reference_df',
                'Vehicle_Reference_df_res']
    # Markers treated as missing values; strings are compared lower-cased.
    to_del = ['data missing or out of range', 'none', -1, 'unknown or other',
              'not known', 'unclassified', 'unknown', 'nan']

    # The builtin ``object`` is used because the ``np.object`` alias was
    # removed from NumPy.
    for c in df:
        cleaned = [
            np.nan
            if (isinstance(elt, str) and elt.lower() in to_del)
            or elt in to_del
            else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(cleaned, dtype=object, index=df.index)

    for c in f_to_i:
        df[c] = float_to_int(df[c], df.index)

    for c in str_to_i:
        parsed = [int(elt) if isinstance(elt, str) else elt for elt in df[c]]
        df[c] = pd.Series(parsed, dtype=object, index=df.index)

    # Collect columns whose single distinct value is NaN, then drop them in
    # one call. ``columns=`` is used because pandas no longer accepts the
    # axis as a positional argument of ``drop``.
    empty_cols = []
    for c in df:
        uniques = df[c].unique()
        if len(uniques) == 1 and str(uniques[0]) == 'nan':
            empty_cols.append(c)
    df.drop(columns=empty_cols, inplace=True)

    write_df(save, df, data_dir[1], ROAD_SAFETY_CONFIG.main_file)
    return df
def _clean_cols(cols, df): for c in cols: tab = [] if 'Predominant' in c: for elt in df[c]: if isinstance(elt, str) and 'None' in elt: tab.append(np.nan) else: tab.append(elt) df[c] = pd.Series(tab, dtype=np.object, index=df.index) elif 'Mean Earnings' in c or 'Median Earnings' in c: for elt in df[c]: if isinstance(elt, str) and 'PrivacySuppressed' in elt: tab.append(np.nan) elif isinstance(elt, str): tab.append(int(elt)) else: tab.append(elt) df[c] = pd.Series(tab, dtype=np.object, index=df.index) elif df[c].dtype == float: df[c] = float_to_int(df[c], df.index) return df