def get_traffic_violations_df(save=True):
    """Load the traffic-violations CSV, clean it up and return it.

    The data location comes from ``fetch(TRAFFIC_VIOLATIONS_CONFIG)``: the
    first entry of its result is the directory that is listed to find the
    CSV file, the second entry is forwarded to ``write_df``.

    Cleaning steps, in order:

    * ``Year`` is passed through ``float_to_int``.
    * The literal string ``'NONE'`` in ``Make`` / ``Model`` becomes NaN.
    * Newline characters are removed from every string value.
    * ``VehicleType``, ``Arrest Type``, ``Race`` and ``Violation Type`` are
      converted to the ``category`` dtype.
    * Column names are lower-cased and spaces are replaced by underscores.

    Args:
        save: Forwarded unchanged to ``write_df`` (presumably toggles
            writing the cleaned frame to disk -- confirm in ``write_df``).

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    data_dir = fetch(TRAFFIC_VIOLATIONS_CONFIG)
    # Only the first directory entry is read.
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path)
    df['Year'] = float_to_int(df['Year'], df.index)

    # Builtin ``object`` replaces the ``np.object`` alias, which no longer
    # exists in NumPy >= 1.24.
    for c in ('Make', 'Model'):
        df[c] = pd.Series(
            [np.nan if elt == 'NONE' else elt for elt in df[c]],
            dtype=object, index=df.index)

    for c in df:
        # Numeric columns cannot hold strings, so rebuilding them would be
        # a no-op; skip them.
        if pd.api.types.is_numeric_dtype(df[c]):
            continue
        stripped = [
            elt.replace('\n', '') if isinstance(elt, str) else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(stripped, dtype=df[c].dtype, index=df.index)

    for c in ('VehicleType', 'Arrest Type', 'Race', 'Violation Type'):
        df[c] = df[c].astype('category')

    df.rename(
        columns={col: col.replace(' ', '_').lower() for col in df.columns},
        inplace=True)
    write_df(save, df, data_dir[1], TRAFFIC_VIOLATIONS_CONFIG.main_file)
    return df
Beispiel #2
0
def get_crime_df(save=True):
    """Load the crime-data CSV, normalise column types and return it.

    The data location comes from ``fetch(CRIME_DATA_CONFIG)``: the first
    entry of its result is the directory that is listed to find the CSV
    file, the second entry is forwarded to ``write_df``.

    Age and code columns are passed through ``float_to_int``. The
    descriptive columns are converted to the ``category`` dtype, going
    through ``float_to_int`` first when pandas read them as floats.

    Args:
        save: Forwarded unchanged to ``write_df`` (presumably toggles
            writing the cleaned frame to disk -- confirm in ``write_df``).

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    # FIXME: the download link for this dataset is dead.
    data_dir = fetch(CRIME_DATA_CONFIG)
    # Only the first directory entry is read.
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path)

    int_cols = ['Victim Age', 'Premise Code', 'Weapon Used Code',
                'Crime Code 1', 'Crime Code 2', 'Crime Code 3',
                'Crime Code 4']
    category_cols = ['Area Name', 'Victim Sex', 'Victim Descent',
                     'Premise Description', 'Weapon Description',
                     'Status Description', 'Crime Code Description']

    for c in int_cols:
        df[c] = float_to_int(df[c], df.index)

    for c in category_cols:
        if df[c].dtype == float:
            df[c] = float_to_int(df[c], df.index)
        df[c] = df[c].astype('category')

    write_df(save, df, data_dir[1], CRIME_DATA_CONFIG.main_file)
    return df
Beispiel #3
0
def get_road_safety_df(save=True):
    """Build the road-safety frame, clean it up and return it.

    The frame is produced by ``_process_df`` from the paths that
    ``_get_file_paths`` returns for the first entry of
    ``fetch(ROAD_SAFETY_CONFIG)``; the second entry is forwarded to
    ``write_df``.

    Cleaning steps, in order:

    * Every value listed in ``to_del`` (strings are compared
      case-insensitively) becomes NaN, and every column becomes ``object``
      dtype.
    * The columns in ``f_to_i`` are passed through ``float_to_int``.
    * String values in the ``str_to_i`` columns are converted with ``int``.
    * Columns whose only distinct value is NaN are dropped.

    Args:
        save: Forwarded unchanged to ``write_df`` (presumably toggles
            writing the cleaned frame to disk -- confirm in ``write_df``).

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        ValueError: If a string in a ``str_to_i`` column is not a valid
            integer literal.
    """
    data_dir = fetch(ROAD_SAFETY_CONFIG)
    files = _get_file_paths(data_dir[0])
    df = _process_df(files)
    f_to_i = ['1st_Road_Number', '2nd_Road_Number', 'Location_Easting_OSGR', 'Location_Northing_OSGR',
              'Number_of_Vehicles', 'Number_of_Casualties', 'Speed_limit', 'accyr', 'Engine_Capacity_(CC)_df',
              'Age_of_Vehicle_df']
    str_to_i = ['Vehicle_Reference', 'Vehicle_Reference_df', 'Vehicle_Reference_df_res']
    to_del = ['data missing or out of range', 'none', -1, 'unknown or other', 'not known', 'unclassified', 'unknown',
              'nan']

    # Builtin ``object`` replaces the ``np.object`` alias, which no longer
    # exists in NumPy >= 1.24.
    for c in df:
        df[c] = pd.Series(
            [np.nan
             if (isinstance(elt, str) and elt.lower() in to_del) or elt in to_del
             else elt
             for elt in df[c]],
            dtype=object, index=df.index)

    for c in f_to_i:
        df[c] = float_to_int(df[c], df.index)

    for c in str_to_i:
        df[c] = pd.Series(
            [int(elt) if isinstance(elt, str) else elt for elt in df[c]],
            dtype=object, index=df.index)

    # Collect first, drop once: avoids mutating the frame while looping
    # over it and computes ``unique()`` a single time per column.
    all_nan_cols = []
    for c in df:
        distinct = df[c].unique()
        if len(distinct) == 1 and str(distinct[0]) == 'nan':
            all_nan_cols.append(c)
    # ``columns=`` instead of a positional axis, which pandas >= 2.0 rejects.
    df.drop(columns=all_nan_cols, inplace=True)

    write_df(save, df, data_dir[1], ROAD_SAFETY_CONFIG.main_file)
    return df
Beispiel #4
0
def _clean_cols(cols, df):
    """Clean the given columns of ``df`` in place and return ``df``.

    What happens to a column depends on its name and dtype:

    * Name contains ``'Predominant'``: string values containing ``'None'``
      become NaN; the column is stored with ``object`` dtype.
    * Name contains ``'Mean Earnings'`` or ``'Median Earnings'``: string
      values containing ``'PrivacySuppressed'`` become NaN, every other
      string is converted with ``int``; the column is stored with
      ``object`` dtype.
    * Otherwise, if the column dtype is float, it is passed through
      ``float_to_int``.
    * Any other column is left untouched.

    Args:
        cols: Iterable of column names to process.
        df: The ``pandas.DataFrame`` to modify.

    Returns:
        The same ``df`` object, modified in place.

    Raises:
        ValueError: If an earnings string is neither suppressed nor a
            valid integer literal.
    """
    # Builtin ``object`` replaces the ``np.object`` alias, which no longer
    # exists in NumPy >= 1.24.
    for c in cols:
        if 'Predominant' in c:
            cleaned = [
                np.nan if isinstance(elt, str) and 'None' in elt else elt
                for elt in df[c]
            ]
            df[c] = pd.Series(cleaned, dtype=object, index=df.index)
        elif 'Mean Earnings' in c or 'Median Earnings' in c:
            cleaned = []
            for elt in df[c]:
                if not isinstance(elt, str):
                    cleaned.append(elt)
                elif 'PrivacySuppressed' in elt:
                    cleaned.append(np.nan)
                else:
                    cleaned.append(int(elt))
            df[c] = pd.Series(cleaned, dtype=object, index=df.index)
        elif df[c].dtype == float:
            df[c] = float_to_int(df[c], df.index)

    return df