def get_federal_election_df(save=True):
    """Load the federal election contributions file and clean it.

    The raw pipe-separated file ``itcont.txt`` has no header row; column
    names are taken from the separately fetched ``indiv_header_file.csv``.

    Processing steps, in order:

    * ``TRANSACTION_AMT`` is replaced by the natural log of its absolute
      value, and only rows whose log-amount is strictly positive are kept.
    * Column names are lower-cased.
    * ``zip_code`` is cast to ``str`` and ``memo_text`` to ``category``.
    * A stray ``{`` is removed from the ``city`` value of row label 1378568.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.

    Raises
    ------
    KeyError
        If row label 1378568 is not present after filtering.
    """
    # data
    data_dir = fetch(FEDERAL_ELECTION_CONFIG)
    csv_path = os.path.join(data_dir[0], "itcont.txt")
    # header
    data_dir_header = fetch(FEDERAL_ELECTION_HEADER_CONFIG)
    csv_path_header = os.path.join(data_dir_header[0], "indiv_header_file.csv")
    df_header = pd.read_csv(csv_path_header)

    df = pd.read_csv(csv_path, sep='|', encoding='latin1',
                     header=None, names=df_header.columns)
    # Some donations are negative
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].abs()
    # Predicting the log of the donation
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].apply(np.log)
    # Filter, then rename without ``inplace`` so the result is a fresh frame
    # rather than a copy-flagged slice that later assignments would warn on.
    df = df[df['TRANSACTION_AMT'] > 0].rename(
        columns={col: col.lower() for col in df.columns})
    df['zip_code'] = df['zip_code'].astype(str)
    # Single .loc call on the frame: the chained form
    # ``df['city'].loc[...] = ...`` may assign to a temporary Series.
    bad_row = 1378568
    df.loc[bad_row, 'city'] = re.sub('{', '', df.loc[bad_row, 'city'])
    df['memo_text'] = df['memo_text'].astype('category')
    write_df(save, df, data_dir[1], FEDERAL_ELECTION_CONFIG.main_file)
    return df
def get_open_payment_df(save=True):
    """Build the open payments table.

    The fetched directory is expanded with ``_get_file_paths`` and the
    resulting paths are combined by ``_process_df``. The
    ``Physician_Specialty`` column is then cast to ``category``.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``_process_df`` with the categorical cast
        applied.
    """
    fetched = fetch(OPEN_PAYMENTS_CONFIG)
    df = _process_df(_get_file_paths(fetched[0]))

    specialty = df['Physician_Specialty']
    df['Physician_Specialty'] = specialty.astype('category')

    write_df(save, df, fetched[1], OPEN_PAYMENTS_CONFIG.main_file)
    return df
def get_traffic_violations_df(save=True):
    """Load the traffic violations CSV and clean it.

    Processing steps, in order:

    * ``Year`` is passed through ``float_to_int``.
    * The literal string ``'NONE'`` in ``Make`` and ``Model`` becomes NaN.
    * Newline characters are removed from every string cell.
    * ``VehicleType``, ``Arrest Type``, ``Race`` and ``Violation Type`` are
      cast to ``category``.
    * Column names are lower-cased with spaces replaced by underscores.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    data_dir = fetch(TRAFFIC_VIOLATIONS_CONFIG)
    # NOTE(review): takes the first directory entry; os.listdir order is
    # arbitrary, so this assumes the directory holds a single file.
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path)
    df['Year'] = float_to_int(df['Year'], df.index)

    for c in ('Make', 'Model'):
        cleaned = [np.nan if elt == 'NONE' else elt for elt in df[c]]
        # Builtin ``object``: the ``np.object`` alias was removed from NumPy.
        df[c] = pd.Series(cleaned, dtype=object, index=df.index)

    for c in df:
        stripped = [
            elt.replace('\n', '') if isinstance(elt, str) else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(stripped, dtype=df[c].dtype, index=df.index)

    for c in ('VehicleType', 'Arrest Type', 'Race', 'Violation Type'):
        df[c] = df[c].astype('category')

    df.rename(
        columns={col: re.sub(' ', '_', col).lower() for col in df.columns},
        inplace=True)
    write_df(save, df, data_dir[1], TRAFFIC_VIOLATIONS_CONFIG.main_file)
    return df
def get_public_procurement_df(save=True):
    """Load the public procurement CSV and coerce its dirty numeric columns.

    Processing steps, in order:

    * ``ID_LOT``: the value ``'Zp 2130-64/15'`` becomes NaN, then the column
      is cast to float.
    * ``CRIT_PRICE_WEIGHT``: ``'50 points'`` and any value containing ``%``
      become NaN, then the column is cast to float.
    * ``ID_LOT_AWARDED``: values that ``float()`` cannot parse become NaN,
      then the column is cast to float.
    * ``CONTRACT_NUMBER`` is set to NaN for row labels 39165 and 39164.
    * Column names are lower-cased.
    * ``cae_name`` is cast to ``str`` and truncated to 1023 characters.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    # FIXME df.shape = (565163, 75) != from paper
    # FIXME nb category cae_name = 39623 != from paper
    # FIXME cae_name become str rather than category
    # (openml requirements)
    data_dir = fetch(PUBLIC_PROCUREMENT_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path)

    df.loc[df['ID_LOT'] == 'Zp 2130-64/15', 'ID_LOT'] = np.nan
    df['ID_LOT'] = df['ID_LOT'].astype(float)

    df.loc[df['CRIT_PRICE_WEIGHT'] == '50 points',
           'CRIT_PRICE_WEIGHT'] = np.nan
    has_percent = ['%' in str(price)
                   for price in df['CRIT_PRICE_WEIGHT'].values]
    df.loc[has_percent, 'CRIT_PRICE_WEIGHT'] = np.nan
    df['CRIT_PRICE_WEIGHT'] = df['CRIT_PRICE_WEIGHT'].astype(float)

    # Positions from enumerate are used as row labels below; this relies on
    # the default 0..n-1 index that read_csv produces without index_col.
    row_typo = []
    for row, id_lot in enumerate(df['ID_LOT_AWARDED']):
        try:
            float(id_lot)
        except (TypeError, ValueError):
            # Only parse failures are caught; a bare ``except`` would also
            # swallow KeyboardInterrupt and SystemExit.
            row_typo.append(row)
    df.loc[row_typo, 'ID_LOT_AWARDED'] = np.nan  # 345 over 565163
    df['ID_LOT_AWARDED'] = df['ID_LOT_AWARDED'].astype(float)

    df.loc[[39165, 39164], 'CONTRACT_NUMBER'] = np.nan
    df.rename(columns={col: col.lower() for col in df.columns}, inplace=True)

    # df['cae_name'] = df['cae_name'].astype('category')
    df['cae_name'] = df['cae_name'].astype(str)
    tronq_cae = [str(x)[:1023] for x in df['cae_name']]
    df['cae_name'] = pd.Series(tronq_cae, dtype=df['cae_name'].dtype,
                               index=df.index)
    write_df(save, df, data_dir[1], PUBLIC_PROCUREMENT_CONFIG.main_file)
    return df
def get_midwest_survey_df(save=True):
    """Load the midwest survey CSV, indexed by ``RespondentID``.

    The frame read from the first entry of the fetched directory is passed
    through ``merge_columns`` before being handed to ``write_df``.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The result of ``merge_columns`` applied to the raw table.
    """
    fetched = fetch(MIDWEST_SURVEY_CONFIG)
    raw_dir = fetched[0]
    first_entry = os.listdir(raw_dir)[0]

    raw = pd.read_csv(os.path.join(raw_dir, first_entry),
                      index_col='RespondentID')
    df = merge_columns(raw)

    write_df(save, df, fetched[1], MIDWEST_SURVEY_CONFIG.main_file)
    return df
def get_midwest_survey_df(save=True):
    """Load the midwest survey CSV, indexed by ``RespondentID``.

    The raw table is passed through ``merge_columns`` and the column
    ``'Location (Census Region)'`` is renamed to ``'Location_Census_Region'``.
    The rename is applied before ``write_df`` is called, so the frame handed
    to ``write_df`` and the returned frame carry the same column names.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The merged table with the renamed column.
    """
    data_dir = fetch(MIDWEST_SURVEY_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path, index_col='RespondentID')
    df = merge_columns(df)
    # Rename before writing; previously only the returned frame was renamed.
    df.rename(
        columns={'Location (Census Region)': 'Location_Census_Region'},
        inplace=True)
    write_df(save, df, data_dir[1], MIDWEST_SURVEY_CONFIG.main_file)
    return df
def get_drug_discovery_df(save=True):
    """Load the drug discovery table from a tab-separated, latin1 file.

    The second entry returned by ``os.listdir`` on the fetched directory is
    read, and the ``DRG Definition`` and ``Provider State`` columns are cast
    to ``category``.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the categorical casts applied.
    """
    fetched = fetch(DRUG_DISCOVERY_CONFIG)
    raw_dir = fetched[0]
    # NOTE(review): index 1 of os.listdir depends on directory listing
    # order, which is arbitrary — confirm which file is intended.
    chosen = os.listdir(raw_dir)[1]
    df = pd.read_csv(os.path.join(raw_dir, chosen), sep='\t',
                     encoding='latin1')

    # NOTE(review): these column names are the same ones used by the medical
    # charge loader — verify they exist in this dataset.
    for column in ('DRG Definition', 'Provider State'):
        df[column] = df[column].astype('category')

    write_df(save, df, fetched[1], DRUG_DISCOVERY_CONFIG.main_file)
    return df
def get_medical_charge_df(save=True):
    """Load the medical charge CSV and mark its categorical columns.

    The first ``*.csv`` file found in the fetched directory is read, and the
    ``DRG Definition`` and ``Provider State`` columns are cast to
    ``category``.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with the categorical casts applied.

    Raises
    ------
    IndexError
        If the fetched directory contains no ``*.csv`` file.
    """
    data_dir = fetch(MEDICAL_CHARGE_CONFIG)
    # glob.glob returns paths that already include data_dir[0]; joining the
    # directory a second time duplicates it when the directory is relative.
    csv_path = glob.glob(os.path.join(data_dir[0], '*.csv'))[0]
    df = pd.read_csv(csv_path)
    cat_cols = ['DRG Definition', 'Provider State']
    for c in cat_cols:
        df[c] = df[c].astype('category')
    write_df(save, df, data_dir[1], MEDICAL_CHARGE_CONFIG.main_file)
    return df
def get_medical_charge_df(save=True):
    """Load the medical charge CSV and normalise its column names.

    The second entry returned by ``os.listdir`` on the fetched directory is
    read as a comma-separated file. ``DRG Definition`` and
    ``Provider State`` are cast to ``category``; afterwards every column name
    is lower-cased with spaces replaced by underscores.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with casts and renamed columns.
    """
    fetched = fetch(MEDICAL_CHARGE_CONFIG)
    raw_dir = fetched[0]
    # NOTE(review): index 1 of os.listdir depends on directory listing
    # order, which is arbitrary — confirm which file is intended.
    chosen = os.listdir(raw_dir)[1]
    df = pd.read_csv(os.path.join(raw_dir, chosen), sep=',')

    for column in ('DRG Definition', 'Provider State'):
        df[column] = df[column].astype('category')

    new_names = {}
    for old_name in df.columns:
        new_names[old_name] = re.sub(' ', '_', old_name).lower()
    df.rename(columns=new_names, inplace=True)

    write_df(save, df, fetched[1], MEDICAL_CHARGE_CONFIG.main_file)
    return df
def get_beer_reviews_df(save=True):
    """Load the beer reviews CSV and replace non-breaking spaces.

    Every string cell has ``'\\xa0'`` replaced by a regular space. Each
    column is rebuilt with its original dtype and the frame's index.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    fetched = fetch(BEER_REVIEWS_CONFIG)
    raw_dir = fetched[0]
    first_entry = os.listdir(raw_dir)[0]
    df = pd.read_csv(os.path.join(raw_dir, first_entry))

    for column in df:
        # replace() is a no-op on strings without '\xa0', so applying it to
        # every string matches replacing only where the character occurs.
        cells = [
            cell.replace('\xa0', ' ') if isinstance(cell, str) else cell
            for cell in df[column]
        ]
        df[column] = pd.Series(cells, dtype=df[column].dtype, index=df.index)

    write_df(save, df, fetched[1], BEER_REVIEWS_CONFIG.main_file)
    return df
def get_met_objects_df(save=True):
    """Load the MET objects CSV and clean it.

    Processing steps, in order:

    * In every string cell, ``'\\r\\n'`` and ``'\\x1e'`` are removed and
      ``'\\u3000'`` is replaced by a regular space.
    * Non-string values in ``Period`` become NaN.
    * In the columns listed in ``clean``, the placeholders ``'|'``, ``'||'``
      and ``'(none assigned)'`` become NaN.
    * ``Department``, ``Dynasty`` and ``State`` are cast to ``category``.
    * Column names are lower-cased with spaces replaced by underscores.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    data_dir = fetch(MET_OBJECTS_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path, encoding='utf-8')

    cat_cols = ['Department', 'Dynasty', 'State']
    # 'Geography Type' was listed twice; the cleaning is idempotent, so a
    # single pass gives the same result.
    clean = [
        'Geography Type', 'State', 'Classification', 'Artist Role',
        'Artist Prefix', 'Artist Display Bio', 'Artist Suffix',
    ]
    placeholders = ('|', '||', '(none assigned)')

    for c in df:
        arr = [
            elt.replace('\r\n', '').replace('\u3000', ' ').replace('\x1e', '')
            if isinstance(elt, str) else elt
            for elt in df[c]
        ]
        df[c] = pd.Series(arr, dtype=df[c].dtype, index=df.index)

    period = [elt if isinstance(elt, str) else np.nan for elt in df['Period']]
    # Builtin ``object``: the ``np.object`` alias was removed from NumPy.
    df['Period'] = pd.Series(period, dtype=object, index=df.index)

    for c in clean:
        tab = [np.nan if elt in placeholders else elt for elt in df[c]]
        df[c] = pd.Series(tab, dtype=object, index=df.index)

    for c in cat_cols:
        df[c] = df[c].astype('category')

    df.rename(
        columns={col: re.sub(' ', '_', col).lower() for col in df.columns},
        inplace=True)
    write_df(save, df, data_dir[1], MET_OBJECTS_CONFIG.main_file)
    return df
def get_employee_salaries_df(save=True):
    """Load the employee salaries CSV and derive the hiring year.

    A ``Year First Hired`` column is computed by parsing each
    ``Date First Hired`` value with the ``'%m/%d/%Y'`` format. ``Gender``,
    ``Department``, ``Department Name`` and ``Assignment Category`` are cast
    to ``category``.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The table with the extra year column and categorical casts.
    """
    fetched = fetch(EMPLOYEE_SALARIES_CONFIG)
    raw_dir = fetched[0]
    first_entry = os.listdir(raw_dir)[0]
    df = pd.read_csv(os.path.join(raw_dir, first_entry))

    parse = datetime.datetime.strptime
    hire_years = []
    for hired_on in df['Date First Hired']:
        hire_years.append(parse(hired_on, '%m/%d/%Y').year)
    df['Year First Hired'] = hire_years

    categorical = ('Gender', 'Department', 'Department Name',
                   'Assignment Category')
    for column in categorical:
        df[column] = df[column].astype('category')

    write_df(save, df, fetched[1], EMPLOYEE_SALARIES_CONFIG.main_file)
    return df
def get_colleges_df(save=True):
    """Load the colleges table (tab-separated, latin1), indexed by ``UNITID``.

    Processing steps, in order:

    * The ``Unnamed: 0`` column is dropped.
    * ``State`` is cast to ``str``.
    * The numeric-looking columns listed in ``cols`` are passed through
      ``_clean_cols``.
    * The columns listed in ``cats`` are cast to ``category``.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    data_dir = fetch(COLLEGES_CONFIG)
    file = os.listdir(data_dir[0])[0]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path, sep='\t', encoding='latin1',
                     index_col='UNITID')
    # ``columns=`` instead of a positional axis: pandas 2.0 made every
    # argument of DataFrame.drop after ``labels`` keyword-only.
    df.drop(columns=["Unnamed: 0"], inplace=True)
    df['State'] = df['State'].astype(str)

    cols = ['Undergrad Size', 'Predominant Degree',
            'Average Cost Academic Year', 'Average Cost Program Year',
            'Tuition (Instate)', 'Tuition (Out of state)',
            'Spend per student', 'Faculty Salary',
            'Mean Earnings 6 years', 'Median Earnings 6 years',
            'Mean Earnings 10 years', 'Median Earnings 10 years']
    df = _clean_cols(cols, df)

    cats = ['State', 'Predominant Degree', 'Highest Degree', 'Ownership',
            'Region', 'ZIP']
    for c in cats:
        df[c] = df[c].astype('category')

    write_df(save, df, data_dir[1], COLLEGES_CONFIG.main_file)
    return df
def get_road_safety_df(save=True):
    """Build the road safety table and normalise its missing values.

    The fetched directory is expanded with ``_get_file_paths`` and combined
    by ``_process_df``. Then, in order:

    * Any cell equal to one of the ``to_del`` markers (strings are compared
      case-insensitively) becomes NaN; every column is rebuilt as ``object``.
    * The columns in ``f_to_i`` are passed through ``float_to_int``.
    * String cells in the ``str_to_i`` columns are converted with ``int()``.
    * Columns whose only distinct value is NaN are dropped.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.
    """
    data_dir = fetch(ROAD_SAFETY_CONFIG)
    files = _get_file_paths(data_dir[0])
    df = _process_df(files)

    f_to_i = ['1st_Road_Number', '2nd_Road_Number', 'Location_Easting_OSGR',
              'Location_Northing_OSGR', 'Number_of_Vehicles',
              'Number_of_Casualties', 'Speed_limit', 'accyr',
              'Engine_Capacity_(CC)_df', 'Age_of_Vehicle_df']
    str_to_i = ['Vehicle_Reference', 'Vehicle_Reference_df',
                'Vehicle_Reference_df_res']
    to_del = ['data missing or out of range', 'none', -1, 'unknown or other',
              'not known', 'unclassified', 'unknown', 'nan']

    for c in df:
        tab = []
        for elt in df[c]:
            # All string markers are lower-case, so comparing the
            # lower-cased string covers the exact-match case as well.
            key = elt.lower() if isinstance(elt, str) else elt
            tab.append(np.nan if key in to_del else elt)
        # Builtin ``object``: the ``np.object`` alias was removed from NumPy.
        df[c] = pd.Series(tab, dtype=object, index=df.index)

    for c in f_to_i:
        df[c] = float_to_int(df[c], df.index)

    for c in str_to_i:
        tab = [int(elt) if isinstance(elt, str) else elt for elt in df[c]]
        df[c] = pd.Series(tab, dtype=object, index=df.index)

    # Collect the all-NaN columns first, then drop them in one call, so the
    # frame is not mutated while it is being iterated.
    empty_cols = []
    for c in df:
        distinct = df[c].unique()
        if len(distinct) == 1 and str(distinct[0]) == 'nan':
            empty_cols.append(c)
    if empty_cols:
        # ``columns=`` instead of a positional axis, which pandas 2.0 rejects.
        df.drop(columns=empty_cols, inplace=True)

    write_df(save, df, data_dir[1], ROAD_SAFETY_CONFIG.main_file)
    return df
def get_crime_df(save=True):
    """Load the crime CSV and convert its code and description columns.

    ``Victim Age``, ``Premise Code``, ``Weapon Used Code`` and
    ``Crime Code 1`` to ``Crime Code 4`` are passed through
    ``float_to_int``. Each descriptive column is also passed through
    ``float_to_int`` if its dtype is float, and is then cast to
    ``category``. The frame's column index is printed right after loading.

    Parameters
    ----------
    save : bool, default=True
        Forwarded unchanged to ``write_df``.

    Returns
    -------
    pandas.DataFrame
        The converted table.
    """
    # FIXME dead link :s
    fetched = fetch(CRIME_DATA_CONFIG)
    raw_dir = fetched[0]
    first_entry = os.listdir(raw_dir)[0]
    df = pd.read_csv(os.path.join(raw_dir, first_entry))

    descriptive = ['Area Name', 'Victim Sex', 'Victim Descent',
                   'Premise Description', 'Weapon Description',
                   'Status Description', 'Crime Code Description']
    integer_like = ['Victim Age', 'Premise Code', 'Weapon Used Code',
                    'Crime Code 1', 'Crime Code 2', 'Crime Code 3',
                    'Crime Code 4']

    print(df.columns)

    for column in integer_like:
        df[column] = float_to_int(df[column], df.index)

    for column in descriptive:
        if df[column].dtype == float:
            df[column] = float_to_int(df[column], df.index)
        df[column] = df[column].astype('category')

    write_df(save, df, fetched[1], CRIME_DATA_CONFIG.main_file)
    return df