def get_iris_data(file_in):
    """
    Fetch the UCI data set on physical characteristics of Iris species.

    Args:
        file_in: Path to the comma-separated data file (no header row);
            each non-blank line is expected to hold five fields.

    Returns:
        The object returned by ``helpers.normalize`` after the parsed rows
        have been passed through ``helpers.replace_missing_mode`` and
        ``helpers.one_hot_encode`` (in that order). Every column except
        ``target`` is converted to float before encoding.

    Raises:
        OSError: If ``file_in`` cannot be opened.
        ValueError: If a non-target field cannot be parsed as a float.
    """
    # Use a context manager so the file handle is always closed.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines rather than assuming exactly two trailing ones; a
    # file without the trailing blanks would otherwise lose real records.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'target',
    ]
    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]
    output = pd.DataFrame(data_all, columns=x_headers)
    # NOTE(review): presumably fills missing entries with the column mode;
    # confirm against the helpers module.
    output = helpers.replace_missing_mode(output)

    # Fields are read as strings; the measurement columns must be numeric.
    for col in output:
        if col not in cat_variables:
            output[col] = [float(x) for x in output[col]]

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)
    return output
def get_glass_data(file_in):
    """
    Fetch the UCI data set on chemical characteristics of glass.

    Args:
        file_in: Path to the comma-separated data file (no header row);
            each non-blank line is expected to hold eleven fields.

    Returns:
        The object returned by ``helpers.normalize`` after the parsed rows
        have been passed through ``helpers.replace_missing_mode`` and
        ``helpers.one_hot_encode`` (in that order). The ``id`` column is
        dropped and every remaining column except ``target`` is converted
        to float before encoding.

    Raises:
        OSError: If ``file_in`` cannot be opened.
        ValueError: If a non-target field cannot be parsed as a float.
    """
    # Use a context manager so the file handle is always closed.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines rather than assuming exactly one trailing blank; a
    # file without a final newline would otherwise lose its last record.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'id',
        'RI',
        'Na',
        'Mg',
        'Al',
        'Si',
        'K',
        'Ca',
        'Ba',
        'Fe',
        'target',
    ]
    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]
    output = pd.DataFrame(data_all, columns=x_headers)
    # NOTE(review): presumably fills missing entries with the column mode;
    # confirm against the helpers module.
    output = helpers.replace_missing_mode(output)

    # The row identifier is not a feature; drop it before the numeric
    # conversion so it is not converted needlessly.
    output = output.drop(['id'], axis=1)

    # Fields are read as strings; the measurement columns must be numeric.
    for col in output:
        if col not in cat_variables:
            output[col] = [float(x) for x in output[col]]

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)
    return output
def get_cancer_data(file_in):
    """
    Fetch the UCI data set on breast cancer characteristics.

    Args:
        file_in: Path to the comma-separated data file (no header row);
            each non-blank line is expected to hold eleven fields.

    Returns:
        The object returned by ``helpers.normalize`` after the parsed rows
        have been passed through ``helpers.replace_missing_mode`` and
        ``helpers.one_hot_encode`` (in that order). The ``id`` column is
        dropped and every remaining column except ``target`` is converted
        to float before encoding.

    Raises:
        OSError: If ``file_in`` cannot be opened.
        ValueError: If a non-target field cannot be parsed as a float.
    """
    # Use a context manager so the file handle is always closed.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines rather than assuming exactly one trailing blank; a
    # file without a final newline would otherwise lose its last record.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'id',
        'clump_thickness',
        'unif_cell_size',
        'unif_cell_shape',
        'marginal_adhesion',
        'single_epithelial_cell_size',
        'bare_nuclei',
        'bland_chrmatin',
        'normal_nucleoli',
        'mitoses',
        'target',
    ]
    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]
    output = pd.DataFrame(data_all, columns=x_headers)
    # The row identifier is not a feature; remove it before any cleaning.
    output = output.drop(['id'], axis=1)
    # NOTE(review): presumably fills missing entries with the column mode;
    # confirm against the helpers module.
    output = helpers.replace_missing_mode(output)

    # Fields are read as strings; the feature columns must be numeric.
    for col in output:
        if col not in cat_variables:
            output[col] = [float(x) for x in output[col]]

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)
    return output
def get_vote_data(file_in):
    """
    Fetch and clean the UCI data set on US Representative vote records.

    Args:
        file_in: Path to the comma-separated data file (no header row);
            each non-blank line is expected to hold seventeen fields, with
            the target in the first position.

    Returns:
        The object returned by ``helpers.normalize`` after the parsed rows
        have been passed through ``helpers.replace_missing_mode`` and
        ``helpers.one_hot_encode`` (in that order). No numeric conversion
        is performed here; all columns reach the encoder as strings.

    Raises:
        OSError: If ``file_in`` cannot be opened.
    """
    # Use a context manager so the file handle is always closed.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines rather than assuming exactly one trailing blank; a
    # file without a final newline would otherwise lose its last record.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'target',
        'handicapped-infants',
        'water-project-cost-sharing',
        'adoption-of-the-budget-resolution',
        'physician-fee-freeze',
        'el-salvador-aid',
        'religious-groups-in-schools',
        'anti-satellite-test-ban',
        'aid-to-nicaraguan-contras',
        'mx-missile',
        'immigration',
        'synfuels-corporation-cutback',
        'education-spending',
        'superfund-right-to-sue',
        'crime',
        'duty-free-exports',
        'export-administration-act-south-africa',
    ]

    data_all = [row.split(',') for row in data_rows]
    output = pd.DataFrame(data_all, columns=x_headers)
    # NOTE(review): presumably fills missing entries with the column mode;
    # confirm against the helpers module.
    output = helpers.replace_missing_mode(output)
    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)
    return output
def get_soy_data(file_in):
    """
    Fetch the UCI data set on diseases of soybean samples.

    Args:
        file_in: Path to the comma-separated data file (no header row);
            each non-blank line is expected to hold thirty-six fields, with
            the target in the last position.

    Returns:
        The object returned by ``helpers.normalize`` after the parsed rows
        have been passed through ``helpers.replace_missing_mode`` and
        ``helpers.one_hot_encode`` (in that order). No numeric conversion
        is performed here; all columns reach the encoder as strings.

    Raises:
        OSError: If ``file_in`` cannot be opened.
    """
    # Use a context manager so the file handle is always closed.
    with open(file_in, 'r') as data_file:
        data_text = data_file.read()

    # Skip blank lines rather than assuming exactly one trailing blank; a
    # file without a final newline would otherwise lose its last record.
    data_rows = [row for row in data_text.split('\n') if row.strip()]

    x_headers = [
        'date',
        'plant-stand',
        'precip',
        'temp',
        'hail',
        'crop-hist',
        'area-damaged',
        'severity',
        'seed-tmt',
        'germination',
        'plant-growth',
        'leaves',
        'leafspots-halo',
        'leafspots-marg',
        'leafspot-size',
        'leaf-shread',
        'leaf-malf',
        'leaf-mild',
        'stem',
        'lodging',
        'stem-cankers',
        'canker-lesion',
        'fruiting-bodies',
        'external decay',
        'mycelium',
        'int-discolor',
        'sclerotia',
        'fruit-pods',
        'fruit spots',
        'seed',
        'mold-growth',
        'seed-discolor',
        'seed-size',
        'shriveling',
        'roots',
        'target',
    ]

    data_all = [row.split(',') for row in data_rows]
    output = pd.DataFrame(data_all, columns=x_headers)
    # NOTE(review): presumably fills missing entries with the column mode;
    # confirm against the helpers module.
    output = helpers.replace_missing_mode(output)
    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)
    return output