def load_sterilization():
    """Load the sterilization data set together with its treatment encoding.

    Reads ``../data/sterilization.csv`` with ``#DIV/0!`` cells treated as
    missing values, runs ``clean_names()`` on the wrapped frame, and
    label-encodes the ``treatment`` column.

    Returns:
        A ``(df, mapping)`` tuple, where ``mapping`` is a dict pairing each
        value of ``df['treatment']`` with the matching ``df['treatment_enc']``
        value.
    """
    raw = pd.read_csv(
        '../data/sterilization.csv',
        na_filter=True,
        na_values=['#DIV/0!'],
    )

    df = jn.DataFrame(raw)
    df = df.clean_names()
    df = df.label_encode('treatment')

    # Pair each treatment value with its encoded counterpart, row by row.
    pairs = zip(df['treatment'], df['treatment_enc'])
    mapping = dict(pairs)
    return df, mapping
def test_multiindex_clean_names_method_chain(multiindex_dataframe):
    """Check the column tuples produced by ``clean_names`` on MultiIndex columns.

    Args:
        multiindex_dataframe: fixture supplying a frame whose columns are a
            two-level ``MultiIndex`` (defined outside this block).
    """
    df = jn.DataFrame(multiindex_dataframe).clean_names()

    # Build the expected index from one array per level instead of the
    # ``levels``/``labels`` constructor: the ``labels`` keyword was renamed
    # to ``codes`` and has since been removed from ``pd.MultiIndex``, so the
    # old spelling raises ``TypeError`` on current pandas. ``from_arrays``
    # produces the same (level0, level1) tuples on old and new versions.
    expected_columns = pd.MultiIndex.from_arrays([
        ['a', 'bell_chart', 'decorated_elephant'],
        ['b', 'normal_distribution', 'r_i_p_rhino_'],
    ])

    assert set(df.columns) == set(expected_columns)
def load_finches(path):
    """Read the finches CSV at ``path`` and return the cleaned frame.

    The cleaning steps are provided for you; follow the comment on each
    step to learn what is going on.

    Args:
        path: location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The cleaned, Janitor-wrapped dataframe.
    """
    # Load the data.
    raw = pd.read_csv(path)

    # Wrap the dataframe in a Janitor dataframe.
    finches = jn.DataFrame(raw)
    # Clean the column names.
    finches = finches.clean_names()
    # Rename blength to beak_length (readability fix).
    finches = finches.rename_column('blength', 'beak_length')
    # Rename bdepth to beak_depth (readability fix).
    finches = finches.rename_column('bdepth', 'beak_depth')
    # Create a `species_enc` column that has the species encoded numerically.
    finches = finches.label_encode('species')

    return finches
import pandas as pd
import janitor as jn

# Read the raw spreadsheet, then apply the cleaning steps one at a time.
df = pd.read_excel('dirty_data.xlsx')

df = jn.DataFrame(df)
df = df.clean_names()
df = df.remove_empty()
df = df.rename_column('%_allocated', 'percent_allocated')
df = df.rename_column('full_time?', 'full_time')
# Merge the two certification columns into a single `certification` column.
df = df.coalesce(['certification', 'certification.1'], 'certification')
df = df.encode_categorical(['subject', 'employee_status', 'full_time'])
df = df.convert_excel_date('hire_date')

print(df)
def test_rename_column(dataframe):
    """Renaming column ``a`` to ``index`` shows up in the resulting columns."""
    cleaned = jn.DataFrame(dataframe).clean_names()
    renamed = cleaned.rename_column('a', 'index')

    expected = {'index', 'bell_chart', 'decorated_elephant'}
    assert set(renamed.columns) == expected
def test_get_features_targets(dataframe):
    """Split on ``bell_chart`` as the target and check both result shapes."""
    cleaned = jn.DataFrame(dataframe).clean_names()
    features, targets = cleaned.get_features_targets(
        target_columns='bell_chart',
    )

    assert features.shape == (3, 2)
    assert targets.shape == (3,)
def test_clean_names_method_chain(dataframe):
    """``clean_names`` called as a method yields the expected column names."""
    cleaned = jn.DataFrame(dataframe).clean_names()
    assert set(cleaned.columns) == {'a', 'bell_chart', 'decorated_elephant'}
def test_fill_empty(null_df):
    """After ``fill_empty`` with value 3, column ``'2'`` contains only 3."""
    filled = jn.DataFrame(null_df).fill_empty(columns=['2'], value=3)
    column_values = set(filled.loc[:, '2'])
    assert column_values == {3}
def test_convert_excel_date():
    """``convert_excel_date`` leaves ``hire_date`` with dtype ``M8[ns]``."""
    raw = pd.read_excel('examples/dirty_data.xlsx')
    cleaned = jn.DataFrame(raw).clean_names()

    # Exercise the function form rather than the method-chain form.
    converted = convert_excel_date(cleaned, 'hire_date')

    assert converted['hire_date'].dtype == 'M8[ns]'
def load_kruschke():
    """Load ``../data/iq.csv`` and label-encode its ``treatment`` column.

    Returns:
        The Janitor-wrapped dataframe returned by ``label_encode``.
    """
    # Comment out the path to the file for students.
    raw = pd.read_csv('../data/iq.csv', index_col=0)
    return jn.DataFrame(raw).label_encode('treatment')
import pandas as pd
import janitor as jn

df = pd.read_excel("dirty_data.xlsx")

# Wrap the frame, tidy the column names, and run remove_empty().
df = jn.DataFrame(df).clean_names().remove_empty()

# Give the awkwardly named columns plain identifiers.
df = df.rename_column("%_allocated", "percent_allocated")
df = df.rename_column("full_time?", "full_time")

# Fold the duplicate certification column into a single one.
df = df.coalesce(["certification", "certification.1"], "certification")

# Final conversions: categorical columns and the Excel date column.
df = df.encode_categorical(["subject", "employee_status", "full_time"])
df = df.convert_excel_date("hire_date")

print(df)