def test_clean_db_drop_na(): d = pd.DataFrame() N = 100 d['quality'] = np.linspace(0, 1E-3, 5 * N) d['rhythm'] = np.asarray([1, 2, 3, 4, np.nan] * N) d['weight'] = np.asarray(np.asarray([ 0, ] * N + list(np.arange(N)) * 4)) d['age'] = d['weight'] df = clean_db.clean_db(d, drop_columns=None, drop_na_in=['rhythm'], drop_zeroes_in=None, quality_threshold=None) assert len(df) == 4 * N == len(df.dropna(subset=('rhythm', )))
def test_clean_db_quality(): d = pd.DataFrame() N = 100 d['quality'] = np.linspace(0, 1E-3, 5 * N) d['rhythm'] = np.asarray([1, 2, 3, 4, np.nan] * N) d['weight'] = np.asarray(np.asarray([ 0, ] * N + list(np.arange(N)) * 4)) d['age'] = d['weight'] df = clean_db.clean_db(d, drop_columns=None, drop_na_in=None, drop_zeroes_in=None, quality_threshold=1E-4) assert max(df.quality) < 1E-4 assert len(df) < len(d) assert len(df) == sum(np.where(d.quality < 1E-4, 1, 0))
def test_clean_db_drop_zeroes(): d = pd.DataFrame() N = 100 d['quality'] = np.linspace(0, 1E-3, 5 * N) d['rhythm'] = np.asarray([1, 2, 3, 4, np.nan] * N) d['weight'] = np.asarray(np.asarray([ 0, ] * N + list(np.arange(N)) * 4)) d['age'] = d['weight'] zeroes_labels = ['age', 'weight'] df = clean_db.clean_db(d, drop_columns=None, drop_na_in=None, drop_zeroes_in=zeroes_labels, quality_threshold=None) assert len(df) == len(d) - 104 # 100 zeroes + 4 zeroes for _ in zeroes_labels: assert min(abs(df[_])) > 0
def test_clean_db_drop_columns(): d = pd.DataFrame() N = 100 d['quality'] = np.linspace(0, 1E-3, 5 * N) d['rhythm'] = np.asarray([1, 2, 3, 4, np.nan] * N) d['weight'] = np.asarray(np.asarray([ 0, ] * N + list(np.arange(N)) * 4)) d['age'] = d['weight'] drop_columns_df = ['rhythm', 'weight'] df = clean_db.clean_db(d, drop_columns=drop_columns_df, drop_na_in=None, drop_zeroes_in=None, quality_threshold=None) df_col = list(set(df.columns) & set(drop_columns_df)) assert len(df_col) == 0 assert len(df) == len(d)