def test_valid_percentages():
    """Exercise assert_valid_percent on interior, boundary, and flag-relaxed values.

    NOTE(review): another ``test_valid_percentages`` is defined later in this
    file — if both live in the same module, the later one shadows this one and
    pytest will only collect the survivor. Confirm and deduplicate.
    """
    # Strictly-interior values are always accepted.
    for interior in (0.5, 0.9, 0.1):
        assert_valid_percent(interior)

    # The exact boundaries are rejected by default...
    assert_raises(ValueError, assert_valid_percent, x=0.0)
    assert_raises(ValueError, assert_valid_percent, x=1.0)

    # ...but accepted once the matching eq_* flag relaxes the comparison.
    assert_valid_percent(x=1.0, eq_upper=True)
    assert_valid_percent(x=0.0, eq_lower=True)
def insert_missing_values(df, percent_rows, random_state=None):
    """Return a copy of ``df`` with NaNs injected into a random sample of rows.

    :param df: data frame we're operating on
    :param percent_rows: the percentage of rows that should have a missing value.
    :param random_state: the numpy RandomState
    :return: a df with missing values
    """
    # Resolve the RandomState (no-op if one was already passed in) and work
    # on a copy so the caller's frame is never mutated.
    random_state = get_random_state(random_state)
    df = df.copy()

    def _nullify_random_feature(row):
        """Set one randomly-chosen feature of ``row`` to NaN, in place.

        The mutation happens on the copied frame, so it is safe.

        :param row: a single row of the data frame
        """
        # -1 because last col will always be y (randint's high bound is
        # exclusive, so the final column is never selected).
        row[random_state.randint(0, len(row) - 1)] = np.nan
        return row

    # Truthy guard: 0, 0.0, None, and False all mean "inject nothing".
    if not percent_rows:
        return df

    # Validate the fraction; eq_lower is unnecessary because zero was
    # already handled by the guard above.
    percent_rows = assert_valid_percent(percent_rows, eq_upper=True)

    # Draw the random subset of rows and null one feature in each.
    target_index = df.sample(frac=percent_rows, random_state=random_state).index
    df.loc[target_index] = df.loc[target_index].apply(_nullify_random_feature, axis=1)
    return df
def test_score_dataset_multiclass():
    """Score the multiclass fixture files and check the first metric is a valid percent.

    NOTE(review): the second positional argument to ``assert_valid_percent``
    may bind to a flag parameter (e.g. ``eq_upper``) rather than act as a
    failure message — confirm the function's signature.
    """
    scores = score_dataset(y_file="y_test.csv", y_hat_file="y_hat_test.csv")
    assert_valid_percent(scores[0], "Not a valid percent for Accuracy")
def test_score_datasetclassification():
    """Score the classification fixture files and check the first metric is a valid percent.

    NOTE(review): function name is likely missing an underscore
    (``test_score_dataset_classification``); left unchanged to avoid renaming.
    The second positional argument to ``assert_valid_percent`` may bind to a
    flag parameter rather than act as a message — confirm its signature.
    """
    scores = score_dataset(y_file="y_test.csv", y_hat_file="y_hat_test.csv")
    assert_valid_percent(scores[0], "Not a valid percent for AUC")
def test_invalid_percetages(pct):
    """Out-of-range percentages must raise ValueError.

    NOTE(review): name has a typo ("percetages"); left unchanged to avoid
    renaming. ``pct`` is presumably supplied by a ``@pytest.mark.parametrize``
    decorator not visible in this chunk — confirm it exists.
    """
    with pytest.raises(ValueError):
        assert_valid_percent(x=pct)
def test_valid_percentages(pct, kwargs):
    """Valid percentages (with optional eq_* flags) must pass without raising.

    NOTE(review): this name duplicates an earlier ``test_valid_percentages``
    in this file — if both share a module, this one shadows it. ``pct`` and
    ``kwargs`` are presumably supplied by ``@pytest.mark.parametrize`` outside
    the visible span — confirm the decorator exists.
    """
    assert_valid_percent(pct, **kwargs)