def test_get_image_quality_no_im_qc_cols():
    """get_image_quality raises ValueError when no column name contains
    'ImageQuality' (the frame here only has columns 'x' and 'y')."""
    frame = pd.DataFrame(
        {
            "x": [1, 2, 3, 4],
            "y": [2, 3, 4, 5],
        }
    )
    with pytest.raises(ValueError):
        utils.get_image_quality(frame)
def test_get_image_quality_fails_non_dataframe():
    """get_image_quality raises ValueError when handed a plain list
    instead of a DataFrame."""
    # small frame that does contain an ImageQuality column ...
    frame = pd.DataFrame(
        {
            "vals": [1, 2, 3],
            "ImageQuality_test": [2, 4, 1],
            "other": [2, 5, 1],
        }
    )
    # ... but only that column's values, as a list, are passed in
    not_a_frame = frame["ImageQuality_test"].tolist()
    with pytest.raises(ValueError):
        utils.get_image_quality(not_a_frame)
def test_get_image_quality():
    """get_image_quality returns the single column whose name contains
    'ImageQuality'."""
    # one ImageQuality column sitting between two unrelated columns
    frame = pd.DataFrame(
        {
            "vals": [1, 2, 3],
            "ImageQuality_test": [2, 4, 1],
            "other": [2, 5, 1],
        }
    )
    out = utils.get_image_quality(frame)
    print(out)
    expected = ["ImageQuality_test"]
    assert out == expected
def test_get_image_quality_not_beginning():
    """get_image_quality also returns columns where 'ImageQuality' is not
    at the start of the name, keeping the column order of the frame."""
    # second QC column has ImageQuality in the middle of the string
    frame = pd.DataFrame(
        {
            "vals": [1, 2, 3],
            "ImageQuality_test": [2, 4, 1],
            "Cells_ImageQuality": [2, 5, 1],
        }
    )
    expected = ["ImageQuality_test", "Cells_ImageQuality"]
    out = utils.get_image_quality(frame)
    assert out == expected
def get_outlier_index(data, method="values", sigma=6, adjust=True, **kwargs):
    """
    Find the index labels of outlying row(s) in a DataFrame.

    Parameters
    ----------
    data : pandas DataFrame
        Table to screen for outliers.
    method : string (default="values")
        Either "values", which applies `stats.hampel` to every feature
        column returned by `utils.get_featuredata`, or "ImageQuality",
        which applies it to the image-quality columns (from
        `utils.get_image_quality`) whose names contain "FocusScore" or
        "PowerLogLogSlope".
    sigma : int (default=6)
        Number of median absolute deviations away from the sample median
        used to define an outlier; forwarded to `stats.hampel`.
    adjust : boolean (default=True)
        Only used when `method="values"`. If true, sigma is scaled to take
        multiple measurements into account:
        `sigma_adj = sigma * n_feature_columns`
    **kwargs :
        Additional arguments to `utils.get_featuredata` (only used when
        `method="values"`).

    Returns
    -------
    bad_index : list
        List of row index/indices to remove.

    Raises
    ------
    ValueError
        If `data` is not a pandas DataFrame, or `method` is neither
        "values" nor "ImageQuality".
    """
    # validate the arguments up front
    if not isinstance(data, pd.DataFrame):
        raise ValueError("not a pandas DataFrame")
    if method not in ("values", "ImageQuality"):
        raise ValueError("invalid argument. Options: values, ImageQuality")

    if method == "values":
        feature_cols = utils.get_featuredata(data, **kwargs)
        # FIXME really crude correction
        threshold = sigma * len(feature_cols) if adjust else sigma
        flags = data[feature_cols].apply(stats.hampel, sigma=threshold)
        # keep rows whose absolute flag values add up to more than zero
        flag_totals = flags.apply(lambda row: sum(abs(row)), axis=1)
        return flag_totals[flag_totals > 0].index.tolist()

    # method == "ImageQuality"
    def _negative_sum_rows(columns):
        """Index labels of rows whose hampel results over `columns` sum
        to a negative number."""
        # NOTE(review): flags are summed across columns before the sign
        # test, so opposite-signed flags in one row could cancel out --
        # confirm against stats.hampel's return values.
        scores = data[columns].apply(stats.hampel, sigma=sigma)
        row_totals = scores.apply(np.sum, axis=1)
        return row_totals[row_totals < 0].index.tolist()

    qc_cols = utils.get_image_quality(data)
    # find bad images with FocusScore
    focus_bad = _negative_sum_rows(
        [name for name in qc_cols if "FocusScore" in name]
    )
    # find bad images with PowerLogLogSlope
    plls_bad = _negative_sum_rows(
        [name for name in qc_cols if "PowerLogLogSlope" in name]
    )
    # merge both lists, dropping duplicates
    return list(set(focus_bad + plls_bad))