def test_fast_mcd_large(dials_regression):
    """Check FastMCD estimates on the residuals from the regression data.

    The residuals are read from ``residuals.dat`` below the directory given
    by ``dials_regression``; the raw and corrected location/scatter estimates
    and both correction factors are compared with stored reference values.
    """
    import os
    import random

    from libtbx.test_utils import approx_equal
    from scitbx.array_family import flex

    from dials.algorithms.statistics.fast_mcd import FastMCD

    # Fix both random number generators: the algorithm can occasionally
    # settle on a less common solution, which would trip the assertions.
    random.seed(42)
    flex.set_random_seed(42)

    # test large dataset algorithm
    residuals_file = os.path.join(
        dials_regression,
        "refinement_test_data",
        "outlier_rejection",
        "residuals.dat",
    )
    with open(residuals_file, "r") as handle:
        lines = handle.readlines()

    # The first line is a header, so parse from the second line onwards.
    table = [[float(field) for field in line.split()] for line in lines[1:]]
    x_col, y_col, phi_col = zip(*table)
    x_col = flex.double(x_col)
    y_col = flex.double(y_col)
    phi_col = flex.double(phi_col)

    # The location estimate is expected to be the same before and after
    # the correction factors are applied.
    expected_location = [
        -0.009702392946856687,
        0.008866136837504363,
        -0.04909037126352747,
    ]
    expected_raw_scatter = flex.double(
        [
            [0.00527965256891, 0.000864300169087, -0.00145971018701],
            [0.000864300169087, 0.00842807897907, -0.00184047321286],
            [-0.00145971018701, -0.00184047321286, 0.00698461269031],
        ]
    )
    expected_corrected_scatter = flex.double(
        [
            [0.0129950608638, 0.00212734325892, -0.00359285435473],
            [0.00212734325892, 0.0207444330604, -0.00453004456394],
            [-0.00359285435473, -0.00453004456394, 0.0171915605878],
        ]
    )

    estimator = FastMCD([x_col, y_col, phi_col])

    # Fast MCD raw estimates
    location, scatter = estimator.get_raw_T_and_S()
    assert approx_equal(location, expected_location)
    assert approx_equal(scatter, expected_raw_scatter)

    # Fast MCD corrected estimates
    location, scatter = estimator.get_corrected_T_and_S()
    assert approx_equal(location, expected_location)
    assert approx_equal(scatter, expected_corrected_scatter)

    # Correction factors
    assert approx_equal(estimator._consistency_fac, 2.45659976388)
    assert approx_equal(estimator._finite_samp_fac, 1.00193273884)
def _detect_outliers(self, cols):
    """Flag outliers using squared Mahalanobis distances from an MCD fit.

    ``cols`` is handed unchanged to both ``FastMCD`` and ``maha_dist_sq``.
    The return value is the result of comparing the squared distances with
    ``self._mahasq_cutoff`` (true where the distance exceeds the cutoff).
    """
    mcd_settings = {
        "alpha": self._alpha,
        "max_n_groups": self._max_n_groups,
        "min_group_size": self._min_group_size,
        "n_trials": self._n_trials,
        "k1": self._k1,
        "k2": self._k2,
        "k3": self._k3,
    }
    estimator = FastMCD(cols, **mcd_settings)

    # Corrected location and MCD scatter estimate
    location, scatter = estimator.get_corrected_T_and_S()

    # Squared Mahalanobis distance of the data from that estimate
    dist_sq = maha_dist_sq(cols, location, scatter)

    # Anything beyond the cutoff is an outlier
    return dist_sq > self._mahasq_cutoff
def _detect_outliers(self, cols):
    """Flag outliers using squared Mahalanobis distances from an MCD fit.

    ``cols`` is handed unchanged to both ``FastMCD`` and ``maha_dist_sq``.
    The FastMCD tuning parameters are taken from the instance attributes
    ``_alpha``, ``_max_n_groups``, ``_min_group_size``, ``_n_trials``,
    ``_k1``, ``_k2`` and ``_k3``.

    Returns the result of comparing the squared distances with
    ``self._mahasq_cutoff`` (true where the distance exceeds the cutoff);
    presumably a flex.bool with one entry per observation.
    """
    # No placeholder array is built up front: the result below is the only
    # value ever returned, so a pre-allocated mask would just be discarded.
    fast_mcd = FastMCD(
        cols,
        alpha=self._alpha,
        max_n_groups=self._max_n_groups,
        min_group_size=self._min_group_size,
        n_trials=self._n_trials,
        k1=self._k1,
        k2=self._k2,
        k3=self._k3,
    )

    # get location and MCD scatter estimate
    T, S = fast_mcd.get_corrected_T_and_S()

    # get squared Mahalanobis distances
    d2s = maha_dist_sq(cols, T, S)

    # compare to the threshold
    return d2s > self._mahasq_cutoff
def test_fast_mcd_small():
    """Check FastMCD estimates on a small three-variable dataset.

    The raw and corrected location/scatter estimates and both correction
    factors are compared with stored reference values.
    """
    import random

    from libtbx.test_utils import approx_equal
    from scitbx.array_family import flex

    from dials.algorithms.statistics.fast_mcd import FastMCD

    # Fix both random number generators: the algorithm can occasionally
    # settle on a less common solution, which would trip the assertions.
    random.seed(42)
    flex.set_random_seed(42)

    # some test data, from R package robustbase: Hawkins, Bradu, Kass's Artificial Data
    # (one observation per line, three values per observation)
    hbk = """10.1 19.6 28.3
9.5 20.5 28.9
10.7 20.2 31.0
9.9 21.5 31.7
10.3 21.1 31.1
10.8 20.4 29.2
10.5 20.9 29.1
9.9 19.6 28.8
9.7 20.7 31.0
9.3 19.7 30.3
11.0 24.0 35.0
12.0 23.0 37.0
12.0 26.0 34.0
11.0 34.0 34.0
3.4 2.9 2.1
3.1 2.2 0.3
0.0 1.6 0.2
2.3 1.6 2.0
0.8 2.9 1.6
3.1 3.4 2.2
2.6 2.2 1.9
0.4 3.2 1.9
2.0 2.3 0.8
1.3 2.3 0.5
1.0 0.0 0.4
0.9 3.3 2.5
3.3 2.5 2.9
1.8 0.8 2.0
1.2 0.9 0.8
1.2 0.7 3.4
3.1 1.4 1.0
0.5 2.4 0.3
1.5 3.1 1.5
0.4 0.0 0.7
3.1 2.4 3.0
1.1 2.2 2.7
0.1 3.0 2.6
1.5 1.2 0.2
2.1 0.0 1.2
0.5 2.0 1.2
3.4 1.6 2.9
0.3 1.0 2.7
0.1 3.3 0.9
1.8 0.5 3.2
1.9 0.1 0.6
1.8 0.5 3.0
3.0 0.1 0.8
3.1 1.6 3.0
3.1 2.5 1.9
2.1 2.8 2.9
2.3 1.5 0.4
3.3 0.6 1.2
0.3 0.4 3.3
1.1 3.0 0.3
0.5 2.4 0.9
1.8 3.2 0.9
1.8 0.7 0.7
2.4 3.4 1.5
1.6 2.1 3.0
0.3 1.5 3.3
0.4 3.4 3.0
0.9 0.1 0.3
1.1 2.7 0.2
2.8 3.0 2.9
2.0 0.7 2.7
0.2 1.8 0.8
1.6 2.0 1.2
0.1 0.0 1.1
2.0 0.6 0.3
1.0 2.2 2.9
2.2 2.5 2.3
0.6 2.0 1.5
0.3 1.7 2.2
0.0 2.2 1.6
0.3 0.4 2.6"""

    # Turn the text table into one flex.double per variable.
    observations = [
        [float(field) for field in line.split()] for line in hbk.splitlines()
    ]
    x1, x2, x3 = [flex.double(column) for column in zip(*observations)]

    # The location estimate is expected to be the same before and after
    # the correction factors are applied.
    expected_location = [1.5333333333333334, 2.4564102564102566, 1.6076923076923078]
    expected_raw_scatter = flex.double(
        [
            [1.18964912281, 0.00464912280702, 0.217368421053],
            [0.00464912280702, 0.37620782726, 0.182186234818],
            [0.217368421053, 0.182186234818, 0.910728744939],
        ]
    )
    expected_corrected_scatter = flex.double(
        [
            [3.17735853174, 0.012417047794, 0.58055555535],
            [0.01241704779, 1.00478967011, 0.486589681332],
            [0.58055555535, 0.486589681332, 2.43240775146],
        ]
    )

    estimator = FastMCD([x1, x2, x3])

    # Fast MCD raw estimates
    location, scatter = estimator.get_raw_T_and_S()
    assert approx_equal(location, expected_location)
    assert approx_equal(scatter, expected_raw_scatter)

    # Fast MCD corrected estimates
    location, scatter = estimator.get_corrected_T_and_S()
    assert approx_equal(location, expected_location)
    assert approx_equal(scatter, expected_corrected_scatter)

    # Correction factors
    assert approx_equal(estimator._consistency_fac, 2.36792847084)
    assert approx_equal(estimator._finite_samp_fac, 1.12792118859)
def _filter_reflections_based_on_centroid_distance(self):
    """
    Filter reflections too far from predicted position

    A reflection is kept only if the squared Mahalanobis distance of its
    (X, Y) pixel residual from the corrected MCD estimate is below the
    chi-squared quantile for ``params.refinement.outlier_probability`` and
    the length of that residual is below
    ``params.refinement.max_separation``. ``self.reflections`` is replaced
    by the selected subset and a summary is written to the log.

    Raises:
        RuntimeError: if fewer than ``params.refinement.min_n_reflections``
            reflections remain after filtering.
    """
    refinement = self.params.refinement

    # Compute the x and y residuals
    Xobs, Yobs, _ = self.reflections["xyzobs.px.value"].parts()
    Xcal, Ycal, _ = self.reflections["xyzcal.px"].parts()
    Xres = Xobs - Xcal
    Yres = Yobs - Ycal

    # Compute the epsilon residual. It is only reported in the log below;
    # the MCD estimate and the rejection use the X and Y residuals alone.
    s0_length = 1.0 / self.experiments[0].beam.get_wavelength()
    s1x, s1y, s1z = self.reflections["s2"].parts()
    s1_length = flex.sqrt(s1x**2 + s1y**2 + s1z**2)
    Eres = s1_length - s0_length

    # Initialise the fast_mcd outlier algorithm
    fast_mcd = FastMCD((Xres, Yres))

    # get location and MCD scatter estimate
    T, S = fast_mcd.get_corrected_T_and_S()

    # get squared Mahalanobis distances
    d2s = maha_dist_sq((Xres, Yres), T, S)

    # Compute the cutoff (two degrees of freedom: X and Y)
    mahasq_cutoff = chisq_quantile(2, refinement.outlier_probability)

    # compare to the threshold and select reflections
    selection1 = d2s < mahasq_cutoff
    selection2 = flex.sqrt(Xres**2 + Yres**2) < refinement.max_separation
    selection = selection1 & selection2
    self.reflections = self.reflections.select(selection)

    # Print some stuff. The residual statistics are taken over all
    # reflections seen on entry, not just the selected ones.
    logger.info("-" * 80)
    logger.info("Centroid outlier rejection")
    logger.info(
        " Using MCD algorithm with probability = %f"
        % refinement.outlier_probability
    )
    logger.info(" Max X residual: %f" % flex.max(flex.abs(Xres)))
    logger.info(" Max Y residual: %f" % flex.max(flex.abs(Yres)))
    logger.info(" Max E residual: %f" % flex.max(flex.abs(Eres)))
    logger.info(" Mean X RMSD: %f" % (sqrt(flex.sum(Xres**2) / len(Xres))))
    logger.info(" Mean Y RMSD: %f" % (sqrt(flex.sum(Yres**2) / len(Yres))))
    logger.info(" Mean E RMSD: %f" % (sqrt(flex.sum(Eres**2) / len(Eres))))
    logger.info(" MCD location estimate: %.4f, %.4f" % tuple(T))
    logger.info(""" MCD scatter estimate: %.7f, %.7f, %.7f, %.7f""" % tuple(S))
    logger.info(" Number of outliers: %d" % selection1.count(False))
    logger.info(
        " Number of reflections with residual > %0.2f pixels: %d"
        % (refinement.max_separation, selection2.count(False))
    )
    logger.info(
        " Number of reflections selection for refinement: %d"
        % len(self.reflections)
    )
    logger.info("-" * 80)

    # Throw exception
    if len(self.reflections) < refinement.min_n_reflections:
        raise RuntimeError(
            "Too few reflections to perform refinement: got %d, expected %d"
            % (len(self.reflections), refinement.min_n_reflections)
        )
def test_fast_mcd_small():
    """Check FastMCD estimates on a small three-variable dataset.

    The raw and corrected location/scatter estimates and both correction
    factors are compared with stored reference values. "OK" is printed once
    every assertion has passed.
    """
    from scitbx.array_family import flex
    from dials.algorithms.statistics.fast_mcd import FastMCD

    # set random seeds to try to avoid assertion errors due to occasionally
    # finding less common solutions
    import random

    random.seed(42)
    flex.set_random_seed(42)

    # some test data, from R package robustbase: Hawkins, Bradu, Kass's Artificial Data
    # (one observation per line, three values per observation)
    hbk = """10.1 19.6 28.3
9.5 20.5 28.9
10.7 20.2 31.0
9.9 21.5 31.7
10.3 21.1 31.1
10.8 20.4 29.2
10.5 20.9 29.1
9.9 19.6 28.8
9.7 20.7 31.0
9.3 19.7 30.3
11.0 24.0 35.0
12.0 23.0 37.0
12.0 26.0 34.0
11.0 34.0 34.0
3.4 2.9 2.1
3.1 2.2 0.3
0.0 1.6 0.2
2.3 1.6 2.0
0.8 2.9 1.6
3.1 3.4 2.2
2.6 2.2 1.9
0.4 3.2 1.9
2.0 2.3 0.8
1.3 2.3 0.5
1.0 0.0 0.4
0.9 3.3 2.5
3.3 2.5 2.9
1.8 0.8 2.0
1.2 0.9 0.8
1.2 0.7 3.4
3.1 1.4 1.0
0.5 2.4 0.3
1.5 3.1 1.5
0.4 0.0 0.7
3.1 2.4 3.0
1.1 2.2 2.7
0.1 3.0 2.6
1.5 1.2 0.2
2.1 0.0 1.2
0.5 2.0 1.2
3.4 1.6 2.9
0.3 1.0 2.7
0.1 3.3 0.9
1.8 0.5 3.2
1.9 0.1 0.6
1.8 0.5 3.0
3.0 0.1 0.8
3.1 1.6 3.0
3.1 2.5 1.9
2.1 2.8 2.9
2.3 1.5 0.4
3.3 0.6 1.2
0.3 0.4 3.3
1.1 3.0 0.3
0.5 2.4 0.9
1.8 3.2 0.9
1.8 0.7 0.7
2.4 3.4 1.5
1.6 2.1 3.0
0.3 1.5 3.3
0.4 3.4 3.0
0.9 0.1 0.3
1.1 2.7 0.2
2.8 3.0 2.9
2.0 0.7 2.7
0.2 1.8 0.8
1.6 2.0 1.2
0.1 0.0 1.1
2.0 0.6 0.3
1.0 2.2 2.9
2.2 2.5 2.3
0.6 2.0 1.5
0.3 1.7 2.2
0.0 2.2 1.6
0.3 0.4 2.6"""

    # unpack the data into vectors
    rows = [[float(e) for e in row.split()] for row in hbk.splitlines()]
    x1, x2, x3 = [flex.double(e) for e in zip(*rows)]

    # Fast MCD raw estimates
    fast_mcd = FastMCD([x1, x2, x3])
    T, S = fast_mcd.get_raw_T_and_S()

    from libtbx.test_utils import approx_equal

    assert approx_equal(
        T, [1.5333333333333334, 2.4564102564102566, 1.6076923076923078]
    )
    assert approx_equal(
        S,
        flex.double(
            [
                [1.18964912281, 0.00464912280702, 0.217368421053],
                [0.00464912280702, 0.37620782726, 0.182186234818],
                [0.217368421053, 0.182186234818, 0.910728744939],
            ]
        ),
    )

    # Fast MCD corrected estimates
    T, S = fast_mcd.get_corrected_T_and_S()
    assert approx_equal(
        T, [1.5333333333333334, 2.4564102564102566, 1.6076923076923078]
    )
    assert approx_equal(
        S,
        flex.double(
            [
                [3.17735853174, 0.012417047794, 0.58055555535],
                [0.01241704779, 1.00478967011, 0.486589681332],
                [0.58055555535, 0.486589681332, 2.43240775146],
            ]
        ),
    )

    # Correction factors
    assert approx_equal(fast_mcd._consistency_fac, 2.36792847084)
    assert approx_equal(fast_mcd._finite_samp_fac, 1.12792118859)

    # Call form of print: valid on Python 3 and, for a single string
    # argument, produces the same output on Python 2.
    print("OK")
def test_fast_mcd_large():
    """Check FastMCD estimates on the residuals from the regression data.

    The test returns early, after printing a notice, when the
    ``dials_regression`` module is not available. Otherwise the residuals
    are read from ``residuals.dat`` in that repository, and the raw and
    corrected location/scatter estimates and both correction factors are
    compared with stored reference values. "OK" is printed once every
    assertion has passed.
    """
    from scitbx.array_family import flex
    from dials.algorithms.statistics.fast_mcd import FastMCD

    # set random seeds to try to avoid assertion errors due to occasionally
    # finding less common solutions
    import random

    random.seed(42)
    flex.set_random_seed(42)

    # test large dataset algorithm
    import libtbx.load_env  # required for libtbx.env.find_in_repositories

    if not libtbx.env.has_module("dials_regression"):
        # Call form of print: valid on Python 3 and, for a single string
        # argument, produces the same output on Python 2.
        print("Skipping test_fast_mcd_large(): dials_regression not available.")
        return

    # load data
    import os

    dials_regression = libtbx.env.find_in_repositories(
        relative_path="dials_regression", test=os.path.isdir
    )
    data_pth = os.path.join(
        dials_regression,
        "refinement_test_data",
        "outlier_rejection",
        "residuals.dat",
    )

    with open(data_pth, "r") as f:
        residuals = f.readlines()

    # ignore first line, which is a header
    residuals = [[float(val) for val in e.split()] for e in residuals[1:]]
    X_resid_mm, Y_resid_mm, Phi_resid_mm = zip(*residuals)

    X_resid_mm = flex.double(X_resid_mm)
    Y_resid_mm = flex.double(Y_resid_mm)
    Phi_resid_mm = flex.double(Phi_resid_mm)

    # Fast MCD raw estimates
    fast_mcd = FastMCD([X_resid_mm, Y_resid_mm, Phi_resid_mm])
    T, S = fast_mcd.get_raw_T_and_S()

    from libtbx.test_utils import approx_equal

    assert approx_equal(
        T, [-0.009702392946856687, 0.008866136837504363, -0.04909037126352747]
    )
    assert approx_equal(
        S,
        flex.double(
            [
                [0.00527965256891, 0.000864300169087, -0.00145971018701],
                [0.000864300169087, 0.00842807897907, -0.00184047321286],
                [-0.00145971018701, -0.00184047321286, 0.00698461269031],
            ]
        ),
    )

    # Fast MCD corrected estimates
    T, S = fast_mcd.get_corrected_T_and_S()
    assert approx_equal(
        T, [-0.009702392946856687, 0.008866136837504363, -0.04909037126352747]
    )
    assert approx_equal(
        S,
        flex.double(
            [
                [0.0129950608638, 0.00212734325892, -0.00359285435473],
                [0.00212734325892, 0.0207444330604, -0.00453004456394],
                [-0.00359285435473, -0.00453004456394, 0.0171915605878],
            ]
        ),
    )

    # Correction factors
    assert approx_equal(fast_mcd._consistency_fac, 2.45659976388)
    assert approx_equal(fast_mcd._finite_samp_fac, 1.00193273884)

    print("OK")
def _filter_reflections_based_on_centroid_distance(
    reflection_table,
    experiment,
    outlier_probability=0.975,
    max_separation=2,
):
    """
    Reject reflections whose observed centroid lies too far from prediction.

    A reflection is kept only if the squared Mahalanobis distance of its
    (X, Y) pixel residual from the corrected MCD estimate is below the
    chi-squared quantile for ``outlier_probability`` and the length of that
    residual is below ``max_separation``. A summary is written to the log
    and the selected subset of ``reflection_table`` is returned.
    """
    # Pixel residuals between observed and calculated centroids
    x_obs, y_obs, _ = reflection_table["xyzobs.px.value"].parts()
    x_cal, y_cal, _ = reflection_table["xyzcal.px"].parts()
    dx = x_obs - x_cal
    dy = y_obs - y_cal

    # Epsilon residual: difference between the length of each "s2" vector
    # and 1/wavelength. It is only reported in the log; the rejection
    # itself uses the X and Y residuals alone.
    inv_wavelength = 1.0 / experiment.beam.get_wavelength()
    sx, sy, sz = reflection_table["s2"].parts()
    d_eps = flex.sqrt(sx**2 + sy**2 + sz**2) - inv_wavelength

    # Corrected MCD location/scatter and the resulting squared distances
    xy_residuals = (dx, dy)
    location, scatter = FastMCD(xy_residuals).get_corrected_T_and_S()
    dist_sq = maha_dist_sq(xy_residuals, location, scatter)

    # Cutoff for two degrees of freedom (X and Y)
    cutoff = chisq_quantile(2, outlier_probability)

    # A reflection must pass both the statistical and the distance test
    within_cutoff = dist_sq < cutoff
    within_separation = flex.sqrt(dx**2 + dy**2) < max_separation
    reflection_table = reflection_table.select(within_cutoff & within_separation)
    n_kept = reflection_table.size()

    def _rmsd(values):
        # Root-mean-square of a residual array
        return sqrt(flex.sum(values**2) / len(values))

    # Report; residual statistics cover all reflections seen on entry
    rule = "-" * 80
    logger.info(rule)
    logger.info("Centroid outlier rejection")
    logger.info(f" Using MCD algorithm with probability = {outlier_probability}")
    logger.info(" Max X residual: %f" % flex.max(flex.abs(dx)))
    logger.info(" Max Y residual: %f" % flex.max(flex.abs(dy)))
    logger.info(" Max E residual: %f" % flex.max(flex.abs(d_eps)))
    logger.info(" Mean X RMSD: %f" % _rmsd(dx))
    logger.info(" Mean Y RMSD: %f" % _rmsd(dy))
    logger.info(" Mean E RMSD: %f" % _rmsd(d_eps))
    logger.info(" MCD location estimate: %.4f, %.4f" % tuple(location))
    logger.info(""" MCD scatter estimate: %.7f, %.7f, %.7f, %.7f""" % tuple(scatter))
    logger.info(" Number of outliers: %d" % within_cutoff.count(False))
    logger.info(
        " Number of reflections with residual > %0.2f pixels: %d"
        % (max_separation, within_separation.count(False))
    )
    logger.info(f"Number of reflections selection for refinement: {n_kept}")
    logger.info(rule)

    return reflection_table