Python fit_cdfの例

プログラミング言語: Python

名前空間/パッケージ名: rstbx.outlier_spots.fit_distribution

メソッド/関数: fit_cdf

hotexamples.comのコード掲載数: 4

Python fit_cdf - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのrstbx.outlier_spots.fit_distribution.fit_cdfの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: simple_integration.py プロジェクト: rimmartin/cctbx_project

    def integration_concept(self,
                            image_number=0,
                            cb_op_to_primitive=None,
                            verbose=False,
                            **kwargs):
        self.image_number = image_number
        NEAR = 10
        pxlsz = self.pixel_size
        self.get_predictions_accounting_for_centering(cb_op_to_primitive,
                                                      **kwargs)
        FWMOSAICITY = self.inputai.getMosaicity()
        DOMAIN_SZ_ANG = kwargs.get("domain_size_ang",
                                   self.__dict__.get("actual", 0))
        refineflag = {True: 0, False: 1}[kwargs.get("domain_size_ang", 0) == 0]
        self.inputpd["symmetry"].show_summary(
            prefix="EXCURSION%1d REPORT FWMOS= %6.4f DOMAIN= %6.1f " %
            (refineflag, FWMOSAICITY, DOMAIN_SZ_ANG))
        from annlib_ext import AnnAdaptor
        self.cell = self.inputai.getOrientation().unit_cell()
        query = flex.double()
        for pred in self.predicted:  # predicted spot coord in pixels
            query.append(pred[0] / pxlsz)
            query.append(pred[1] / pxlsz)
        self.reserve_hkllist_for_signal_search = self.hkllist

        reference = flex.double()
        spots = self.get_observations_with_outlier_removal()

        assert len(
            spots) > NEAR  # Can't do spot/pred matching with too few spots
        for spot in spots:
            reference.append(spot.ctr_mass_x())
            reference.append(spot.ctr_mass_y())

        IS_adapt = AnnAdaptor(data=reference, dim=2, k=NEAR)
        IS_adapt.query(query)
        print "Calculate correction vectors for %d observations & %d predictions" % (
            len(spots), len(self.predicted))
        indexed_pairs_provisional = []
        correction_vectors_provisional = []
        c_v_p_flex = flex.vec3_double()
        idx_cutoff = float(min(self.mask_focus[image_number]))
        if verbose:
            print "idx_cutoff distance in pixels", idx_cutoff
        if not self.horizons_phil.integration.enable_one_to_one_safeguard:
            # legacy code, no safeguard against many-to-one predicted-to-observation mapping
            for i in range(len(self.predicted)):  # loop over predicteds
                #for n in range(NEAR): # loop over near spotfinder spots
                for n in range(1):  # only consider the nearest spotfinder spot
                    Match = dict(spot=IS_adapt.nn[i * NEAR + n], pred=i)
                    if n == 0 and math.sqrt(
                            IS_adapt.distances[i * NEAR + n]) < idx_cutoff:
                        indexed_pairs_provisional.append(Match)

                        vector = matrix.col([
                            spots[Match["spot"]].ctr_mass_x() -
                            self.predicted[Match["pred"]][0] / pxlsz,
                            spots[Match["spot"]].ctr_mass_y() -
                            self.predicted[Match["pred"]][1] / pxlsz
                        ])
                        correction_vectors_provisional.append(vector)
                        c_v_p_flex.append((vector[0], vector[1], 0.))
        else:
            one_to_one = {}
            for i in range(len(self.predicted)):  # loop over predicteds
                annresultidx = i * NEAR
                obsidx = IS_adapt.nn[annresultidx]
                this_distancesq = IS_adapt.distances[annresultidx]
                if obsidx not in one_to_one or \
                   this_distancesq < one_to_one[obsidx]["distancesq"]:
                    if math.sqrt(this_distancesq) < idx_cutoff:
                        one_to_one[obsidx] = dict(spot=obsidx,
                                                  pred=i,
                                                  distancesq=this_distancesq)
            for key, value in one_to_one.items():
                indexed_pairs_provisional.append(value)
                vector = matrix.col([
                    spots[value["spot"]].ctr_mass_x() -
                    self.predicted[value["pred"]][0] / pxlsz,
                    spots[value["spot"]].ctr_mass_y() -
                    self.predicted[value["pred"]][1] / pxlsz
                ])
                correction_vectors_provisional.append(vector)
                c_v_p_flex.append((vector[0], vector[1], 0.))

        print "... %d provisional matches" % len(
            correction_vectors_provisional),
        print "r.m.s.d. in pixels: %5.2f" % (math.sqrt(
            flex.mean(c_v_p_flex.dot(c_v_p_flex))))

        if self.horizons_phil.integration.enable_residual_scatter:
            from matplotlib import pyplot as plt
            fig = plt.figure()
            for cv in correction_vectors_provisional:
                plt.plot([cv[1]], [-cv[0]], "b.")
            plt.title(" %d matches, r.m.s.d. %5.2f pixels" %
                      (len(correction_vectors_provisional),
                       math.sqrt(flex.mean(c_v_p_flex.dot(c_v_p_flex)))))
            plt.axes().set_aspect("equal")
            self.show_figure(plt, fig, "res")
            plt.close()

        if self.horizons_phil.integration.enable_residual_map:
            from matplotlib import pyplot as plt
            fig = plt.figure()
            for match, cv in zip(indexed_pairs_provisional,
                                 correction_vectors_provisional):
                plt.plot([spots[match["spot"]].ctr_mass_y()],
                         [-spots[match["spot"]].ctr_mass_x()], "r.")
                plt.plot([self.predicted[match["pred"]][1] / pxlsz],
                         [-self.predicted[match["pred"]][0] / pxlsz], "g.")
                plt.plot([
                    spots[match["spot"]].ctr_mass_y(),
                    spots[match["spot"]].ctr_mass_y() + 10. * cv[1]
                ], [
                    -spots[match["spot"]].ctr_mass_x(),
                    -spots[match["spot"]].ctr_mass_x() - 10. * cv[0]
                ], 'b-')
            plt.xlim([0, float(self.inputpd["size2"])])
            plt.ylim([-float(self.inputpd["size1"]), 0])
            plt.title(" %d matches, r.m.s.d. %5.2f pixels" %
                      (len(correction_vectors_provisional),
                       math.sqrt(flex.mean(c_v_p_flex.dot(c_v_p_flex)))))
            plt.axes().set_aspect("equal")
            self.show_figure(plt, fig, "map")
            plt.close()
        # insert code here to remove correction length outliers...
        # they are causing terrible
        # problems for finding legitimate correction vectors (print out the list)
        # also remove outliers for the purpose of reporting RMS
        outlier_rejection = True
        cache_refinement_spots = getattr(slip_callbacks.slip_callback,
                                         "requires_refinement_spots", False)
        if outlier_rejection:
            correction_lengths = flex.double(
                [v.length() for v in correction_vectors_provisional])
            clorder = flex.sort_permutation(correction_lengths)
            sorted_cl = correction_lengths.select(clorder)

            ACCEPTABLE_LIMIT = 2
            limit = int(
                0.33 * len(sorted_cl)
            )  # best 1/3 of data are assumed to be correctly modeled.
            if (limit <= ACCEPTABLE_LIMIT):
                raise Sorry(
                    "Not enough indexed spots to reject outliers; have %d need >%d"
                    % (limit, ACCEPTABLE_LIMIT))

            y_data = flex.double(len(sorted_cl))
            for i in range(len(y_data)):
                y_data[i] = float(i) / float(len(y_data))

            # ideas are explained in Sauter & Poon (2010) J Appl Cryst 43, 611-616.
            from rstbx.outlier_spots.fit_distribution import fit_cdf, rayleigh
            fitted_rayleigh = fit_cdf(x_data=sorted_cl[0:limit],
                                      y_data=y_data[0:limit],
                                      distribution=rayleigh)

            inv_cdf = [
                fitted_rayleigh.distribution.inv_cdf(cdf) for cdf in y_data
            ]

            #print "SORTED LIST OF ",len(sorted_cl), "with sigma",fitted_rayleigh.distribution.sigma
            indexed_pairs = []
            correction_vectors = []
            self.correction_vectors = []
            for icand in range(len(sorted_cl)):
                # somewhat arbitrary sigma = 1.0 cutoff for outliers
                if (sorted_cl[icand] - inv_cdf[icand]
                    ) / fitted_rayleigh.distribution.sigma > 1.0:
                    break
                indexed_pairs.append(indexed_pairs_provisional[clorder[icand]])
                correction_vectors.append(
                    correction_vectors_provisional[clorder[icand]])
                if cache_refinement_spots:
                    self.spotfinder.images[self.frame_numbers[
                        self.image_number]]["refinement_spots"].append(
                            spots[indexed_pairs[-1]["spot"]])
                if kwargs.get("verbose_cv") == True:
                    print "CV OBSCENTER %7.2f %7.2f REFINEDCENTER %7.2f %7.2f" % (
                        float(self.inputpd["size1"]) / 2.,
                        float(self.inputpd["size2"]) / 2.,
                        self.inputai.xbeam() / pxlsz,
                        self.inputai.ybeam() / pxlsz),
                    print "OBSSPOT %7.2f %7.2f PREDSPOT %7.2f %7.2f" % (
                        spots[indexed_pairs[-1]["spot"]].ctr_mass_x(),
                        spots[indexed_pairs[-1]["spot"]].ctr_mass_y(),
                        self.predicted[indexed_pairs[-1]["pred"]][0] / pxlsz,
                        self.predicted[indexed_pairs[-1]["pred"]][1] / pxlsz),
                    the_hkl = self.hkllist[indexed_pairs[-1]["pred"]]
                    print "HKL %4d %4d %4d" % the_hkl, "%2d" % self.setting_id,
                    radial, azimuthal = spots[indexed_pairs[-1][
                        "spot"]].get_radial_and_azimuthal_size(
                            self.inputai.xbeam() / pxlsz,
                            self.inputai.ybeam() / pxlsz)
                    print "RADIALpx %5.3f AZIMUTpx %5.3f" % (radial, azimuthal)

                # Store a list of correction vectors in self.
                radial, azimuthal = spots[
                    indexed_pairs[-1]['spot']].get_radial_and_azimuthal_size(
                        self.inputai.xbeam() / pxlsz,
                        self.inputai.ybeam() / pxlsz)
                self.correction_vectors.append(
                    dict(obscenter=(float(self.inputpd['size1']) / 2,
                                    float(self.inputpd['size2']) / 2),
                         refinedcenter=(self.inputai.xbeam() / pxlsz,
                                        self.inputai.ybeam() / pxlsz),
                         obsspot=(
                             spots[indexed_pairs[-1]['spot']].ctr_mass_x(),
                             spots[indexed_pairs[-1]['spot']].ctr_mass_y()),
                         predspot=(
                             self.predicted[indexed_pairs[-1]['pred']][0] /
                             pxlsz,
                             self.predicted[indexed_pairs[-1]['pred']][1] /
                             pxlsz),
                         hkl=(self.hkllist[indexed_pairs[-1]['pred']][0],
                              self.hkllist[indexed_pairs[-1]['pred']][1],
                              self.hkllist[indexed_pairs[-1]['pred']][2]),
                         setting_id=self.setting_id,
                         radial=radial,
                         azimuthal=azimuthal))

            print "After outlier rejection %d indexed spotfinder spots remain." % len(
                indexed_pairs)
            if False:
                rayleigh_cdf = [
                    fitted_rayleigh.distribution.cdf(x=sorted_cl[c])
                    for c in range(len(sorted_cl))
                ]
                from matplotlib import pyplot as plt
                plt.plot(sorted_cl, y_data, "r+")
                #plt.plot(sorted_cl,rayleigh_cdf,"g.")
                plt.plot(inv_cdf, y_data, "b.")
                plt.show()
        else:
            indexed_pairs = indexed_pairs_provisional
            correction_vectors = correction_vectors_provisional
        ########### finished with outlier rejection

        self.inputpd["symmetry"].show_summary(prefix="SETTING ")

        is_triclinic = (self.setting_id == 1)
        if is_triclinic:
            self.triclinic_pairs = [
                dict(pred=self.hkllist[a["pred"]], spot=a["spot"])
                for a in indexed_pairs
            ]

        if self.horizons_phil.integration.model == "user_supplied":
            if kwargs.get("user-reentrant", None) == None:
                from cxi_user import post_outlier_rejection
                self.indexed_pairs = indexed_pairs
                self.spots = spots
                post_outlier_rejection(self, image_number, cb_op_to_primitive,
                                       self.horizons_phil, kwargs)
                return

        ########### finished with user-supplied code

        if self.horizons_phil.integration.spot_shape_verbose:
            from rstbx.new_horizons.spot_shape import spot_shape_verbose
            spot_shape_verbose(rawdata=self.imagefiles.images[
                self.image_number].linearintdata,
                               beam_center_pix=matrix.col(
                                   (self.inputai.xbeam() / pxlsz,
                                    self.inputai.ybeam() / pxlsz)),
                               indexed_pairs=indexed_pairs,
                               spotfinder_observations=spots,
                               distance_mm=self.inputai.distance(),
                               mm_per_pixel=pxlsz,
                               hkllist=self.hkllist,
                               unit_cell=self.cell,
                               wavelength_ang=self.inputai.wavelength)

        #Other checks to be implemented (future):
        # spot is within active area of detector on a circular detector such as the Mar IP
        # integration masks do not overlap; or deconvolute

        correction_lengths = flex.double(
            [v.length() for v in correction_vectors])
        if verbose:
            print "average correction %5.2f over %d vectors" % (
                flex.mean(correction_lengths), len(correction_lengths)),
            print "or %5.2f mm." % (pxlsz * flex.mean(correction_lengths))
        self.r_residual = pxlsz * flex.mean(correction_lengths)

        #assert len(indexed_pairs)>NEAR # must have enough indexed spots
        if (len(indexed_pairs) <= NEAR):
            raise Sorry("Not enough indexed spots, only found %d, need %d" %
                        (len(indexed_pairs), NEAR))

        reference = flex.double()
        for item in indexed_pairs:
            reference.append(spots[item["spot"]].ctr_mass_x())
            reference.append(spots[item["spot"]].ctr_mass_y())

        PS_adapt = AnnAdaptor(data=reference, dim=2, k=NEAR)
        PS_adapt.query(query)

        self.BSmasks = []
        #self.null_correction_mapping( predicted=self.predicted,
        #                                    correction_vectors = correction_vectors,
        #                                    IS_adapt = IS_adapt,
        #                                    spots = spots)
        self.positional_correction_mapping(
            predicted=self.predicted,
            correction_vectors=correction_vectors,
            PS_adapt=PS_adapt,
            IS_adapt=IS_adapt,
            spots=spots)

        # which spots are close enough to interfere with background?
        MAXOVER = 6
        OS_adapt = AnnAdaptor(data=query, dim=2, k=MAXOVER)  #six near nbrs
        OS_adapt.query(query)
        if self.mask_focus[image_number] is None:
            raise Sorry(
                "No observed/predicted spot agreement; no Spotfinder masks; skip integration"
            )
        nbr_cutoff = 2.0 * max(self.mask_focus[image_number])
        FRAME = int(nbr_cutoff / 2)
        #print "The overlap cutoff is %d pixels"%nbr_cutoff
        nbr_cutoff_sq = nbr_cutoff * nbr_cutoff

        #print "Optimized C++ section...",
        self.set_frame(FRAME)
        self.set_background_factor(kwargs["background_factor"])
        self.set_nbr_cutoff_sq(nbr_cutoff_sq)
        self.set_guard_width_sq(self.horizons_phil.integration.guard_width_sq)
        self.set_detector_gain(self.horizons_phil.integration.detector_gain)
        flex_sorted = flex.int()
        for item in self.sorted:
            flex_sorted.append(item[0])
            flex_sorted.append(item[1])

        if self.horizons_phil.integration.mask_pixel_value is not None:
            self.set_mask_pixel_val(
                self.horizons_phil.integration.mask_pixel_value)

        image_obj = self.imagefiles.imageindex(
            self.frame_numbers[self.image_number])
        image_obj.read()
        rawdata = image_obj.linearintdata  # assume image #1

        if self.inputai.active_areas != None:
            self.detector_xy_draft = self.safe_background(
                rawdata=rawdata,
                predicted=self.predicted,
                OS_adapt=OS_adapt,
                sorted=flex_sorted,
                tiles=self.inputai.active_areas.IT,
                tile_id=self.inputai.active_areas.tile_id)
        else:
            self.detector_xy_draft = self.safe_background(
                rawdata=rawdata,
                predicted=self.predicted,
                OS_adapt=OS_adapt,
                sorted=flex_sorted)
        for i in range(len(self.predicted)):  # loop over predicteds
            B_S_mask = {}
            keys = self.get_bsmask(i)
            for k in range(0, len(keys), 2):
                B_S_mask[(keys[k], keys[k + 1])] = True
            self.BSmasks.append(B_S_mask)
        #print "Done"
        return

コード例 #2

ファイルを表示

    def update_detail(self, horizon_phil, current_status, first_time_through,
                      verbose):
        assert (len(self.observed_spots) == len(self.predicted_spots))

        if horizon_phil.indexing.outlier_detection.verbose:
            classes = [
                str(current_status[i]) for i in range(len(self.observed_spots))
            ]
            class_types = set(classes)
            class_counts = dict([[item, classes.count(item)]
                                 for item in class_types])
            flex_counts = flex.int(class_counts.values())
            assert flex.sum(flex_counts) == len(self.observed_spots)
            #for pair in class_counts.items():
            #  print "%10s %6d"%pair
            #print "%10s %6d"%("TOTAL",len(self.observed_spots))
            if status_with_marked_outliers == None:
                # status_with_marked_outliers==None is shorthand for identifying the first run through
                print """After indexing on a subset of %d spots (from all images), %d were reclassified as
      either lying on the spindle, or potential overlapped spots or ice rings.""" % (
                    len(self.observed_spots),
                    len(self.observed_spots) - class_counts["GOOD"])
            else:
                print """Rerefinement on just the well-fit spots followed by spot reclassification
      leaves %d good spots on which to calculate a triclinic rmsd.""" % (
                    class_counts["GOOD"])

        # check good spots
        if (self.good is not None):
            match = 0
            for i in range(len(self.observed_spots)):
                if ((current_status[i] == SpotClass.GOOD) and self.good[i]):
                    match = match + 1
            if self.verbose:
                print "Number of GOOD spots matched with previous model =", match

        # calculate differences for all spots
        self.sorted_observed_spots = {}
        self.dr = flex.double()
        self.not_good_dr = flex.double()
        self.dx = [0.0 for i in range(len(self.observed_spots))]
        self.dy = [0.0 for i in range(len(self.observed_spots))]
        for i in range(len(self.observed_spots)):
            o = self.observed_spots[i]
            p = self.predicted_spots[i]
            self.dx[i] = o[0] - p[0]
            self.dy[i] = o[1] - p[1]
            self.sorted_observed_spots[math.sqrt(self.dx[i] * self.dx[i] +
                                                 self.dy[i] * self.dy[i])] = i

        # separate GOOD spots
        spotclasses = {
            SpotClass.GOOD: 0,
            SpotClass.SPINDLE: 0,
            SpotClass.OVERLAP: 0,
            SpotClass.ICE: 0,
            SpotClass.OUTLIER: 0,
            SpotClass.NONE: 0
        }
        for key in sorted(self.sorted_observed_spots.keys()):
            spotclass = current_status[self.sorted_observed_spots[key]]
            spotclasses[spotclass] += 1
            if (current_status[self.sorted_observed_spots[key]] ==
                    SpotClass.GOOD):
                self.dr.append(key)
            else:
                self.not_good_dr.append(key)
        if verbose:
            print ", ".join([
                "=".join([str(i[0]), "%d" % i[1]])
                for i in spotclasses.items()
            ]),
        totalsp = sum(
            [spotclasses.values()[iidx] for iidx in range(len(spotclasses))])
        if verbose:
            print "Total=%d" % (totalsp), "# observed spots", len(
                self.observed_spots)
        assert totalsp == len(
            self.observed_spots
        ), "Some spot pairs have the same predicted-observed distances. Do you have duplicated images?"

        self.x = flex.double(len(self.dr))
        for i in range(len(self.x)):
            self.x[i] = float(i) / float(len(self.x))

        limit = int(self.fraction * len(self.dr))
        if limit < 4:
            return  # Basic sanity check, need at least a few good spots to fit the distribution
        fitted_rayleigh = fit_cdf(x_data=self.dr[0:limit],
                                  y_data=self.x[0:limit],
                                  distribution=rayleigh_cpp)
        if False:
            y_data = self.x[0:limit]
            inv_cdf = [
                fitted_rayleigh.distribution.inv_cdf(cdf) for cdf in y_data
            ]
            from matplotlib import pyplot as plt
            plt.plot(self.dr[0:limit], self.x[0:limit], "r+")
            plt.plot(inv_cdf, y_data, "b.")
            plt.show()

        # store indices for spots used for fitting
        self.fraction_spot_indices = []
        for dr in self.dr[0:limit]:
            self.fraction_spot_indices.append(self.sorted_observed_spots[dr])

        # generate points for fitted distributions
        rayleigh_cdf_x = flex.double(range(500))
        rayleigh_cdf_x /= float(len(rayleigh_cdf_x))
        rayleigh_cdf = fitted_rayleigh.distribution.cdf(x=rayleigh_cdf_x)

        # generate points for pdf
        dr_bins, dr_histogram = make_histogram_data(data=self.dr, n_bins=100)
        rayleigh_pdf = flex.double(len(dr_bins))
        for i in range(len(dr_bins)):
            rayleigh_pdf[i] = fitted_rayleigh.distribution.pdf(x=dr_bins[i])
        rayleigh_pdf = rayleigh_pdf / flex.sum(rayleigh_pdf)
        dr_bins = flex.double(dr_bins)
        dr_histogram = flex.double(dr_histogram)

        # standard deviation for cdf
        sd = math.sqrt((4.0 - math.pi) / (2.0) * fitted_rayleigh.x[0] *
                       fitted_rayleigh.x[0])
        if self.verbose:
            print 'Standard deviation of Rayleigh fit = %4.3f' % sd
        sd_data = None
        radius_outlier_index = None
        limit_outlier = None
        # --- Quoted code superceeded by extension module call to find_green_bar
        """
    for i in range(len(rayleigh_cdf_x)):
      mx = rayleigh_cdf_x[i]
      my = rayleigh_cdf[i]
      for j in range(1,len(self.dr)):
        upper_x = self.dr[j]
        upper_y = self.x[j]
        lower_x = self.dr[j-1]
        lower_y = self.x[j-1]
        if ((my >= lower_y) and (my < upper_y)):
          if ((sd <= (upper_x - mx)) and ((lower_x - mx) > 0.0)):
            sd_data = ((mx,my),(lower_x,lower_y))
            radius_outlier_index = j-1
            limit_outlier = lower_x
            if self.verbose:print "Width of green bar = %4.3f"%(lower_x - mx)
            break
        if (sd_data is not None):
          break
    """
        from rstbx.indexing_api import find_green_bar
        green = find_green_bar(rayleigh_cdf_x=rayleigh_cdf_x,
                               rayleigh_cdf=rayleigh_cdf,
                               dr=self.dr,
                               x=self.x,
                               sd=sd)
        if green.is_set:
            #assert radius_outlier_index == green.radius_outlier_index
            #assert limit_outlier == green.limit_outlier
            #assert sd_data[0][0] == green.sd_mx
            #assert sd_data[0][1] == green.sd_my
            #assert sd_data[1][0] == green.sd_lower_x
            #assert sd_data[1][1] == green.sd_lower_y
            if self.verbose:
                print "Width of green bar = %4.3f" % (green.sd_lower_x -
                                                      green.sd_mx)
            radius_outlier_index = green.radius_outlier_index
            limit_outlier = green.limit_outlier
            sd_data = ((green.sd_mx, green.sd_my), (green.sd_lower_x,
                                                    green.sd_lower_y))

        if (radius_outlier_index is None):
            radius_outlier_index = len(self.dr)
        if (limit_outlier is None):
            limit_outlier = self.dr[-1]
        radius_95 = None
        for i in range(len(rayleigh_cdf)):
            if (rayleigh_cdf[i] >= 0.95):
                radius_95 = rayleigh_cdf_x[i]
                break
        if (radius_95 is None):
            radius_95 = rayleigh_cdf_x[-1]
        upper_circle = []
        lower_circle = []
        d_radius = 2.0 * radius_95 / 100.0
        x = -radius_95
        r2 = radius_95 * radius_95
        for i in range(100):
            y = math.sqrt(r2 - x * x)
            upper_circle.append((x, y))
            lower_circle.append((x, -y))
            x = x + d_radius
        y = 0.0
        upper_circle.append((x, y))
        lower_circle.append((x, -y))
        self.sqrtr2 = math.sqrt(r2)

        # color code dx dy
        dxdy_fraction = []
        dxdy_inliers = []
        dxdy_outliers = []

        limit = self.dr[int(self.fraction * len(self.dr))]

        trifold = dict(fraction=0, inlier=0, outlier=0, total=0)
        for key in self.dr:
            trifold["total"] += 1
            i = self.sorted_observed_spots[key]
            if (key < limit):
                trifold["fraction"] += 1
                if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
                         (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
                    dxdy_fraction.append((self.dx[i], self.dy[i]))
            elif (key < limit_outlier):
                trifold["inlier"] += 1
                if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
                         (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
                    dxdy_inliers.append((self.dx[i], self.dy[i]))
            else:
                trifold["outlier"] += 1
                if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
                         (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
                    dxdy_outliers.append((self.dx[i], self.dy[i]))
        if verbose:
            print ", ".join(
                ["=".join([str(i[0]), "%d" % i[1]]) for i in trifold.items()])

        # color code observed fractions
        o_fraction = []
        o_inliers = []
        o_outliers = []
        mr = format_data(x_data=rayleigh_cdf_x, y_data=rayleigh_cdf)
        limit = int(self.fraction * len(self.dr))
        for i in range(len(self.dr)):
            if (self.dr[i] <= 1.0):
                if (i < limit):
                    o_fraction.append((self.dr[i], self.x[i]))
                elif (i < radius_outlier_index):
                    o_inliers.append((self.dr[i], self.x[i]))
                else:
                    o_outliers.append((self.dr[i], self.x[i]))
        if horizon_phil.indexing.outlier_detection.verbose:
            o_outliers_for_severity = []
            for i in range(radius_outlier_index, len(self.dr)):
                o_outliers_for_severity.append((self.dr[i], self.x[i]))

        # limit data range
        for i in range(len(dr_bins)):
            if (dr_bins[i] > 1.0):
                dr_bins.resize(i)
                dr_histogram.resize(i)
                rayleigh_pdf.resize(i)
                break
        ho = format_data(x_data=dr_bins, y_data=dr_histogram)
        hr = format_data(x_data=dr_bins, y_data=rayleigh_pdf)

        # format data for graphing
        self.plot_dxdy_data = [
            dxdy_fraction, dxdy_inliers, dxdy_outliers, [(0.0, 0.0)], [], [],
            [], []
        ]

        self.framework = {
            4: dict(status=SpotClass.SPINDLE),
            5: dict(status=SpotClass.OVERLAP),
            6: dict(status=SpotClass.OUTLIER),
            7: dict(status=SpotClass.ICE),
        }
        for key in self.not_good_dr:
            i = self.sorted_observed_spots[key]
            status = current_status[i]
            if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
                     (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
                statuskey = [
                    k for k in self.framework.keys()
                    if self.framework[k]["status"] == status
                ][0]
                self.plot_dxdy_data[statuskey].append((self.dx[i], self.dy[i]))

        self.plot_cdf_data = [mr, o_fraction, o_inliers, o_outliers]
        if (sd_data is not None):
            self.plot_cdf_data.append(sd_data)
        self.plot_pdf_data = [ho, hr]

        # mark outliers
        if (first_time_through):  #i.e., first time through the update() method
            if (radius_outlier_index < len(self.dr)):
                for i in range(radius_outlier_index, len(self.dr)):
                    current_status[self.sorted_observed_spots[
                        self.dr[i]]] = SpotClass.OUTLIER

        # reset good spots
        self.good = [False for i in range(len(self.observed_spots))]
        for i in range(len(self.observed_spots)):
            if (current_status[i] == SpotClass.GOOD):
                self.good[i] = True

        count_outlier = 0
        count_good = 0
        for i in range(len(self.observed_spots)):
            if (current_status[i] == SpotClass.OUTLIER):
                count_outlier = count_outlier + 1
            elif (current_status[i] == SpotClass.GOOD):
                count_good = count_good + 1
        if self.verbose:            print 'Old GOOD =', len(self.dr),\
  'OUTLIER =', count_outlier,\
  'New GOOD =', count_good
        if horizon_phil.indexing.outlier_detection.verbose and status_with_marked_outliers is None:
            print "\nOf the remaining %d spots, %.1f%% were lattice outliers, leaving %d well-fit spots" % (
                len(self.dr), 100. * count_outlier / len(self.dr), count_good)
            if count_outlier == 0: return
            #width of green bar is sd
            delta_spread = o_outliers_for_severity[1][
                1] - o_outliers_for_severity[0][1]
            severity = 0.
            for item in o_outliers_for_severity:
                delta_r = item[0]  # obs - predicted deviation in mm
                spread = item[
                    1]  # order of observed deviation on a scale from 0 to 1
                # now invert the cdf to find expected delta r:
                expected_delta_r = fitted_rayleigh.distribution.sigma * math.sqrt(
                    -2. * math.log(1. - spread))
                #print item, expected_delta_r, (delta_r - expected_delta_r) / sd
                severity += ((delta_r - expected_delta_r) / sd)
            severity *= delta_spread
            print "The outlier severity is %.2f sigma [defined in J Appl Cryst (2010) 43, p.611 sec. 4].\n" % severity
        return current_status

コード例 #3

ファイルを表示

ファイル: outlier_detection.py プロジェクト: cctbx/cctbx-playground

  def update_detail(self,horizon_phil,current_status,first_time_through,verbose):
    assert(len(self.observed_spots) == len(self.predicted_spots))

    if horizon_phil.indexing.outlier_detection.verbose:
      classes=[str(current_status[i]) for i in xrange(len(self.observed_spots))]
      class_types = set(classes)
      class_counts = dict([[item,classes.count(item)] for item in class_types])
      flex_counts = flex.int(class_counts.values())
      assert flex.sum(flex_counts) == len(self.observed_spots)
      #for pair in class_counts.items():
      #  print "%10s %6d"%pair
      #print "%10s %6d"%("TOTAL",len(self.observed_spots))
      if status_with_marked_outliers == None:
        # status_with_marked_outliers==None is shorthand for identifying the first run through
        print """After indexing on a subset of %d spots (from all images), %d were reclassified as
      either lying on the spindle, or potential overlapped spots or ice rings."""%(
      len(self.observed_spots),len(self.observed_spots)-class_counts["GOOD"])
      else:
        print """Rerefinement on just the well-fit spots followed by spot reclassification
      leaves %d good spots on which to calculate a triclinic rmsd."""%(class_counts["GOOD"])

    # check good spots
    if (self.good is not None):
      match = 0
      for i in xrange(len(self.observed_spots)):
        if ((current_status[i] == SpotClass.GOOD) and self.good[i]):
          match = match + 1
      if self.verbose:print "Number of GOOD spots matched with previous model =",match

    # calculate differences for all spots
    self.sorted_observed_spots = {}
    self.dr = flex.double()
    self.not_good_dr = flex.double()
    self.dx = [0.0 for i in xrange(len(self.observed_spots))]
    self.dy = [0.0 for i in xrange(len(self.observed_spots))]
    for i in xrange(len(self.observed_spots)):
      o = self.observed_spots[i]
      p = self.predicted_spots[i]
      self.dx[i] = o[0] - p[0]
      self.dy[i] = o[1] - p[1]
      self.sorted_observed_spots[
        math.sqrt(self.dx[i]*self.dx[i] + self.dy[i]*self.dy[i])] = i

    # separate GOOD spots
    spotclasses = {SpotClass.GOOD:0,SpotClass.SPINDLE:0,SpotClass.OVERLAP:0,SpotClass.ICE:0,SpotClass.OUTLIER:0,SpotClass.NONE:0}
    for key in sorted(self.sorted_observed_spots.keys()):
      spotclass = current_status[self.sorted_observed_spots[key]]
      spotclasses[spotclass]+=1
      if (current_status[self.sorted_observed_spots[key]] == SpotClass.GOOD):
        self.dr.append(key)
      else:
        self.not_good_dr.append(key)
    if verbose: print ", ".join(["=".join([str(i[0]),"%d"%i[1]]) for i in spotclasses.items()]),
    totalsp = sum([spotclasses.values()[iidx] for iidx in xrange(len(spotclasses))])
    if verbose: print "Total=%d"%(totalsp),"# observed spots",len(self.observed_spots)
    assert totalsp == len(self.observed_spots)

    self.x = flex.double(len(self.dr))
    for i in xrange(len(self.x)):
      self.x[i] = float(i)/float(len(self.x))

    limit = int(self.fraction*len(self.dr))
    if limit < 4: return # Basic sanity check, need at least a few good spots to fit the distribution
    fitted_rayleigh = fit_cdf(x_data=self.dr[0:limit],
                              y_data=self.x[0:limit],distribution=rayleigh)
    if False:
        y_data=self.x[0:limit]
        inv_cdf = [fitted_rayleigh.distribution.inv_cdf(cdf) for cdf in y_data]
        from matplotlib import pyplot as plt
        plt.plot(self.dr[0:limit],self.x[0:limit],"r+")
        plt.plot(inv_cdf,y_data,"b.")
        plt.show()

    # store indices for spots used for fitting
    self.fraction_spot_indices = []
    for dr in self.dr[0:limit]:
      self.fraction_spot_indices.append(self.sorted_observed_spots[dr])

    # generate points for fitted distributions
    rayleigh_cdf_x = flex.double(500)
    for i in xrange(len(rayleigh_cdf_x)):
      rayleigh_cdf_x[i] = float(i)/float(len(rayleigh_cdf_x))
    rayleigh_cdf = flex.double(len(rayleigh_cdf_x))
    for i in xrange(len(rayleigh_cdf_x)):
      rayleigh_cdf[i] = fitted_rayleigh.distribution.cdf(x=rayleigh_cdf_x[i])

    # generate points for pdf
    dr_bins,dr_histogram = make_histogram_data(data=self.dr,n_bins=100)
    rayleigh_pdf = flex.double(len(dr_bins))
    for i in xrange(len(dr_bins)):
      rayleigh_pdf[i] = fitted_rayleigh.distribution.pdf(x=dr_bins[i])
    rayleigh_pdf = rayleigh_pdf/flex.sum(rayleigh_pdf)
    dr_bins = flex.double(dr_bins)
    dr_histogram = flex.double(dr_histogram)

    # standard deviation for cdf
    sd = math.sqrt((4.0-math.pi)/(2.0)*
                   fitted_rayleigh.x[0]*fitted_rayleigh.x[0])
    if self.verbose:print 'Standard deviation of Rayleigh fit = %4.3f'%sd
    sd_data = None
    radius_outlier_index = None
    limit_outlier = None
    for i in xrange(len(rayleigh_cdf_x)):
      mx = rayleigh_cdf_x[i]
      my = rayleigh_cdf[i]
      for j in xrange(1,len(self.dr)):
        upper_x = self.dr[j]
        upper_y = self.x[j]
        lower_x = self.dr[j-1]
        lower_y = self.x[j-1]
        if ((my >= lower_y) and (my < upper_y)):
          if ((sd <= (upper_x - mx)) and ((lower_x - mx) > 0.0)):
            sd_data = ((mx,my),(lower_x,lower_y))
            radius_outlier_index = j-1
            limit_outlier = lower_x
            if self.verbose:print "Width of green bar = %4.3f"%(lower_x - mx)
            break
        if (sd_data is not None):
          break
    if (radius_outlier_index is None):
      radius_outlier_index = len(self.dr)
    if (limit_outlier is None):
      limit_outlier = self.dr[-1]
    radius_95 = None
    for i in xrange(len(rayleigh_cdf)):
      if (rayleigh_cdf[i] >= 0.95):
        radius_95 = rayleigh_cdf_x[i]
        break
    if (radius_95 is None):
      radius_95 = rayleigh_cdf_x[-1]
    upper_circle = []
    lower_circle = []
    d_radius = 2.0*radius_95/100.0
    x = -radius_95
    r2 = radius_95*radius_95
    for i in xrange(100):
      y = math.sqrt(r2 - x*x)
      upper_circle.append((x,y))
      lower_circle.append((x,-y))
      x = x + d_radius
    y = 0.0
    upper_circle.append((x,y))
    lower_circle.append((x,-y))
    self.sqrtr2 = math.sqrt(r2)

    # color code dx dy
    dxdy_fraction = []
    dxdy_inliers = []
    dxdy_outliers = []

    limit = self.dr[int(self.fraction*len(self.dr))]

    trifold = dict(fraction=0,inlier=0,outlier=0,total=0)
    for key in self.dr:
      trifold["total"]+=1
      i = self.sorted_observed_spots[key]
      if (key < limit):
        trifold["fraction"]+=1
        if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
                 (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
          dxdy_fraction.append((self.dx[i],self.dy[i]))
      elif (key < limit_outlier):
        trifold["inlier"]+=1
        if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
                 (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
          dxdy_inliers.append((self.dx[i],self.dy[i]))
      else:
        trifold["outlier"]+=1
        if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
                 (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
          dxdy_outliers.append((self.dx[i],self.dy[i]))
    if verbose: print ", ".join(["=".join([str(i[0]),"%d"%i[1]]) for i in trifold.items()])

    # color code observed fractions
    o_fraction = []
    o_inliers = []
    o_outliers = []
    mr = format_data(x_data=rayleigh_cdf_x,y_data=rayleigh_cdf)
    limit = int(self.fraction*len(self.dr))
    for i in xrange(len(self.dr)):
      if (self.dr[i] <= 1.0):
        if (i < limit):
          o_fraction.append((self.dr[i],self.x[i]))
        elif (i < radius_outlier_index):
          o_inliers.append((self.dr[i],self.x[i]))
        else:
          o_outliers.append((self.dr[i],self.x[i]))
    if horizon_phil.indexing.outlier_detection.verbose:
      o_outliers_for_severity = []
      for i in xrange(radius_outlier_index, len(self.dr)):
        o_outliers_for_severity.append((self.dr[i],self.x[i]))

    # limit data range
    for i in xrange(len(dr_bins)):
      if (dr_bins[i] > 1.0):
        dr_bins.resize(i)
        dr_histogram.resize(i)
        rayleigh_pdf.resize(i)
        break
    ho = format_data(x_data=dr_bins,y_data=dr_histogram)
    hr = format_data(x_data=dr_bins,y_data=rayleigh_pdf)

    # format data for graphing
    self.plot_dxdy_data = [dxdy_fraction,dxdy_inliers,dxdy_outliers,
                           [(0.0,0.0)],[],[],[],[]]

    self.framework = {4:dict(status=SpotClass.SPINDLE),
                      5:dict(status=SpotClass.OVERLAP),
                      6:dict(status=SpotClass.OUTLIER),
                      7:dict(status=SpotClass.ICE),
    }
    for key in self.not_good_dr:
      i = self.sorted_observed_spots[key]
      status = current_status[i]
      if (not ((self.dx[i] > 1.0) or (self.dx[i] < -1.0) or
               (self.dy[i] > 1.0) or (self.dy[i] < -1.0))):
        statuskey = [k for k in self.framework.keys() if self.framework[k]["status"]==status][0]
        self.plot_dxdy_data[statuskey].append((self.dx[i],self.dy[i]))

    self.plot_cdf_data = [mr,o_fraction,o_inliers,o_outliers]
    if (sd_data is not None):
      self.plot_cdf_data.append(sd_data)
    self.plot_pdf_data = [ho,hr]

    # mark outliers
    if (first_time_through): #i.e., first time through the update() method
      if (radius_outlier_index < len(self.dr)):
        for i in xrange(radius_outlier_index,len(self.dr)):
          current_status[self.sorted_observed_spots[self.dr[i]]] = SpotClass.OUTLIER

    # reset good spots
    self.good = [False for i in xrange(len(self.observed_spots))]
    for i in xrange(len(self.observed_spots)):
      if (current_status[i] == SpotClass.GOOD):
        self.good[i] = True

    count_outlier = 0
    count_good = 0
    for i in xrange(len(self.observed_spots)):
      if (current_status[i] == SpotClass.OUTLIER):
        count_outlier = count_outlier + 1
      elif (current_status[i] == SpotClass.GOOD):
        count_good = count_good + 1
    if self.verbose:print 'Old GOOD =', len(self.dr),\
          'OUTLIER =', count_outlier,\
          'New GOOD =', count_good
    if horizon_phil.indexing.outlier_detection.verbose and status_with_marked_outliers is None:
      print "\nOf the remaining %d spots, %.1f%% were lattice outliers, leaving %d well-fit spots"%(
       len(self.dr),100.*count_outlier/len(self.dr), count_good )
      if count_outlier==0:return
      #width of green bar is sd
      delta_spread = o_outliers_for_severity[1][1]-o_outliers_for_severity[0][1]
      severity = 0.
      for item in o_outliers_for_severity:
        delta_r = item[0] # obs - predicted deviation in mm
        spread = item[1] # order of observed deviation on a scale from 0 to 1
        # now invert the cdf to find expected delta r:
        expected_delta_r = fitted_rayleigh.distribution.sigma * math.sqrt(
          -2.* math.log(1.-spread) )
        #print item, expected_delta_r, (delta_r - expected_delta_r) / sd
        severity += ((delta_r - expected_delta_r) / sd)
      severity *= delta_spread
      print "The outlier severity is %.2f sigma [defined in J Appl Cryst (2010) 43, p.611 sec. 4].\n"%severity
    return current_status

コード例 #4

ファイルを表示

ファイル: simple_integration.py プロジェクト: cctbx/cctbx-playground

  def integration_concept(self,image_number=0,cb_op_to_primitive=None,verbose=False,**kwargs):
    self.image_number = image_number
    NEAR = 10
    pxlsz = self.pixel_size
    self.get_predictions_accounting_for_centering(cb_op_to_primitive,**kwargs)
    FWMOSAICITY = self.inputai.getMosaicity()
    DOMAIN_SZ_ANG = kwargs.get("domain_size_ang",  self.__dict__.get("actual",0)  )
    refineflag = {True:0,False:1}[kwargs.get("domain_size_ang",0)==0]
    self.inputpd["symmetry"].show_summary(prefix="EXCURSION%1d REPORT FWMOS= %6.4f DOMAIN= %6.1f "%(refineflag,FWMOSAICITY,DOMAIN_SZ_ANG))
    from annlib_ext import AnnAdaptor
    self.cell = self.inputai.getOrientation().unit_cell()
    query = flex.double()
    for pred in self.predicted: # predicted spot coord in pixels
      query.append(pred[0]/pxlsz)
      query.append(pred[1]/pxlsz)
    self.reserve_hkllist_for_signal_search = self.hkllist

    reference = flex.double()
    spots = self.get_observations_with_outlier_removal()

    assert len(spots)>NEAR# Can't do spot/pred matching with too few spots
    for spot in spots:
      reference.append(spot.ctr_mass_x())
      reference.append(spot.ctr_mass_y())

    IS_adapt = AnnAdaptor(data=reference,dim=2,k=NEAR)
    IS_adapt.query(query)
    print "Calculate correction vectors for %d observations & %d predictions"%(len(spots),len(self.predicted))
    indexed_pairs_provisional = []
    correction_vectors_provisional = []
    c_v_p_flex = flex.vec3_double()
    idx_cutoff = float(min(self.mask_focus[image_number]))
    if verbose:
      print "idx_cutoff distance in pixels",idx_cutoff
    if not self.horizons_phil.integration.enable_one_to_one_safeguard:
     # legacy code, no safeguard against many-to-one predicted-to-observation mapping
     for i in xrange(len(self.predicted)): # loop over predicteds
      #for n in xrange(NEAR): # loop over near spotfinder spots
      for n in xrange(1): # only consider the nearest spotfinder spot
        Match = dict(spot=IS_adapt.nn[i*NEAR+n],pred=i)
        if n==0 and math.sqrt(IS_adapt.distances[i*NEAR+n]) < idx_cutoff:
          indexed_pairs_provisional.append(Match)

          vector = matrix.col(
            [spots[Match["spot"]].ctr_mass_x() - self.predicted[Match["pred"]][0]/pxlsz,
             spots[Match["spot"]].ctr_mass_y() - self.predicted[Match["pred"]][1]/pxlsz])
          correction_vectors_provisional.append(vector)
          c_v_p_flex.append((vector[0],vector[1],0.))
    else:
      one_to_one = {}
      for i in xrange(len(self.predicted)): # loop over predicteds
        annresultidx = i*NEAR
        obsidx = IS_adapt.nn[annresultidx]
        this_distancesq = IS_adapt.distances[annresultidx]
        if not one_to_one.has_key(obsidx) or \
           this_distancesq < one_to_one[obsidx]["distancesq"]:
           if math.sqrt(this_distancesq) < idx_cutoff:
             one_to_one[obsidx] = dict(spot=obsidx,pred=i,distancesq=this_distancesq)
      for key,value in one_to_one.items():
        indexed_pairs_provisional.append(value)
        vector = matrix.col(
            [spots[value["spot"]].ctr_mass_x() - self.predicted[value["pred"]][0]/pxlsz,
             spots[value["spot"]].ctr_mass_y() - self.predicted[value["pred"]][1]/pxlsz])
        correction_vectors_provisional.append(vector)
        c_v_p_flex.append((vector[0],vector[1],0.))

    print "... %d provisional matches"%len(correction_vectors_provisional),
    print "r.m.s.d. in pixels: %5.2f"%(math.sqrt(flex.mean(c_v_p_flex.dot(c_v_p_flex))))

    if self.horizons_phil.integration.enable_residual_scatter:
      from matplotlib import pyplot as plt
      fig = plt.figure()
      for cv in correction_vectors_provisional:
        plt.plot([cv[1]],[-cv[0]],"b.")
      plt.title(" %d matches, r.m.s.d. %5.2f pixels"%(len(correction_vectors_provisional),math.sqrt(flex.mean(c_v_p_flex.dot(c_v_p_flex)))))
      plt.axes().set_aspect("equal")
      self.show_figure(plt,fig,"res")
      plt.close()

    if self.horizons_phil.integration.enable_residual_map:
      from matplotlib import pyplot as plt
      fig = plt.figure()
      for match,cv in zip(indexed_pairs_provisional,correction_vectors_provisional):
        plt.plot([spots[match["spot"]].ctr_mass_y()],[-spots[match["spot"]].ctr_mass_x()],"r.")
        plt.plot([self.predicted[match["pred"]][1]/pxlsz],[-self.predicted[match["pred"]][0]/pxlsz],"g.")
        plt.plot([spots[match["spot"]].ctr_mass_y(), spots[match["spot"]].ctr_mass_y() + 10.*cv[1]],
                 [-spots[match["spot"]].ctr_mass_x(), -spots[match["spot"]].ctr_mass_x() - 10.*cv[0]],'b-')
      plt.xlim([0,float(self.inputpd["size2"])])
      plt.ylim([-float(self.inputpd["size1"]),0])
      plt.title(" %d matches, r.m.s.d. %5.2f pixels"%(len(correction_vectors_provisional),math.sqrt(flex.mean(c_v_p_flex.dot(c_v_p_flex)))))
      plt.axes().set_aspect("equal")
      self.show_figure(plt,fig,"map")
      plt.close()
    # insert code here to remove correction length outliers...
    # they are causing terrible
    # problems for finding legitimate correction vectors (print out the list)
    # also remove outliers for the purpose of reporting RMS
    outlier_rejection = True
    cache_refinement_spots = getattr(slip_callbacks.slip_callback,"requires_refinement_spots",False)
    if outlier_rejection:
      correction_lengths = flex.double([v.length() for v in correction_vectors_provisional])
      clorder = flex.sort_permutation(correction_lengths)
      sorted_cl = correction_lengths.select(clorder)

      ACCEPTABLE_LIMIT = 2
      limit = int(0.33 * len(sorted_cl)) # best 1/3 of data are assumed to be correctly modeled.
      if (limit <= ACCEPTABLE_LIMIT):
        raise Sorry("Not enough indexed spots to reject outliers; have %d need >%d" % (limit, ACCEPTABLE_LIMIT))

      y_data = flex.double(len(sorted_cl))
      for i in xrange(len(y_data)):
        y_data[i] = float(i)/float(len(y_data))

      # ideas are explained in Sauter & Poon (2010) J Appl Cryst 43, 611-616.
      from rstbx.outlier_spots.fit_distribution import fit_cdf,rayleigh
      fitted_rayleigh = fit_cdf(x_data = sorted_cl[0:limit],
                                y_data = y_data[0:limit],
                                distribution=rayleigh)

      inv_cdf = [fitted_rayleigh.distribution.inv_cdf(cdf) for cdf in y_data]

      #print "SORTED LIST OF ",len(sorted_cl), "with sigma",fitted_rayleigh.distribution.sigma
      indexed_pairs = []
      correction_vectors = []
      self.correction_vectors = []
      for icand in xrange(len(sorted_cl)):
        # somewhat arbitrary sigma = 1.0 cutoff for outliers
        if (sorted_cl[icand]-inv_cdf[icand])/fitted_rayleigh.distribution.sigma > 1.0:
          break
        indexed_pairs.append(indexed_pairs_provisional[clorder[icand]])
        correction_vectors.append(correction_vectors_provisional[clorder[icand]])
        if cache_refinement_spots:
          self.spotfinder.images[self.frame_numbers[self.image_number]]["refinement_spots"].append(
          spots[indexed_pairs[-1]["spot"]])
        if kwargs.get("verbose_cv")==True:
            print "CV OBSCENTER %7.2f %7.2f REFINEDCENTER %7.2f %7.2f"%(
              float(self.inputpd["size1"])/2.,float(self.inputpd["size2"])/2.,
              self.inputai.xbeam()/pxlsz, self.inputai.ybeam()/pxlsz),
            print "OBSSPOT %7.2f %7.2f PREDSPOT %7.2f %7.2f"%(
              spots[indexed_pairs[-1]["spot"]].ctr_mass_x(),
              spots[indexed_pairs[-1]["spot"]].ctr_mass_y(),
              self.predicted[indexed_pairs[-1]["pred"]][0]/pxlsz,
              self.predicted[indexed_pairs[-1]["pred"]][1]/pxlsz),
            the_hkl = self.hkllist[indexed_pairs[-1]["pred"]]
            print "HKL %4d %4d %4d"%the_hkl,"%2d"%self.setting_id,
            radial, azimuthal = spots[indexed_pairs[-1]["spot"]].get_radial_and_azimuthal_size(
              self.inputai.xbeam()/pxlsz, self.inputai.ybeam()/pxlsz)
            print "RADIALpx %5.3f AZIMUTpx %5.3f"%(radial,azimuthal)

        # Store a list of correction vectors in self.
        radial, azimuthal = spots[indexed_pairs[-1]['spot']].get_radial_and_azimuthal_size(
          self.inputai.xbeam()/pxlsz, self.inputai.ybeam()/pxlsz)
        self.correction_vectors.append(
          dict(obscenter=(float(self.inputpd['size1']) / 2,
                          float(self.inputpd['size2']) / 2),
               refinedcenter=(self.inputai.xbeam() / pxlsz,
                              self.inputai.ybeam() / pxlsz),
               obsspot=(spots[indexed_pairs[-1]['spot']].ctr_mass_x(),
                        spots[indexed_pairs[-1]['spot']].ctr_mass_y()),
               predspot=(self.predicted[indexed_pairs[-1]['pred']][0] / pxlsz,
                         self.predicted[indexed_pairs[-1]['pred']][1] / pxlsz),
               hkl=(self.hkllist[indexed_pairs[-1]['pred']][0],
                    self.hkllist[indexed_pairs[-1]['pred']][1],
                    self.hkllist[indexed_pairs[-1]['pred']][2]),
               setting_id=self.setting_id,
               radial=radial,
               azimuthal=azimuthal))

      print "After outlier rejection %d indexed spotfinder spots remain."%len(indexed_pairs)
      if False:
        rayleigh_cdf = [
          fitted_rayleigh.distribution.cdf(x=sorted_cl[c]) for c in xrange(len(sorted_cl))]
        from matplotlib import pyplot as plt
        plt.plot(sorted_cl,y_data,"r+")
        #plt.plot(sorted_cl,rayleigh_cdf,"g.")
        plt.plot(inv_cdf,y_data,"b.")
        plt.show()
    else:
      indexed_pairs = indexed_pairs_provisional
      correction_vectors = correction_vectors_provisional
    ########### finished with outlier rejection

    self.inputpd["symmetry"].show_summary(prefix="SETTING ")

    is_triclinic = (self.setting_id==1)
    if is_triclinic:
      self.triclinic_pairs = [ dict(pred=self.hkllist[a["pred"]],spot=a["spot"])
        for a in indexed_pairs ]

    if self.horizons_phil.integration.model == "user_supplied":
      if kwargs.get("user-reentrant",None)==None:
        from cxi_user import post_outlier_rejection
        self.indexed_pairs = indexed_pairs
        self.spots = spots
        post_outlier_rejection(self,image_number,cb_op_to_primitive,self.horizons_phil,kwargs)
        return

    ########### finished with user-supplied code

    if self.horizons_phil.integration.spot_shape_verbose:
        from rstbx.new_horizons.spot_shape import spot_shape_verbose
        spot_shape_verbose(rawdata = self.imagefiles.images[self.image_number].linearintdata,
           beam_center_pix = matrix.col((self.inputai.xbeam()/pxlsz, self.inputai.ybeam()/pxlsz)),
           indexed_pairs = indexed_pairs,
           spotfinder_observations = spots,
           distance_mm = self.inputai.distance(),
           mm_per_pixel = pxlsz,
           hkllist = self.hkllist,
           unit_cell = self.cell,
           wavelength_ang = self.inputai.wavelength
        )

    #Other checks to be implemented (future):
    # spot is within active area of detector on a circular detector such as the Mar IP
    # integration masks do not overlap; or deconvolute

    correction_lengths=flex.double([v.length() for v in correction_vectors])
    if verbose:
      print "average correction %5.2f over %d vectors"%(flex.mean(correction_lengths),
      len(correction_lengths)),
      print "or %5.2f mm."%(pxlsz*flex.mean(correction_lengths))
    self.r_residual = pxlsz*flex.mean(correction_lengths)

    #assert len(indexed_pairs)>NEAR # must have enough indexed spots
    if (len(indexed_pairs) <= NEAR):
      raise Sorry("Not enough indexed spots, only found %d, need %d" % (len(indexed_pairs), NEAR))

    reference = flex.double()
    for item in indexed_pairs:
      reference.append(spots[item["spot"]].ctr_mass_x())
      reference.append(spots[item["spot"]].ctr_mass_y())

    PS_adapt = AnnAdaptor(data=reference,dim=2,k=NEAR)
    PS_adapt.query(query)

    self.BSmasks = []
    #self.null_correction_mapping( predicted=self.predicted,
    #                                    correction_vectors = correction_vectors,
    #                                    IS_adapt = IS_adapt,
    #                                    spots = spots)
    self.positional_correction_mapping( predicted=self.predicted,
                                        correction_vectors = correction_vectors,
                                        PS_adapt = PS_adapt,
                                        IS_adapt = IS_adapt,
                                        spots = spots)

    # which spots are close enough to interfere with background?
    MAXOVER=6
    OS_adapt = AnnAdaptor(data=query,dim=2,k=MAXOVER) #six near nbrs
    OS_adapt.query(query)
    if self.mask_focus[image_number] is None:
      raise Sorry("No observed/predicted spot agreement; no Spotfinder masks; skip integration")
    nbr_cutoff = 2.0* max(self.mask_focus[image_number])
    FRAME = int(nbr_cutoff/2)
    #print "The overlap cutoff is %d pixels"%nbr_cutoff
    nbr_cutoff_sq = nbr_cutoff * nbr_cutoff

    #print "Optimized C++ section...",
    self.set_frame(FRAME)
    self.set_background_factor(kwargs["background_factor"])
    self.set_nbr_cutoff_sq(nbr_cutoff_sq)
    self.set_guard_width_sq(self.horizons_phil.integration.guard_width_sq)
    self.set_detector_gain(self.horizons_phil.integration.detector_gain)
    flex_sorted = flex.int()
    for item in self.sorted:
      flex_sorted.append(item[0]);flex_sorted.append(item[1]);

    if self.horizons_phil.integration.mask_pixel_value is not None:
      self.set_mask_pixel_val(self.horizons_phil.integration.mask_pixel_value)

    image_obj = self.imagefiles.imageindex(self.frame_numbers[self.image_number])
    image_obj.read()
    rawdata = image_obj.linearintdata # assume image #1

    if self.inputai.active_areas != None:
      self.detector_xy_draft = self.safe_background( rawdata=rawdata,
                          predicted=self.predicted,
                          OS_adapt=OS_adapt,
                          sorted=flex_sorted,
                          tiles=self.inputai.active_areas.IT,
                          tile_id=self.inputai.active_areas.tile_id);
    else:
      self.detector_xy_draft = self.safe_background( rawdata=rawdata,
                          predicted=self.predicted,
                          OS_adapt=OS_adapt,
                          sorted=flex_sorted);
    for i in xrange(len(self.predicted)): # loop over predicteds
      B_S_mask = {}
      keys = self.get_bsmask(i)
      for k in xrange(0,len(keys),2):
        B_S_mask[(keys[k],keys[k+1])]=True
      self.BSmasks.append(B_S_mask)
    #print "Done"
    return