Example #1
    def scale_all(self):
        t1 = time.time()

        self.read_all_mysql()
        self.millers = self.millers_mysql
        self.frames = self.frames_mysql
        self._frames = self._frames_mysql
        self.observations = self.observations_mysql
        self._observations = self._observations_mysql
        if self.params.model is None:
            self.n_accepted = len(self.frames["cc"])
            self.n_low_corr = 0
            self.those_accepted = flex.bool(self.n_accepted)
        else:
            self.n_accepted = (self.frames["cc"] >
                               self.params.min_corr).count(True)
            self.n_low_corr = (self.frames["cc"] >
                               self.params.min_corr).count(False)
            self.those_accepted = (self.frames["cc"] > self.params.min_corr)
            statsy = flex.mean_and_variance(self.frames["cc"])
            print("%5d images, individual image correlation coefficients are %6.3f +/- %5.3f" % (
                len(self.frames["cc"]),
                statsy.mean(),
                statsy.unweighted_sample_standard_deviation(),
            ), file=self.log)
        if self.params.scaling.report_ML and "half_mosaicity_deg" in self.frames:
            mosaic = self.frames["half_mosaicity_deg"].select(
                self.those_accepted)
            Mstat = flex.mean_and_variance(mosaic)
            print("%5d images, half mosaicity is %6.3f +/- %5.3f degrees" % (
                len(mosaic), Mstat.mean(),
                Mstat.unweighted_sample_standard_deviation()), file=self.log)
            domain = self.frames["domain_size_ang"].select(self.those_accepted)
            Dstat = flex.mean_and_variance(domain)
            print("%5d images, domain size is %6.0f +/- %5.0f Angstroms" % (
                len(domain), Dstat.mean(),
                Dstat.unweighted_sample_standard_deviation()), file=self.log)

            invdomain = 1. / domain
            Dstat = flex.mean_and_variance(invdomain)
            print("%5d images, inverse domain size is %f +/- %f 1/Angstroms" % (
                len(domain), Dstat.mean(),
                Dstat.unweighted_sample_standard_deviation()), file=self.log)
            print("%5d images, domain size is %6.0f +/- %5.0f Angstroms" % (
                len(domain), 1. / Dstat.mean(),
                1. / Dstat.unweighted_sample_standard_deviation()), file=self.log)

        t2 = time.time()
        print("", file=self.log)
        print("#" * 80, file=self.log)
        print("FINISHED MERGING", file=self.log)
        print("  Elapsed time: %.1fs" % (t2 - t1), file=self.log)
        print("  %d integration files were accepted" % self.n_accepted,
              file=self.log)
        print("  %d rejected due to poor correlation" % self.n_low_corr,
              file=self.log)
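
Every example on this page centres on scitbx's flex.mean_and_variance. As a minimal sketch of the basic pattern used above (assuming a working cctbx/scitbx installation; the toy values are mine):

from scitbx.array_family import flex

values = flex.double([1.0, 2.0, 3.0, 4.0])
stats = flex.mean_and_variance(values)
# mean() and unweighted_sample_standard_deviation() are the two
# accessors used throughout these examples
print("%.3f +/- %.3f" % (stats.mean(),
                         stats.unweighted_sample_standard_deviation()))
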
Example #2
 def print_table(self):
  from libtbx import table_utils
  from libtbx.str_utils import format_value

  table_header = ["Tile","Dist","Nobs","aRmsd","Rmsd","delx","dely","disp","rotdeg","Rsigma","Tsigma"]
  table_data = []
  table_data.append(table_header)
  sort_radii = flex.sort_permutation(flex.double(self.radii))
  tile_rmsds = flex.double()
  radial_sigmas = flex.double(len(self.tiles) // 4)
  tangen_sigmas = flex.double(len(self.tiles) // 4)
  for idx in range(len(self.tiles) // 4):
    x = sort_radii[idx]
    if self.tilecounts[x] < 3:
      wtaveg = 0.0
      radial = (0,0)
      tangential = (0,0)
      rmean,tmean,rsigma,tsigma=(0,0,1,1)
    else:
      wtaveg = self.weighted_average_angle_deg_from_tile(x)
      radial,tangential,rmean,tmean,rsigma,tsigma = get_radial_tangential_vectors(self,x)

    radial_sigmas[x]=rsigma
    tangen_sigmas[x]=tsigma
    table_data.append(  [
      format_value("%3d",   x),
      format_value("%7.2f", self.radii[x]),
      format_value("%6d",  self.tilecounts[x]),
      format_value("%5.2f", self.asymmetric_tile_rmsd[x]),
      format_value("%5.2f", self.tile_rmsd[x]),
      format_value("%5.2f", self.mean_cv[x][0]),
      format_value("%5.2f", self.mean_cv[x][1]),
      format_value("%5.2f", matrix.col(self.mean_cv[x]).length()),
      format_value("%6.2f", wtaveg),
      format_value("%6.2f", rsigma),
      format_value("%6.2f", tsigma),
    ])
  table_data.append([""]*len(table_header))
  rstats = flex.mean_and_variance(radial_sigmas,self.tilecounts.as_double())
  tstats = flex.mean_and_variance(tangen_sigmas,self.tilecounts.as_double())
  table_data.append(  [
      format_value("%3s",   "ALL"),
      format_value("%s", ""),
      format_value("%6d",  self.overall_N),
      format_value("%5.2f", math.sqrt(flex.mean(self.delrsq))),
      format_value("%5.2f", self.overall_rmsd),
      format_value("%5.2f", self.overall_cv[0]),
      format_value("%5.2f", self.overall_cv[1]),
      format_value("%5.2f", flex.mean(flex.double([matrix.col(cv).length() for cv in self.mean_cv]))),
      format_value("%s", ""),
      format_value("%6.2f", rstats.mean()),
      format_value("%6.2f", tstats.mean()),
    ])

  print()
  print(table_utils.format(table_data, has_header=1, justify='center', delim=" "))
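
The rstats/tstats lines above pass a second array of weights; flex.mean_and_variance then returns weighted statistics. A small sketch of that two-argument form (the numbers are illustrative only):

from scitbx.array_family import flex

sigmas = flex.double([0.8, 1.2, 1.0])
weights = flex.double([10.0, 3.0, 7.0])  # e.g. per-tile observation counts
wstats = flex.mean_and_variance(sigmas, weights)
print("weighted mean: %.2f" % wstats.mean())
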
Example #4
    def load_data(self, chi_angles=None):
        # reorganize data from
        # a list of all dihedrals for each atom_group per model
        # to
        # atom_group -> list of grouped dihedral angles (e.g. all chi1 in one list)
        if (chi_angles is not None):
            n_models = len(chi_angles)
            n_groups = len(chi_angles[0]['id_str'])

            id_str = chi_angles[0]['id_str']
            xyz = chi_angles[0]['xyz']
            id_str_map = dict()
            all_angles = [list() for i in range(n_groups)]
            for i in range(n_groups):  # loop over atom_groups
                n_dihedrals = len(chi_angles[0]['chi_angles'][i])
                angles = [list() for j in range(n_dihedrals)]
                for j in range(n_models):
                    dihedrals = chi_angles[j]['chi_angles'][i]
                    for k in range(n_dihedrals):
                        dihedral = dihedrals[k]
                        if (dihedral is not None):
                            angles[k].append(dihedral)

                # check if adding 360 to negative angles is helpful
                # avoids issues where angles are clustered only near 0 and 360
                for j in range(len(angles)):
                    if (len(angles[j]) > 0):
                        stddev_a = flex.mean_and_variance(
                            flex.double(
                                angles[j])).unweighted_sample_variance()
                        dihedrals_b = flex.double(angles[j])
                        for k in range(len(dihedrals_b)):
                            if (dihedrals_b[k] < 0.0):
                                dihedrals_b[k] += 360
                        stddev_b = flex.mean_and_variance(
                            dihedrals_b).unweighted_sample_variance()
                        if ((stddev_b < stddev_a) or (max(angles[j]) < 0.0)):
                            angles[j] = list(dihedrals_b)

                # store reorganized data
                all_angles[i] = angles

                # build map matching id_str with row index for fast random access
                id_str_map[id_str[i]] = i

            self.chi_angles = {
                'id_str': id_str,
                'id_str_map': id_str_map,
                'xyz': xyz,
                'values': all_angles
            }
            self.meets_threshold = [
                False for i in range(len(self.chi_angles['id_str']))
            ]
            self.UpdateTable()
            self.UpdatePlot()
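
The 360-degree shift test above is worth isolating: for angles clustered around +/-180 degrees, the raw sample variance is huge, while the shifted copy is tight. A standalone sketch of just that variance comparison (the helper name is mine, not from the source):

from scitbx.array_family import flex

def unwrap_if_tighter(angles_deg):
    # keep whichever representation has the smaller sample variance
    a = flex.double(angles_deg)
    b = a.deep_copy()
    for i in range(len(b)):
        if b[i] < 0.0:
            b[i] += 360.0
    var_a = flex.mean_and_variance(a).unweighted_sample_variance()
    var_b = flex.mean_and_variance(b).unweighted_sample_variance()
    return b if var_b < var_a else a

print(list(unwrap_if_tighter([-179.0, 179.0, 178.0])))  # -> values near 180
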
Example #5
def exercise_optimise_shelxl_weights():
    def calc_goof(fo2, fc, w, k, n_params):
        fc2 = fc.as_intensity_array()
        w = w(fo2.data(), fo2.sigmas(), fc2.data(), k)
        return math.sqrt(
            flex.sum(w * flex.pow2(fo2.data() - k * fc2.data())) /
            (fo2.size() - n_params))

    xs = smtbx.development.sucrose()
    k = 0.05 + 10 * flex.random_double()
    fc = xs.structure_factors(anomalous_flag=False, d_min=0.7).f_calc()
    fo = fc.as_amplitude_array()
    fo = fo.customized_copy(data=fo.data() * math.sqrt(k))
    fo = fo.customized_copy(sigmas=0.03 * fo.data())
    sigmas = fo.sigmas()
    for i in range(fo.size()):
        fo.data()[i] += 2 * scitbx.random.variate(
          scitbx.random.normal_distribution(sigma=sigmas[i]))() \
          + 0.5*random.random()
    fo2 = fo.as_intensity_array()
    fc2 = fc.as_intensity_array()
    w = least_squares.mainstream_shelx_weighting(a=0.1)
    s = calc_goof(fo2, fc, w, k, xs.n_parameters())
    w2 = w.optimise_parameters(fo2, fc2, k, xs.n_parameters())
    s2 = calc_goof(fo2, fc, w2, k, xs.n_parameters())
    # sort data and setup binning by fc/fc_max
    fc_sq = fc.as_intensity_array()
    fc_sq_over_fc_sq_max = fc_sq.data() / flex.max(fc_sq.data())
    permutation = flex.sort_permutation(fc_sq_over_fc_sq_max)
    fc_sq_over_fc_sq_max = fc_sq.customized_copy(
        data=fc_sq_over_fc_sq_max).select(permutation)
    fc_sq = fc_sq.select(permutation)
    fo_sq = fo2.select(permutation)
    n_bins = 10
    bin_max = 0
    bin_limits = flex.size_t(1, 0)
    bin_count = flex.size_t()
    for i in range(n_bins):
        bin_limits.append(int(math.ceil((i + 1) * fc_sq.size() / n_bins)))
        bin_count.append(bin_limits[i + 1] - bin_limits[i])
    goofs_w = flex.double()
    goofs_w2 = flex.double()
    for i_bin in range(n_bins):
        sel = flex.size_t_range(bin_limits[i_bin], bin_limits[i_bin + 1])
        goofs_w2.append(
            calc_goof(fo_sq.select(sel), fc_sq.select(sel), w2, k,
                      xs.n_parameters()))
        goofs_w.append(
            calc_goof(fo_sq.select(sel), fc_sq.select(sel), w, k,
                      xs.n_parameters()))
    a = flex.mean_and_variance(goofs_w).unweighted_sample_variance()
    b = flex.mean_and_variance(goofs_w2).unweighted_sample_variance()
    assert a > b or abs(1 - s) > abs(1 - s2)
    assert a > b  # flat analysis of variance
    assert abs(1 - s) > abs(1 - s2)  # GooF close to 1
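
For reference, calc_goof here is the SHELX goodness of fit, GooF = sqrt(sum(w * (Fo^2 - k*Fc^2)^2) / (n - p)); the closing assertions check that the optimised weights both flatten the GooF across resolution bins (smaller variance of the per-bin GooFs) and pull the overall GooF closer to 1.
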
Example #9
 def calculate_solvent_mask(self):
     # calculate mask
     lsd = local_standard_deviation_map(
         self.map_coeffs,
         self.radius,
         mean_solvent_density=self.mean_solvent_density,
         symmetry_flags=maptbx.use_space_group_symmetry,
         resolution_factor=self.params.grid_resolution_factor,
         method=2)
     self.rms_map = lsd.map
     self.mask = lsd.mask(self.params.solvent_fraction)
     # setup solvent/protein selections
     self.solvent_selection = (self.mask == 1)
     self.protein_selection = (self.mask == 0)
     self.solvent_iselection = self.solvent_selection.iselection()
     self.protein_iselection = self.protein_selection.iselection()
     self.n_solvent_grid_points = self.mask.count(1)
     self.n_protein_grid_points = self.mask.count(0)
     # map statistics
     self.mean_protein_density = self.mean_protein_density_start = flex.mean(
         self.map.select(self.protein_iselection))
     self.mean_solvent_density = self.mean_solvent_density_start = flex.mean(
         self.map.select(self.solvent_iselection))
     self.mask_percent = self.n_solvent_grid_points / self.mask.size() * 100
     self.f000_over_v = (
         (1 / self.params.protein_solvent_ratio) * self.mean_protein_density
         - self.mean_solvent_density
     ) * (self.params.protein_solvent_ratio /
          (self.params.protein_solvent_ratio - 1))
     self.rms_protein_density = rms(self.map.select(
         self.protein_iselection))
     self.rms_solvent_density = rms(self.map.select(
         self.solvent_iselection))
     self.standard_deviation_local_rms = flex.mean_and_variance(
         lsd.map.as_1d()).unweighted_sample_standard_deviation()
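
Note the as_1d() in the last statement: flex.mean_and_variance expects a one-dimensional array, so grid-accessed maps are flattened first. A toy sketch of the same idiom (the values are arbitrary):

from scitbx.array_family import flex

m = flex.double([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
m.reshape(flex.grid(2, 3))  # pretend this is a 2x3 map
sd = flex.mean_and_variance(m.as_1d()).unweighted_sample_standard_deviation()
print(sd)
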
Example #11
    def UpdateTable(self, event=None):
        '''Construct table of residues that satisfy the threshold.'''

        # check threshold
        threshold = self.threshold_control.GetValue()
        threshold_error = 'Please enter a fraction, between 0.0 and 1.0, for the threshold.'
        try:
            threshold = float(threshold)
        except ValueError:
            raise Sorry(threshold_error)
        if ((threshold < 0.0) or (threshold > 1.0)):
            raise Sorry(threshold_error)
        threshold = 1.0 - threshold

        # check if dihedrals lie within 2 standard deviations of the mean.
        # for Gaussian distributions, 95% of the sample is within 2 standard
        # deviations of the mean (default threshold value)
        # set atom_group to be displayed if fraction is below threshold.
        # actual check is n_outliers/n_total > (1 - input_threshold)
        for i in range(len(self.chi_angles['id_str'])):  # loop over model
            self.meets_threshold[i] = False
            for j in range(len(
                    self.chi_angles['values'][i])):  # loop over group
                dihedrals = flex.double(self.chi_angles['values'][i][j])
                if (len(dihedrals) > 0):
                    mean_stddev = flex.mean_and_variance(dihedrals)
                    mean = mean_stddev.mean()
                    stddev = mean_stddev.unweighted_sample_standard_deviation()
                    n_outliers = 0
                    chi_min = mean - 2.0 * stddev
                    chi_max = mean + 2.0 * stddev
                    for k in range(len(dihedrals)):
                        if ((dihedrals[k] < chi_min)
                                or (dihedrals[k] > chi_max)):
                            n_outliers += 1
                    is_outlier = (float(n_outliers) / len(dihedrals) >
                                  threshold)
                    if (is_outlier):
                        self.meets_threshold[i] = True
                        break

        # refresh table
        table_data = list()
        for i in range(len(self.meets_threshold)):
            if (self.meets_threshold[i]):
                row = ['---' for j in range(8)]
                row[0] = self.chi_angles['id_str'][i]
                row[6] = None  # sel_str for model viewer
                row[7] = self.chi_angles['xyz'][i]  # xyz for model viewer
                for j in range(len(self.chi_angles['values'][i])):
                    chi = self.chi_angles['values'][i][j]
                    if ((None not in chi) and (len(chi) > 0)):
                        row[j + 1] = flex.mean(flex.double(chi))
                table_data.append(row)
        self.table.ReloadData(table_data)
        self.table.Refresh()
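
The core test in UpdateTable reduces to: flag a group when the fraction of dihedrals outside mean +/- 2*sigma exceeds 1 - threshold. A compact sketch of just that check (the function name is mine):

from scitbx.array_family import flex

def fraction_outside_2_sigma(values):
    v = flex.double(values)
    mv = flex.mean_and_variance(v)
    mean = mv.mean()
    stddev = mv.unweighted_sample_standard_deviation()
    n_out = 0
    for x in v:
        if x < mean - 2.0 * stddev or x > mean + 2.0 * stddev:
            n_out += 1
    return n_out / len(v)

print(fraction_outside_2_sigma([10.0, 11.0, 9.0, 10.5, 9.5, 100.0]))  # 1/6
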
Example #12
 def show_summary(self):
   w = flex.double([e.beam.get_wavelength() for e in self.experiments])
   stats = flex.mean_and_variance(w)
   print("Wavelength mean and standard deviation:",
         stats.mean(), stats.unweighted_sample_standard_deviation())
   uc = [e.crystal.get_unit_cell().parameters() for e in self.experiments]
   a = flex.double([u[0] for u in uc])
   stats = flex.mean_and_variance(a)
   print("Unit cell a mean and standard deviation:",
         stats.mean(), stats.unweighted_sample_standard_deviation())
   b = flex.double([u[1] for u in uc])
   stats = flex.mean_and_variance(b)
   print("Unit cell b mean and standard deviation:",
         stats.mean(), stats.unweighted_sample_standard_deviation())
   c = flex.double([u[2] for u in uc])
   stats = flex.mean_and_variance(c)
   print("Unit cell c mean and standard deviation:",
         stats.mean(), stats.unweighted_sample_standard_deviation())
   d = flex.double([e.crystal.domain_size for e in self.experiments])
   stats = flex.mean_and_variance(d)
   # NOTE XXX FIXME:  cxi.index seems to record the half-domain size; report here the full domain size
   print("Domain size mean and standard deviation:",
         2. * stats.mean(), 2. * stats.unweighted_sample_standard_deviation())
Example #13
  def compute_functional_and_gradients(self):
    """The compute_functional_and_gradients() function

    @return Two-tuple of the value of the functional, and an
            <code>n</code>-long vector with the values of the
            gradients at the current position
    """

    #from libtbx.development.timers import Profiler
    from xfel import compute_functional_and_gradients

    n_frames = len(self._subset)

    #p = Profiler("compute_functional_and_gradients [C++]")
    (f, g) = compute_functional_and_gradients(
      self.x, self.w, n_frames, self._observations)
    #del p

    # XXX Only output this every 100 iterations or so.
    scales = self.x[0:len(self._subset)]
    stats = flex.mean_and_variance(scales)
    print "* f =% 10.4e, g =% f+/-%f" % (
      math.sqrt(f),
      stats.mean(),
      stats.unweighted_sample_standard_deviation())

    # Warn if there are non_positive per-frame scaling factors.
    scales_non_positive = scales.select(scales <= 1e-6) # XXX Or just zero!
    if len(scales_non_positive) > 0:
      stats = flex.mean_and_variance(scales_non_positive)
      if len(scales_non_positive) > 1:
        sigma = stats.unweighted_sample_standard_deviation()
      else:
        sigma = 0
      print "Have %d non-positive per-frame scaling factors: " \
        "%f+/-%f [%f, %f]" % (
          len(scales_non_positive),
          stats.mean(),
          sigma,
          flex.min(scales_non_positive),
          flex.max(scales_non_positive))

    return (f, g)
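
One detail worth noting: the len(scales_non_positive) > 1 guard is required because unweighted_sample_standard_deviation divides by n - 1, which is undefined for a single observation; the code substitutes sigma = 0 in that case.
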
Example #16
 def _show_each(edges):
   for edge, ref_edge, label in zip(edges, ref_edges, labels):
     h = flex.histogram(edge, n_slots=n_slots)
     smin, smax = flex.min(edge), flex.max(edge)
     stats = flex.mean_and_variance(edge)
     print("  %s edge" % label, file=out)
     print("     range:     %6.2f - %.2f" % (smin, smax), file=out)
     print("     mean:      %6.2f +/- %6.2f on N = %d" % (
       stats.mean(), stats.unweighted_sample_standard_deviation(),
       edge.size()), file=out)
     print("     reference: %6.2f" % ref_edge, file=out)
     h.show(f=out, prefix="    ", format_cutoffs="%6.2f")
     print("", file=out)
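
flex.histogram pairs naturally with flex.mean_and_variance in summaries like this one; a minimal standalone sketch of the calls used above:

import sys
from scitbx.array_family import flex

edge = flex.double([1.0, 1.5, 2.0, 2.5, 3.0])
h = flex.histogram(edge, n_slots=4)
h.show(f=sys.stdout, prefix="    ", format_cutoffs="%6.2f")
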
Example #17
def get_gaussian_rho(Dij, d_c):
    NN = Dij.focus()[0]
    rho = flex.double(NN)
    mu = flex.mean(Dij.as_1d())
    sigma = flex.mean_and_variance(
        Dij.as_1d()).unweighted_sample_standard_deviation()
    for i in range(NN):
        for j in range(NN):
            z = (Dij[i * NN + j] - mu) / sigma
            # print(z, 'AA')  # debug output; if enabled this prints NN*NN lines
            rho[i] += math.exp(-z * z)
    return rho
Example #20
    def __init__(self,
                 datadir,
                 work_params,
                 plot=False,
                 esd_plot=False,
                 half_data_flag=0):
        casetag = work_params.output.prefix
        # read the ground truth values back in
        import pickle
        # it is assumed (for now) that the reference millers contain a complete asymmetric unit
        # of indices, within the (d_max,d_min) region of interest and possibly outside the region.
        reference_millers = pickle.load(
            open(os.path.join(datadir, casetag + "_miller.pickle"), "rb"))
        experiment_manager = read_experiments(work_params)

        obs = pickle.load(
            open(os.path.join(datadir, casetag + "_observation.pickle"), "rb"))
        print "Read in %d observations" % (len(obs["observed_intensity"]))
        reference_millers.show_summary(prefix="Miller index file ")

        print(len(obs["frame_lookup"]), len(obs["observed_intensity"]),
              flex.max(obs['miller_lookup']), flex.max(obs['frame_lookup']))
        max_frameno = flex.max(obs["frame_lookup"])

        from iotbx import mtz
        mtz_object = mtz.object(file_name=work_params.scaling.mtz_file)
        #for array in mtz_object.as_miller_arrays():
        #  this_label = array.info().label_string()
        #  print this_label, array.observation_type()
        I_sim = mtz_object.as_miller_arrays()[0].as_intensity_array()
        I_sim.show_summary()
        MODEL_REINDEX_OP = work_params.model_reindex_op
        I_sim = I_sim.change_basis(MODEL_REINDEX_OP).map_to_asu()

        #match up isomorphous (the simulated fake F's) with experimental unique set
        matches = miller.match_multi_indices(
            miller_indices_unique=reference_millers.indices(),
            miller_indices=I_sim.indices())

        print "original unique", len(reference_millers.indices())
        print "isomorphous set", len(I_sim.indices())
        print "pairs", len(matches.pairs())
        iso_data = flex.double(len(reference_millers.indices()))

        for pair in matches.pairs():
            iso_data[pair[0]] = I_sim.data()[pair[1]]

        reference_data = miller.array(miller_set=reference_millers,
                                      data=iso_data)
        reference_data.set_observation_type_xray_intensity()

        FOBS = prepare_observations_for_scaling(
            work_params,
            obs=obs,
            reference_intensities=reference_data,
            files=experiment_manager.get_files(),
            half_data_flag=half_data_flag)

        I, I_visited, G, G_visited = I_and_G_base_estimate(FOBS,
                                                           params=work_params)
        print "I length", len(I), "G length", len(
            G), "(Reference set; entire asymmetric unit)"
        assert len(reference_data.data()) == len(I)

        #presumably these assertions fail when half data are taken for CC1/2 or d_min is cut
        model_I = reference_data.data()[0:len(I)]

        T = Timer("%d frames" % (len(G), ))

        mapper = mapper_factory(xscale6e)
        minimizer = mapper(I,
                           G,
                           I_visited,
                           G_visited,
                           FOBS,
                           params=work_params,
                           experiments=experiment_manager.get_experiments())

        del T
        minimizer.show_summary()

        Fit = minimizer.e_unpack()
        Gstats = flex.mean_and_variance(Fit["G"].select(G_visited == 1))
        print "G mean and standard deviation:", Gstats.mean(
        ), Gstats.unweighted_sample_standard_deviation()
        if "Bfactor" in work_params.levmar.parameter_flags:
            Bstats = flex.mean_and_variance(Fit["B"].select(G_visited == 1))
            print "B mean and standard deviation:", Bstats.mean(
            ), Bstats.unweighted_sample_standard_deviation()
        show_correlation(Fit["I"], model_I, I_visited, "Correlation of I:")
        Fit_stddev = minimizer.e_unpack_stddev()

        # XXX FIXME known bug:  the length of Fit["G"] could be smaller than the length of experiment_manager.get_files()
        # Not sure if this has any operational drawbacks.  It's a result of half-dataset selection.

        if plot:
            plot_it(Fit["I"], model_I, mode="I")
            if "Rxy" in work_params.levmar.parameter_flags:
                show_histogram(Fit["Ax"], "Histogram of x rotation (degrees)")
                show_histogram(Fit["Ay"], "Histogram of y rotation (degrees)")
        print()

        if esd_plot:
            minimizer.esd_plot()

        from cctbx.examples.merging.show_results import show_overall_observations
        table1, self.n_bins, self.d_min = show_overall_observations(
            Fit["I"],
            Fit_stddev["I"],
            I_visited,
            reference_data,
            FOBS,
            title="Statistics for all reflections",
            work_params=work_params)

        self.FSIM = FOBS
        self.ordered_intensities = reference_data
        self.reference_millers = reference_millers
        self.Fit_I = Fit["I"]
        self.Fit_I_stddev = Fit_stddev["I"]
        self.I_visited = I_visited
        self.Fit = Fit
        self.experiments = experiment_manager
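
The Gstats line shows the usual flex idiom of masking before taking statistics: select() with a boolean array restricts the data, here to frames actually visited by the minimizer. A sketch with toy data:

from scitbx.array_family import flex

g = flex.double([1.0, 2.0, 3.0, 4.0])
g_visited = flex.int([1, 0, 1, 1])
stats = flex.mean_and_variance(g.select(g_visited == 1))
print(stats.mean(), stats.unweighted_sample_standard_deviation())
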
Example #21
def run_correction_vector_plot(working_phil):

    L = lines(working_phil)
    for line in L.vectors():
        pass  # pull out the information, lines class does all the work

    close_x = flex.double()
    close_y = flex.double()
    far_x = flex.double()
    far_y = flex.double()
    master_coords = L.master_coords
    master_cv = L.master_cv
    master_tiles = L.master_tiles
    for idx in range(0, len(master_coords), 10):
        if matrix.col(
                master_cv[idx]).length() < L.tile_rmsd[master_tiles[idx]]:
            pass
            #close_x.append(master_coords[idx][0])
            #close_y.append(master_coords[idx][1])
        else:
            far_x.append(master_coords[idx][0])
            far_y.append(master_coords[idx][1])
            close_x.append(master_coords[idx][0] + master_cv[idx][0])
            close_y.append(master_coords[idx][1] + master_cv[idx][1])
    if working_phil.show_plots is True:
        from matplotlib import pyplot as plt
        plt.plot(close_x, close_y, "r.")
        plt.plot(far_x, far_y, "g.")
        plt.axes().set_aspect("equal")
        plt.show()

    sort_radii = flex.sort_permutation(flex.double(L.radii))
    tile_rmsds = flex.double()
    radial_sigmas = flex.double(64)
    tangen_sigmas = flex.double(64)
    for idx in range(64):
        x = sort_radii[idx]
        print(
            "Tile %2d: radius %7.2f, %6d observations, delx %5.2f  dely %5.2f, rmsd = %5.2f"
            % (x, L.radii[x], L.tilecounts[x], L.mean_cv[x][0],
               L.mean_cv[x][1], L.tile_rmsd[x]),
            end=' ')
        if L.tilecounts[x] < 3:
            print()
            radial = (0, 0)
            tangential = (0, 0)
            rmean, tmean, rsigma, tsigma = (0, 0, 1, 1)
        else:
            wtaveg = L.weighted_average_angle_deg_from_tile(x)
            print("Tile rotation %6.2f deg" % wtaveg, end=' ')
            radial, tangential, rmean, tmean, rsigma, tsigma = get_radial_tangential_vectors(
                L, x)
            print("%6.2f %6.2f" % (rsigma, tsigma))
        radial_sigmas[x] = rsigma
        tangen_sigmas[x] = tsigma
    rstats = flex.mean_and_variance(radial_sigmas, L.tilecounts.as_double())
    tstats = flex.mean_and_variance(tangen_sigmas, L.tilecounts.as_double())

    print(
        "\nOverall                 %8d observations, delx %5.2f  dely %5.2f, rmsd = %5.2f"
        % (L.overall_N, L.overall_cv[0], L.overall_cv[1], L.overall_rmsd))
    print("Average tile rmsd %5.2f" % flex.mean(flex.double(L.tile_rmsd)))
    print("Average tile displacement %5.2f" %
          (flex.mean(flex.double([matrix.col(cv).length()
                                  for cv in L.mean_cv]))))
    print("Weighted average radial sigma %6.2f" % rstats.mean())
    print("Weighted average tangential sigma %6.2f" % tstats.mean())

    if working_phil.show_plots is True:
        plt.plot([(L.tiles[4 * x + 0] + L.tiles[4 * x + 2]) / 2.
                  for x in range(64)],
                 [(L.tiles[4 * x + 1] + L.tiles[4 * x + 3]) / 2.
                  for x in range(64)], "go")
        for x in range(64):
            plt.text(10 + (L.tiles[4 * x + 0] + L.tiles[4 * x + 2]) / 2.,
                     10 + (L.tiles[4 * x + 1] + L.tiles[4 * x + 3]) / 2.,
                     "%d" % x)
        plt.show()

        for idx in range(64):
            x = sort_radii[idx]
            print(
                "Tile %2d: radius %7.2f, %6d observations, delx %5.2f  dely %5.2f, rmsd = %5.2f"
                % (x, L.radii[x], L.tilecounts[x], L.mean_cv[x][0],
                   L.mean_cv[x][1], L.tile_rmsd[x]),
                end=' ')
            if L.tilecounts[x] < 3:
                print()
                radial = (0, 0)
                tangential = (0, 0)
                rmean, tmean, rsigma, tsigma = (0, 0, 1, 1)
            else:
                wtaveg = L.weighted_average_angle_deg_from_tile(x)
                print("Tile rotation %6.2f deg" % wtaveg, end=' ')
                radial, tangential, rmean, tmean, rsigma, tsigma = get_radial_tangential_vectors(
                    L, x)
                print("%6.2f %6.2f" % (rsigma, tsigma))

            if working_phil.colormap:
                from pylab import imshow, axes, colorbar, show
                import numpy

                xcv, ycv = get_correction_vector_xy(L, x)
                _min = min(min(xcv), min(ycv))
                _max = max(max(xcv), max(ycv))

                hist, xedges, yedges = numpy.histogram2d(xcv,
                                                         ycv,
                                                         bins=40,
                                                         range=[[_min, _max],
                                                                [_min, _max]])
                extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

                imshow(hist.T,
                       extent=extent,
                       interpolation='nearest',
                       origin='lower')
                from matplotlib.patches import Ellipse
                ell = Ellipse(xy=(L.mean_cv[x][0], L.mean_cv[x][1]),
                              width=2. * rsigma,
                              height=2. * tsigma,
                              angle=math.atan2(-(radial[1]), -(radial[0])) *
                              180. / math.pi,
                              edgecolor="y",
                              linewidth=2,
                              fill=False,
                              zorder=100)
                axes().add_artist(ell)
                colorbar()
                show()

            else:
                from matplotlib import pyplot as plt
                xcv, ycv = get_correction_vector_xy(L, x)
                if len(xcv) == 0 or len(ycv) == 0: continue
                plt.plot(xcv, ycv, "r.")
                plt.plot([L.mean_cv[x][0]], [L.mean_cv[x][1]], "go")
                plt.plot([L.mean_cv[x][0] + radial[0]],
                         [L.mean_cv[x][1] + radial[1]], "yo")
                plt.plot([L.mean_cv[x][0] + tangential[0]],
                         [L.mean_cv[x][1] + tangential[1]], "bo")
                from matplotlib.patches import Ellipse
                ell = Ellipse(xy=(L.mean_cv[x][0], L.mean_cv[x][1]),
                              width=2. * rsigma,
                              height=2. * tsigma,
                              angle=math.atan2(-(radial[1]), -(radial[0])) *
                              180. / math.pi,
                              edgecolor="y",
                              linewidth=2,
                              fill=False,
                              zorder=100)
                plt.axes().add_artist(ell)
                plt.axes().set_aspect("equal")
                _min = min(min(xcv), min(ycv))
                _max = max(max(xcv), max(ycv))
                plt.axes().set_xlim(_min, _max)
                plt.axes().set_ylim(_min, _max)
                plt.show()
Example #22
 def _normalised_delta_cc_i(self):
     mav = flex.mean_and_variance(self.delta_cc)
     return (self.delta_cc -
             mav.mean()) / mav.unweighted_sample_standard_deviation()
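
The same two accessors give a one-line z-score normalisation for any flex.double, which is all _normalised_delta_cc_i does. A sketch with made-up values:

from scitbx.array_family import flex

delta_cc = flex.double([0.01, 0.02, -0.05, 0.00])
mav = flex.mean_and_variance(delta_cc)
z = (delta_cc - mav.mean()) / mav.unweighted_sample_standard_deviation()
print(list(z))
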
Example #23
    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        print('finished Dij, now calculating rho_i and density')
        from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
        R = RL(distance_matrix=self.Dij, d_c=self.d_c)
        #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
        #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
        if not hasattr(self, 'strategy'):
            self.strategy = 'default'
        self.rho = rho = R.get_rho()
        ave_rho = flex.mean(rho.as_double())
        NN = self.Dij.focus()[0]
        i_max = flex.max_index(rho)
        delta_i_max = flex.max(
            flex.double([self.Dij[i_max, j] for j in range(NN)]))
        rho_order = flex.sort_permutation(rho, reverse=True)
        rho_order_list = list(rho_order)
        self.delta = delta = R.get_delta(rho_order=rho_order,
                                         delta_i_max=delta_i_max)
        cluster_id = flex.int(NN, -1)  # -1 means no cluster
        delta_order = flex.sort_permutation(delta, reverse=True)
        MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
        n_cluster = 0
        #
        #
        print('Z_DELTA = ', self.Z_delta)

        pick_top_solution = False
        rho_stdev = flex.mean_and_variance(
            rho.as_double()).unweighted_sample_standard_deviation()
        delta_stdev = flex.mean_and_variance(
            delta).unweighted_sample_standard_deviation()
        if rho_stdev != 0.0 and delta_stdev != 0:
            rho_z = (rho.as_double() -
                     flex.mean(rho.as_double())) / (rho_stdev)
            delta_z = (delta - flex.mean(delta)) / (delta_stdev)
        else:
            pick_top_solution = True
            if rho_stdev == 0.0:
                centroids = [flex.first_index(delta, flex.max(delta))]
            elif delta_stdev == 0.0:
                centroids = [flex.first_index(rho, flex.max(rho))]

        significant_delta = []
        significant_rho = []
        # Define strategy to decide cluster center here. Only one should be true
        debug_fix_clustering = True
        strategy2 = False
        strategy3 = False
        if self.strategy == 'one_cluster':
            debug_fix_clustering = False
            strategy2 = True
        if self.strategy == 'strategy_3':
            debug_fix_clustering = False
            strategy3 = True

        if debug_fix_clustering:
            if not pick_top_solution:
                delta_z_cutoff = min(1.0, max(delta_z))
                rho_z_cutoff = min(1.0, max(rho_z))
                for ic in range(NN):
                    # test the density & rho
                    if delta_z[ic] >= delta_z_cutoff or delta_z[
                            ic] <= -delta_z_cutoff:
                        significant_delta.append(ic)
                    if rho_z[ic] >= rho_z_cutoff or rho_z[ic] <= -rho_z_cutoff:
                        significant_rho.append(ic)
                if True:
                    # Use idea quoted in Rodriguez Laio 2014 paper
                    # " Thus, cluster centers are recognized as points for which the value of delta is anomalously large."
                    centroid_candidates = list(significant_delta)
                    candidate_delta_z = flex.double()
                    for ic in centroid_candidates:
                        if ic == rho_order[0]:
                            delta_z_of_rho_order_0 = delta_z[ic]
                        candidate_delta_z.append(delta_z[ic])
                    i_sorted = flex.sort_permutation(candidate_delta_z,
                                                     reverse=True)
                    # Check that once sorted the top one is not equal to the 2nd or 3rd position
                    # If there is a tie, assign centroid to the first one in rho order
                    centroids = []
                    # rho_order[0] has to be a centroid
                    centroids.append(rho_order[0])

                    #centroids.append(centroid_candidates[i_sorted[0]])
                    for i in range(len(i_sorted)):
                        if centroid_candidates[i_sorted[i]] == rho_order[0]:
                            continue
                        if delta_z_of_rho_order_0 - candidate_delta_z[
                                i_sorted[i]] > 1.0:
                            if i > 1:
                                if -candidate_delta_z[i_sorted[
                                        i - 1]] + candidate_delta_z[
                                            i_sorted[0]] > 1.0:
                                    centroids.append(
                                        centroid_candidates[i_sorted[i]])
                            else:
                                centroids.append(
                                    centroid_candidates[i_sorted[i]])
                        else:
                            break
                if False:  # alternative heuristic (disabled): intersect the significant delta and rho sets
                    centroid_candidates = list(
                        set(significant_delta).intersection(
                            set(significant_rho)))
                    # Now compare the relative orders of the max delta_z and max rho_z to make sure they are within 1 stdev
                    centroids = []
                    max_delta_z_candidates = -999.9
                    max_rho_z_candidates = -999.9
                    for ic in centroid_candidates:
                        if delta_z[ic] > max_delta_z_candidates:
                            max_delta_z_candidates = delta_z[ic]
                        if rho_z[ic] > max_rho_z_candidates:
                            max_rho_z_candidates = rho_z[ic]
                    for ic in centroid_candidates:
                        if max_delta_z_candidates - delta_z[
                                ic] < 1.0 and max_rho_z_candidates - rho_z[
                                    ic] < 1.0:
                            centroids.append(ic)

            #item_idxs = [delta_order[ic] for ic,centroid in enumerate(centroids)]
            item_idxs = centroids
            for item_idx in item_idxs:
                cluster_id[item_idx] = n_cluster
                print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
                n_cluster += 1
        elif strategy2:
            # Go through the list of points and find the one with the highest joint rank in the rho and delta lists.
            # This assigns only one cluster center, based on the highest product of rho and delta.
            product_list_of_ranks = []
            for ic in range(NN):
                rho_tmp = self.rho[ic]
                delta_tmp = self.delta[ic]
                product_list_of_ranks.append(rho_tmp * delta_tmp)
            import numpy as np
            item_idx = np.argmax(product_list_of_ranks)
            cluster_id[item_idx] = n_cluster  # Only cluster assigned
            print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
            n_cluster += 1
        elif strategy3:
            # use product of delta and rho and pick out top candidates
            # have to use a significance z_score to filter out the very best
            product_list_of_ranks = flex.double()
            for ic in range(NN):
                rho_tmp = self.rho[ic]
                delta_tmp = self.delta[ic]
                product_list_of_ranks.append(rho_tmp * delta_tmp)
            import numpy as np
            iid_sorted = flex.sort_permutation(product_list_of_ranks,
                                               reverse=True)
            cluster_id[iid_sorted[0]] = n_cluster  # the top-ranked point is always a cluster center
            n_cluster += 1
            print('CLUSTERING_STATS S3', iid_sorted[0],
                  cluster_id[iid_sorted[0]])
            #product_list_of_ranks[iid_sorted[0]]=0.0 # set this to 0.0 so that the mean/stdev does not get biased by one point
            stdev = np.std(product_list_of_ranks)
            mean = np.mean(product_list_of_ranks)
            n_sorted = 3
            #if stdev == 0.0:
            #  n_sorted=1

            z_critical = 3.0  # 3-sigma significance cutoff
            # Only examine the top few data points;
            # realistically there will not be more than 2-3 lattices on an image.
            for iid in iid_sorted[1:n_sorted]:
                z_score = (product_list_of_ranks[iid] - mean) / stdev
                if z_score > z_critical:
                    cluster_id[iid] = n_cluster
                    n_cluster += 1
                    print('CLUSTERING_STATS S3', iid, cluster_id[iid])
                else:
                    break  # No point going over all points once below threshold z_score

        else:
            for ic in range(NN):
                item_idx = delta_order[ic]
                if ic != 0:
                    if delta[item_idx] <= 0.25 * delta[
                            delta_order[0]]:  # too low to be a medoid
                        continue
                item_rho_order = rho_order_list.index(item_idx)
                if item_rho_order / NN < MAX_PERCENTILE_RHO:
                    cluster_id[item_idx] = n_cluster
                    print('CLUSTERING_STATS', ic, item_idx, item_rho_order,
                          cluster_id[item_idx])
                    n_cluster += 1
        print('Found %d clusters' % n_cluster)
        for x in range(NN):
            if cluster_id[x] >= 0:
                print("XC", x, cluster_id[x], rho[x], delta[x])
        self.cluster_id_maxima = cluster_id.deep_copy()
        R.cluster_assignment(rho_order, cluster_id, rho)
        self.cluster_id_full = cluster_id.deep_copy()

        #halo = flex.bool(NN,False)
        #border = R.get_border( cluster_id = cluster_id )

        #for ic in range(n_cluster): #loop thru all border regions; find highest density
        #  this_border = (cluster_id == ic) & (border==True)
        #  if this_border.count(True)>0:
        #    highest_density = flex.max(rho.select(this_border))
        #    halo_selection = (rho < highest_density) & (this_border==True)
        #    if halo_selection.count(True)>0:
        #      cluster_id.set_selected(halo_selection,-1)
        #    core_selection = (cluster_id == ic) & ~halo_selection
        #    highest_density = flex.max(rho.select(core_selection))
        #    too_sparse = core_selection & (rho.as_double() < highest_density/10.) # another heuristic
        #    if too_sparse.count(True)>0:
        #      cluster_id.set_selected(too_sparse,-1)
        self.cluster_id_final = cluster_id.deep_copy()
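The strategy3 branch above filters candidate centers by the z-score of the rho*delta product. As a minimal, self-contained illustration of that rule in plain numpy (not the flex-based code above; the toy rho/delta values and the relaxed z_critical in the usage line are invented for the demo):

import numpy as np

def pick_cluster_centers(rho, delta, z_critical=3.0, n_top=3):
    """Sketch of the strategy3 rule: rank points by rho*delta, always keep the
    top-ranked point, then accept further points only while their z-score
    (against the mean/stdev of all products) stays above z_critical."""
    product = np.asarray(rho, dtype=float) * np.asarray(delta, dtype=float)
    order = np.argsort(product)[::-1]      # indices, descending by rho*delta
    centers = [int(order[0])]              # top-ranked point is always a center
    mean, stdev = product.mean(), product.std()
    if stdev == 0.0:
        return centers
    for idx in order[1:n_top]:
        z = (product[idx] - mean) / stdev
        if z > z_critical:
            centers.append(int(idx))
        else:
            break                          # once below threshold, stop scanning
    return centers

# toy usage: two dense, well-separated points stand out
rho = [9.0, 1.0, 1.1, 0.9, 8.5]
delta = [5.0, 0.2, 0.1, 0.2, 4.0]
print(pick_cluster_centers(rho, delta, z_critical=0.5))  # -> [0, 4]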
    def __init__(self, **kwargs):
        group_args.__init__(self, **kwargs)
        print('finished Dij, now calculating rho_i and density')
        from xfel.clustering import Rodriguez_Laio_clustering_2014 as RL
        R = RL(distance_matrix=self.Dij, d_c=self.d_c)
        #from IPython import embed; embed(); exit()
        #from clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
        #plot_with_dimensional_embedding(1-self.Dij/flex.max(self.Dij), show_plot=True)
        self.rho = rho = R.get_rho()
        ave_rho = flex.mean(rho.as_double())
        NN = self.Dij.focus()[0]
        i_max = flex.max_index(rho)
        delta_i_max = flex.max(
            flex.double([self.Dij[i_max, j] for j in range(NN)]))
        rho_order = flex.sort_permutation(rho, reverse=True)
        rho_order_list = list(rho_order)
        self.delta = delta = R.get_delta(rho_order=rho_order,
                                         delta_i_max=delta_i_max)
        cluster_id = flex.int(NN, -1)  # -1 means no cluster
        delta_order = flex.sort_permutation(delta, reverse=True)
        MAX_PERCENTILE_RHO = self.max_percentile_rho  # cluster centers have to be in the top percentile
        n_cluster = 0
        pick_top_solution = False
        rho_stdev = flex.mean_and_variance(
            rho.as_double()).unweighted_sample_standard_deviation()
        delta_stdev = flex.mean_and_variance(
            delta).unweighted_sample_standard_deviation()
        if rho_stdev != 0.0 and delta_stdev != 0.0:
            rho_z = (rho.as_double() -
                     flex.mean(rho.as_double())) / (rho_stdev)
            delta_z = (delta - flex.mean(delta)) / (delta_stdev)
        else:
            pick_top_solution = True
            if rho_stdev == 0.0:
                centroids = [flex.first_index(delta, flex.max(delta))]
            elif delta_stdev == 0.0:
                centroids = [flex.first_index(rho, flex.max(rho))]

        significant_delta = []
        significant_rho = []
        debug_fix_clustering = True
        if debug_fix_clustering:
            if not pick_top_solution:
                delta_z_cutoff = min(1.0, max(delta_z))
                rho_z_cutoff = min(1.0, max(rho_z))
                for ic in range(NN):
                    # test the density & rho
                    if delta_z[ic] >= delta_z_cutoff:
                        significant_delta.append(ic)
                    if rho_z[ic] >= rho_z_cutoff:
                        significant_rho.append(ic)
                centroid_candidates = list(
                    set(significant_delta).intersection(set(significant_rho)))
                # Now compare the relative orders of the max delta_z and max rho_z to make sure they are within 1 stdev
                centroids = []
                max_delta_z_candidates = -999.9
                max_rho_z_candidates = -999.9
                for ic in centroid_candidates:
                    if delta_z[ic] > max_delta_z_candidates:
                        max_delta_z_candidates = delta_z[ic]
                    if rho_z[ic] > max_rho_z_candidates:
                        max_rho_z_candidates = rho_z[ic]
                for ic in centroid_candidates:
                    if max_delta_z_candidates - delta_z[
                            ic] < 1.0 and max_rho_z_candidates - rho_z[
                                ic] < 1.0:
                        centroids.append(ic)

            # centroids already holds point indices; mapping them through
            # delta_order (as an earlier revision did) would be incorrect.
            item_idxs = centroids
            for item_idx in item_idxs:
                cluster_id[item_idx] = n_cluster
                print('CLUSTERING_STATS', item_idx, cluster_id[item_idx])
                n_cluster += 1
        else:
            for ic in range(NN):
                item_idx = delta_order[ic]
                if ic != 0:
                    if delta[item_idx] <= 0.25 * delta[
                            delta_order[0]]:  # too low to be a medoid
                        continue
                item_rho_order = rho_order_list.index(item_idx)
                if item_rho_order / NN < MAX_PERCENTILE_RHO:
                    cluster_id[item_idx] = n_cluster
                    print('CLUSTERING_STATS', ic, item_idx, item_rho_order,
                          cluster_id[item_idx])
                    n_cluster += 1
        print('Found %d clusters' % n_cluster)
        for x in range(NN):
            if cluster_id[x] >= 0:
                print("XC", x, cluster_id[x], rho[x], delta[x])
        self.cluster_id_maxima = cluster_id.deep_copy()
        R.cluster_assignment(rho_order, cluster_id)
        self.cluster_id_full = cluster_id.deep_copy()

        #halo = flex.bool(NN,False)
        #border = R.get_border( cluster_id = cluster_id )

        #for ic in range(n_cluster): #loop thru all border regions; find highest density
        #  this_border = (cluster_id == ic) & (border==True)
        #  if this_border.count(True)>0:
        #    highest_density = flex.max(rho.select(this_border))
        #    halo_selection = (rho < highest_density) & (this_border==True)
        #    if halo_selection.count(True)>0:
        #      cluster_id.set_selected(halo_selection,-1)
        #    core_selection = (cluster_id == ic) & ~halo_selection
        #    highest_density = flex.max(rho.select(core_selection))
        #    too_sparse = core_selection & (rho.as_double() < highest_density/10.) # another heuristic
        #    if too_sparse.count(True)>0:
        #      cluster_id.set_selected(too_sparse,-1)
        self.cluster_id_final = cluster_id.deep_copy()
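For reference, here is a compact numpy sketch of the decision-graph quantities that R.get_rho() and R.get_delta() supply above, following the Rodriguez-Laio (2014) definitions. This is illustrative only; the production implementation lives in xfel.clustering.Rodriguez_Laio_clustering_2014 and, unlike this sketch, breaks rho ties via rho order:

import numpy as np

def rho_delta(Dij, d_c):
    """rho_i = number of neighbours within the cutoff distance d_c;
    delta_i = distance to the nearest point of higher density
    (for the densest point, the largest distance to any point)."""
    NN = Dij.shape[0]
    rho = (Dij < d_c).sum(axis=1) - 1          # subtract 1 to exclude self-distance
    delta = np.empty(NN)
    for i in range(NN):
        higher = np.where(rho > rho[i])[0]     # note: rho ties are not broken here
        if higher.size:
            delta[i] = Dij[i, higher].min()
        else:
            delta[i] = Dij[i].max()            # convention for the densest point
    return rho, delta

# toy usage on a 1-D point set with two well-separated groups
pts = np.array([0.0, 0.1, 0.2, 5.0, 5.1])
Dij = np.abs(pts[:, None] - pts[None, :])
print(rho_delta(Dij, d_c=0.5))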
Exemple #26
0
    def _compute_normalised_delta_ccs(self):
        mav = flex.mean_and_variance(self.delta_cc_half)
        return (self.delta_cc_half -
                mav.mean()) / mav.unweighted_sample_standard_deviation()
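The same normalisation written with plain numpy, for reference; the delta_cc_half values below are invented example data, and ddof=1 mirrors unweighted_sample_standard_deviation (sample, not population, standard deviation):

import numpy as np

delta_cc_half = np.array([0.01, -0.20, 0.02, 0.00, 0.03])  # example values only
z = (delta_cc_half - delta_cc_half.mean()) / delta_cc_half.std(ddof=1)
print(z)  # the -0.20 dataset stands out as the clear negative outlier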
def run_correction_vector_plot(working_phil):

  L = lines(working_phil)
  for line in L.vectors():
    pass # pull out the information, lines class does all the work

  close_x = flex.double()
  close_y = flex.double()
  far_x = flex.double()
  far_y = flex.double()
  master_coords = L.master_coords
  master_cv = L.master_cv
  master_tiles = L.master_tiles
  for idx in xrange(0,len(master_coords),10):
    if matrix.col(master_cv[idx]).length() < L.tile_rmsd[ master_tiles[idx] ]:
      pass
      #close_x.append(master_coords[idx][0])
      #close_y.append(master_coords[idx][1])
    else:
      far_x.append(master_coords[idx][0])
      far_y.append(master_coords[idx][1])
      close_x.append(master_coords[idx][0]+master_cv[idx][0])
      close_y.append(master_coords[idx][1]+master_cv[idx][1])
  if working_phil.show_plots is True:
    from matplotlib import pyplot as plt
    plt.plot(close_x,close_y,"r.")
    plt.plot(far_x,far_y,"g.")
    plt.axes().set_aspect("equal")
    plt.show()


  sort_radii = flex.sort_permutation(flex.double(L.radii))
  tile_rmsds = flex.double()
  radial_sigmas = flex.double(64)
  tangen_sigmas = flex.double(64)
  for idx in xrange(64):
    x = sort_radii[idx]
    print "Tile %2d: radius %7.2f, %6d observations, delx %5.2f  dely %5.2f, rmsd = %5.2f"%(
      x, L.radii[x], L.tilecounts[x], L.mean_cv[x][0], L.mean_cv[x][1],
      L.tile_rmsd[x]
        ),
    if L.tilecounts[x] < 3:
      print
      radial = (0,0)
      tangential = (0,0)
      rmean,tmean,rsigma,tsigma=(0,0,1,1)
    else:
      wtaveg = L.weighted_average_angle_deg_from_tile(x)
      print "Tile rotation %6.2f deg"%wtaveg,
      radial,tangential,rmean,tmean,rsigma,tsigma = get_radial_tangential_vectors(L,x)
      print "%6.2f %6.2f"%(rsigma,tsigma)
    radial_sigmas[x]=rsigma
    tangen_sigmas[x]=tsigma
  rstats = flex.mean_and_variance(radial_sigmas,L.tilecounts.as_double())
  tstats = flex.mean_and_variance(tangen_sigmas,L.tilecounts.as_double())

  print "\nOverall                 %8d observations, delx %5.2f  dely %5.2f, rmsd = %5.2f"%(
      L.overall_N, L.overall_cv[0], L.overall_cv[1], L.overall_rmsd)
  print "Average tile rmsd %5.2f"%flex.mean(flex.double(L.tile_rmsd))
  print "Average tile displacement %5.2f"%(flex.mean(
    flex.double([matrix.col(cv).length() for cv in L.mean_cv])))
  print "Weighted average radial sigma %6.2f"%rstats.mean()
  print "Weighted average tangential sigma %6.2f"%tstats.mean()

  if working_phil.show_plots is True:
    plt.plot([(L.tiles[4*x+0]+L.tiles[4*x+2])/2. for x in xrange(64)],[(L.tiles[4*x+1]+L.tiles[4*x+3])/2. for x in xrange(64)],"go")
    for x in xrange(64):
      plt.text(10+(L.tiles[4*x+0]+L.tiles[4*x+2])/2.,10+(L.tiles[4*x+1]+L.tiles[4*x+3])/2.,"%d"%x)
    plt.show()

    for idx in xrange(64):
      x = sort_radii[idx]
      print "Tile %2d: radius %7.2f, %6d observations, delx %5.2f  dely %5.2f, rmsd = %5.2f"%(
        x, L.radii[x], L.tilecounts[x], L.mean_cv[x][0], L.mean_cv[x][1],
        L.tile_rmsd[x]
        ),
      if L.tilecounts[x] < 3:
        print
        radial = (0,0)
        tangential = (0,0)
        rmean,tmean,rsigma,tsigma=(0,0,1,1)
      else:
        wtaveg = L.weighted_average_angle_deg_from_tile(x)
        print "Tile rotation %6.2f deg"%wtaveg,
        radial,tangential,rmean,tmean,rsigma,tsigma = get_radial_tangential_vectors(L,x)
        print "%6.2f %6.2f"%(rsigma,tsigma)

      if working_phil.colormap:
        from pylab import imshow, axes, colorbar, show
        import numpy

        xcv,ycv = get_correction_vector_xy(L,x)
        _min = min(min(xcv),min(ycv))
        _max = max(max(xcv),max(ycv))

        hist,xedges,yedges = numpy.histogram2d(xcv,ycv,bins=40,range=[[_min,_max],[_min,_max]])
        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1] ]

        imshow(hist.T,extent=extent,interpolation='nearest',origin='lower')
        from matplotlib.patches import Ellipse
        ell = Ellipse(xy=(L.mean_cv[x][0],L.mean_cv[x][1]),
                      width=2.*rsigma, height=2.*tsigma,
                      angle=math.atan2(-(radial[1]),-(radial[0]))*180./math.pi,
                      edgecolor="y", linewidth=2, fill=False, zorder=100)
        axes().add_artist(ell)
        colorbar()
        show()

      else:
        from matplotlib import pyplot as plt
        xcv,ycv = get_correction_vector_xy(L,x)
        if len(xcv)==0 or len(ycv)==0: continue
        plt.plot(xcv,ycv,"r.")
        plt.plot([L.mean_cv[x][0]],[L.mean_cv[x][1]],"go")
        plt.plot([L.mean_cv[x][0]+radial[0]],[L.mean_cv[x][1]+radial[1]],"yo")
        plt.plot([L.mean_cv[x][0]+tangential[0]],[L.mean_cv[x][1]+tangential[1]],"bo")
        from matplotlib.patches import Ellipse
        ell = Ellipse(xy=(L.mean_cv[x][0],L.mean_cv[x][1]),
                      width=2.*rsigma, height=2.*tsigma,
                      angle=math.atan2(-(radial[1]),-(radial[0]))*180./math.pi,
                      edgecolor="y", linewidth=2, fill=False, zorder=100)
        plt.axes().add_artist(ell)
        plt.axes().set_aspect("equal")
        _min = min(min(xcv),min(ycv))
        _max = max(max(xcv),max(ycv))
        plt.axes().set_xlim(_min,_max)
        plt.axes().set_ylim(_min,_max)
        plt.show()
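get_radial_tangential_vectors (defined elsewhere in this module) supplies the radial/tangential sigmas printed and plotted above. A hedged sketch of the underlying decomposition, assuming the radial direction runs from the beam centre to the tile centre; the helper name and arguments here are illustrative, not the module's API:

import math

def radial_tangential_sigmas(cv_list, tile_center, beam_center=(0.0, 0.0)):
    """Project correction vectors onto the radial unit vector (beam centre ->
    tile centre) and its perpendicular, then return the sample standard
    deviation of each component. Sketch only; the production helper also
    returns the mean vectors."""
    rx = tile_center[0] - beam_center[0]
    ry = tile_center[1] - beam_center[1]
    norm = math.hypot(rx, ry)
    ur = (rx / norm, ry / norm)        # radial unit vector
    ut = (-ur[1], ur[0])               # tangential unit vector (90 deg rotation)
    r_comp = [cv[0] * ur[0] + cv[1] * ur[1] for cv in cv_list]
    t_comp = [cv[0] * ut[0] + cv[1] * ut[1] for cv in cv_list]
    def sigma(vals):
        m = sum(vals) / len(vals)
        return math.sqrt(sum((v - m) ** 2 for v in vals) / (len(vals) - 1))
    return sigma(r_comp), sigma(t_comp)

# toy usage: vectors scattered mostly along the radial direction
print(radial_tangential_sigmas([(1.0, 0.1), (0.8, -0.1), (1.2, 0.0)],
                               tile_center=(100.0, 0.0)))  # -> (0.2, 0.1)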
def get_uc_consensus(experiments_list,
                     show_plot=False,
                     return_only_first_indexed_model=False,
                     finalize_method=None,
                     clustering_params=None):
    '''
    Uses the Rodriguez-Laio 2014 method to cluster the unit cells and then votes for the
    highest-consensus unit cell. Input needs to be a list of experiments objects.
    Clustering code taken from github.com/cctbx-xfel/cluster_regression
    Returns the crystal model(s) from the cluster with the most points.
    '''
    if return_only_first_indexed_model:
        return [experiments_list[0].crystals()[0]], None
    cells = []
    from xfel.clustering.singleframe import CellOnlyFrame
    save_plot = False
    # Flag for testing Lysozyme data from NKS. Make sure the cluster_regression repository is present and configured.
    # Program will exit after plots are displayed if this flag is true.
    test_nks = False
    if test_nks:
        from cctbx import crystal
        import libtbx.load_env
        cluster_regression = libtbx.env.find_in_repositories(
            relative_path="cluster_regression", test=os.path.isdir)
        file_name = os.path.join(cluster_regression, 'examples',
                                 'lysozyme1341.txt')
        for line in open(file_name, "r"):
            tokens = line.strip().split()
            unit_cell = tuple(float(x) for x in tokens[0:6])
            space_group_symbol = tokens[6]
            crystal_symmetry = crystal.symmetry(
                unit_cell=unit_cell, space_group_symbol=space_group_symbol)
            cells.append(CellOnlyFrame(crystal_symmetry))
    else:
        for experiment in experiments_list:
            if len(experiment.crystals()) > 1:
                print('IOTA: Should have only one crystal model')
            crystal_symmetry = experiment.crystals()[0].get_crystal_symmetry()
            cells.append(CellOnlyFrame(crystal_symmetry))
    MM = [c.mm for c in cells]  # metrical matrices
    MM_double = flex.double()
    for i in range(len(MM)):
        Tup = MM[i]
        for j in range(6):
            MM_double.append(Tup[j])
    print('There are %d cells' % len(MM))
    coord_x = flex.double([c.uc[0] for c in cells])
    coord_y = flex.double([c.uc[1] for c in cells])
    if show_plot or save_plot:
        import matplotlib
        if not show_plot:
            matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        #from IPython import embed; embed(); exit()
        plt.plot([c.uc[0] for c in cells], [c.uc[1] for c in cells],
                 "k.",
                 markersize=3.)
        plt.axes().set_aspect("equal")
    if save_plot:
        plot_name = 'uc_cluster.png'
        plt.savefig(plot_name,
                    size_inches=(10, 10),
                    dpi=300,
                    bbox_inches='tight')
    if show_plot:
        plt.show()
    print('Now constructing a Dij matrix: Starting Unit Cell clustering')
    NN = len(MM)
    from cctbx.uctbx.determine_unit_cell import NCDist_flatten
    Dij = NCDist_flatten(MM_double)
    d_c = flex.mean_and_variance(
        Dij.as_1d()).unweighted_sample_standard_deviation()  # previously hard-coded to 6.13
    # FIXME should be a PHIL param
    if len(cells) < 5:
        return [experiments_list[0].crystals()[0]], None
    CM = clustering_manager(Dij=Dij, d_c=d_c, max_percentile_rho=0.95)
    n_cluster = 1 + flex.max(CM.cluster_id_final)
    print(len(cells), ' datapoints have been analyzed')
    print('%d CLUSTERS' % n_cluster)
    for i in range(n_cluster):
        item = flex.first_index(CM.cluster_id_maxima, i)
        print('Cluster %d central Unit cell = %d' % (i, item))
        cells[item].crystal_symmetry.show_summary()

    # More plots for debugging
    appcolors = [
        'b', 'r', '#ff7f0e', '#2ca02c', '#9467bd', '#8c564b', '#e377c2',
        '#7f7f7f', '#bcbd22', '#17becf'
    ]
    if show_plot:
        # Decision graph
        import matplotlib.pyplot as plt
        plt.plot(CM.rho, CM.delta, "r.", markersize=3.)
        for x in range(NN):
            if CM.cluster_id_maxima[x] >= 0:
                plt.plot([CM.rho[x]], [CM.delta[x]], "ro")
        plt.show()

    if show_plot:
        import matplotlib.pyplot as plt
        colors = [appcolors[i % 10] for i in CM.cluster_id_full]
        plt.scatter(coord_x,
                    coord_y,
                    marker='o',
                    color=colors,
                    linewidth=0.4,
                    edgecolor='k')
        for i in range(n_cluster):
            item = flex.first_index(CM.cluster_id_maxima, i)
            plt.plot([cells[item].uc[0]], cells[item].uc[1], 'y.')
        plt.axes().set_aspect("equal")
        plt.show()
    if test_nks:
        exit()

    # Now look at each unit cell cluster for orientational clustering.
    # The idea is to cluster the orientational component within each unit cell cluster.
    do_orientational_clustering = not return_only_first_indexed_model  # temporary.
    dxtbx_crystal_models = []
    if do_orientational_clustering:
        print('IOTA: Starting orientational clustering')
        Dij_ori = {}  # dictionary to store Dij for each cluster
        uc_experiments_list = {}  # dictionary to store experiments lists for each cluster
        from collections import Counter
        uc_cluster_count = Counter(list(CM.cluster_id_final))
        # instantiate the Dij_ori flat 1-d array
        # Put all experiments list from same uc cluster together
        from scitbx.matrix import sqr
        from cctbx_orientation_ext import crystal_orientation
        #crystal_orientation_list = []
        #for i in range(len(experiments_list)):
        #  crystal_orientation_list.append(crystal_orientation(experiments_list[i].crystals()[0].get_A(), True))
        #from IPython import embed; embed(); exit()
        #A_direct = sqr(crystal_orientation_list[i].reciprocal_matrix()).transpose().inverse()
        #print ("Direct A matrix 1st element = %12.6f"%A_direct[0])
        for i in range(len(experiments_list)):
            if CM.cluster_id_full[i] not in uc_experiments_list:
                uc_experiments_list[CM.cluster_id_full[i]] = []
            uc_experiments_list[CM.cluster_id_full[i]].append(
                experiments_list[i])
        for cluster in uc_cluster_count:
            # Make sure there are at least a minimum number of samples in the cluster
            if uc_cluster_count[cluster] < 5:
                continue
            Dij_ori[cluster] = flex.double(
                [[0.0] * uc_cluster_count[cluster]] *
                uc_cluster_count[cluster])
            # Now populate the Dij_ori array
            N_samples_in_cluster = len(uc_experiments_list[cluster])
            for i in range(N_samples_in_cluster - 1):
                for j in range(i + 1, N_samples_in_cluster):
                    dij_ori = get_dij_ori(
                        uc_experiments_list[cluster][i].crystals()[0],
                        uc_experiments_list[cluster][j].crystals()[0])
                    Dij_ori[cluster][N_samples_in_cluster * i + j] = dij_ori
                    Dij_ori[cluster][N_samples_in_cluster * j + i] = dij_ori

        # Now do the orientational cluster analysis
        #from IPython import embed; embed(); exit()
        d_c_ori = 0.13
        from exafel_project.ADSE13_25.clustering.plot_with_dimensional_embedding import plot_with_dimensional_embedding
        #plot_with_dimensional_embedding(1-Dij_ori[1]/flex.max(Dij_ori[1]), show_plot=True)
        for cluster in Dij_ori:
            d_c_ori = flex.mean_and_variance(Dij_ori[cluster].as_1d(
            )).unweighted_sample_standard_deviation()
            CM_ori = clustering_manager(Dij=Dij_ori[cluster],
                                        d_c=d_c_ori,
                                        max_percentile_rho=0.85)
            n_cluster_ori = 1 + flex.max(CM_ori.cluster_id_final)
            #from IPython import embed; embed()
            #FIXME should be a PHIL param
            for i in range(n_cluster_ori):
                if len([zz for zz in CM_ori.cluster_id_final if zz == i]) < 5:
                    continue
                item = flex.first_index(CM_ori.cluster_id_maxima, i)
                dxtbx_crystal_model = uc_experiments_list[cluster][
                    item].crystals()[0]
                dxtbx_crystal_models.append(dxtbx_crystal_model)
                from scitbx.matrix import sqr
                from cctbx_orientation_ext import crystal_orientation
                # use a distinct local name to avoid shadowing the imported class
                co = crystal_orientation(dxtbx_crystal_model.get_A(), True)
                A_direct = sqr(co.reciprocal_matrix()).transpose().inverse()
                print(
                    "IOTA: Direct A matrix 1st element of orientational cluster %d  = %12.6f"
                    % (i, A_direct[0]))
            if show_plot:
                # Decision graph
                stretch_plot_factor = 1.05  # (1+fraction of limits by which xlim,ylim should be set)
                import matplotlib.pyplot as plt
                plt.plot(CM_ori.rho, CM_ori.delta, "r.", markersize=3.)
                for x in range(len(list(CM_ori.cluster_id_final))):
                    if CM_ori.cluster_id_maxima[x] >= 0:
                        plt.plot([CM_ori.rho[x]], [CM_ori.delta[x]], "ro")
                #from IPython import embed; embed(); exit()
                plt.xlim([-10, stretch_plot_factor * flex.max(CM_ori.rho)])
                plt.ylim([-10, stretch_plot_factor * flex.max(CM_ori.delta)])
                plt.show()
    # Make sure the crystal models are not too close to each other
    # FIXME should be a PHIL
    min_angle = 5.0  # taken from indexer.py
    close_models_list = []
    if len(dxtbx_crystal_models) > 1:
        from dials.algorithms.indexing.compare_orientation_matrices import difference_rotation_matrix_axis_angle
        for i_a in range(0, len(dxtbx_crystal_models) - 1):
            # start at i_a + 1: comparing a model with itself gives angle 0,
            # which would mark every model as "close" and prune them all
            for i_b in range(i_a + 1, len(dxtbx_crystal_models)):
                cryst_a = dxtbx_crystal_models[i_a]
                cryst_b = dxtbx_crystal_models[i_b]
                R_ab, axis, angle, cb_op_ab = difference_rotation_matrix_axis_angle(
                    cryst_a, cryst_b)
                # FIXME
                if abs(angle) < min_angle:  # degrees
                    close_models_list.append((i_a, i_b))

    # Now prune the dxtbx_crystal_models list
    for close_models in close_models_list:
        i_a, i_b = close_models
        if dxtbx_crystal_models[i_a] is not None and dxtbx_crystal_models[
                i_b] is not None:
            dxtbx_crystal_models[i_a] = None

    dxtbx_crystal_models = [x for x in dxtbx_crystal_models if x is not None]
    if len(dxtbx_crystal_models) > 0:
        return dxtbx_crystal_models, None
    else:
        # If nothing works, at least return the 1st crystal model that was found
        return [experiments_list[0].crystals()[0]], None
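A hedged usage sketch of get_uc_consensus: assuming each indexing result has been saved as an experiments JSON file, the function could be driven as below. The file names are hypothetical; only the get_uc_consensus signature comes from the code above, and ExperimentListFactory is the standard dxtbx reader:

from dxtbx.model.experiment_list import ExperimentListFactory

# each entry is one indexing result: an ExperimentList holding one crystal model
experiments_list = [
    ExperimentListFactory.from_json_file(path, check_format=False)
    for path in ['expt_0.json', 'expt_1.json']  # hypothetical file names
]
consensus_crystals, _ = get_uc_consensus(experiments_list,
                                         show_plot=False,
                                         finalize_method=None,
                                         clustering_params=None)
print('%d consensus crystal model(s) found' % len(consensus_crystals))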