Example #1
def npp(hklin):
    from iotbx.reflection_file_reader import any_reflection_file
    from xia2.Toolkit.NPP import npp_ify
    from scitbx.array_family import flex
    import math
    import sys
    reader = any_reflection_file(hklin)
    mtz_object = reader.file_content()
    intensities = [
        ma for ma in reader.as_miller_arrays(merge_equivalents=False)
        if ma.info().labels == ['I', 'SIGI']
    ][0]
    indices = intensities.indices()

    # merging: use external variance i.e. variances derived from SIGI column
    merger = intensities.merge_equivalents(use_internal_variance=False)
    mult = merger.redundancies().data()
    imean = merger.array()
    unique = imean.indices()
    iobs = imean.data()
    # scale up variance to account for sqrt(multiplicity) effective scaling
    variobs = (imean.sigmas()**2) * mult.as_double()

    all = flex.double()
    cen = flex.double()

    for hkl, i, v, m in zip(unique, iobs, variobs, mult):

        # only consider if meaningful number of observations
        if m < 3:
            continue

        sel = indices == hkl
        data = intensities.select(sel).data()

        assert (m == len(data))

        _x, _y = npp_ify(data, input_mean_variance=(i, v))

        # perform linreg on (i) all data and (ii) subset between +/- 2 sigma

        sel = (flex.abs(_x) < 2)
        _x_ = _x.select(sel)
        _y_ = _y.select(sel)

        fit_all = flex.linear_regression(_x, _y)
        fit_cen = flex.linear_regression(_x_, _y_)

        all.append(fit_all.slope())
        cen.append(fit_cen.slope())

        print('%3d %3d %3d' % hkl, '%.2f %.2f %.2f' % (i, v, i / math.sqrt(v)),
              '%.2f %.2f' % (fit_all.slope(), fit_cen.slope()), '%d' % m)

    sys.stderr.write('Mean gradients: %.2f %.2f\n' %
                     (flex.sum(all) / all.size(), flex.sum(cen) / cen.size()))
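All of the snippets on this page revolve around the same small API: flex.linear_regression(x, y) takes two flex.double arrays of equal length and returns a fit object whose slope(), y_intercept(), is_well_defined() and show_summary() accessors appear throughout the examples below. A minimal, self-contained sketch (the sample numbers here are made up purely for illustration):

from scitbx.array_family import flex

# two flex.double arrays of equal length; the data follow roughly y = 1 + 2*x
x = flex.double([0.0, 1.0, 2.0, 3.0, 4.0])
y = flex.double([1.0, 3.1, 4.9, 7.2, 9.0])

fit = flex.linear_regression(x, y)
assert fit.is_well_defined()
print(fit.slope(), fit.y_intercept())  # close to 2.0 and 1.0
fit.show_summary()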
Example #2
File: npp.py Project: xia2/xia2
def npp(hklin):
  from iotbx.reflection_file_reader import any_reflection_file
  from xia2.Toolkit.NPP import npp_ify, mean_variance
  from scitbx.array_family import flex
  import math
  import sys
  reader = any_reflection_file(hklin)
  mtz_object = reader.file_content()
  intensities = [ma for ma in reader.as_miller_arrays(merge_equivalents=False)
                 if ma.info().labels == ['I', 'SIGI']][0]
  indices = intensities.indices()

  # merging: use external variance i.e. variances derived from SIGI column
  merger = intensities.merge_equivalents(use_internal_variance=False)
  mult = merger.redundancies().data()
  imean = merger.array()
  unique = imean.indices()
  iobs = imean.data()
  # scale up variance to account for sqrt(multiplicity) effective scaling
  variobs = (imean.sigmas() ** 2) * mult.as_double()

  all = flex.double()
  cen = flex.double()

  for hkl, i, v, m in zip(unique, iobs, variobs, mult):

    # only consider if meaningful number of observations
    if m < 3:
      continue

    sel = indices == hkl
    data = intensities.select(sel).data()

    assert(m == len(data))

    _x, _y = npp_ify(data, input_mean_variance=(i,v))

    # perform linreg on (i) all data and (ii) subset between +/- 2 sigma

    sel = (flex.abs(_x) < 2)
    _x_ = _x.select(sel)
    _y_ = _y.select(sel)

    fit_all = flex.linear_regression(_x, _y)
    fit_cen = flex.linear_regression(_x_, _y_)

    all.append(fit_all.slope())
    cen.append(fit_cen.slope())

    print '%3d %3d %3d' % hkl, '%.2f %.2f %.2f' % (i, v, i/math.sqrt(v)), \
      '%.2f %.2f' % (fit_all.slope(), fit_cen.slope()), '%d' % m

  sys.stderr.write('Mean gradients: %.2f %.2f\n' % (flex.sum(all) / all.size(),
                                                    flex.sum(cen) / cen.size()))
Example #3
File: NPP.py Project: xia2/xia2
def test():
  numbers = variate(poisson_distribution(mean = 1000))
  data = flex.double()
  for j in range(1000):
    data.append(numbers.next())

  _x, _y = npp_ify(data)
  fit = flex.linear_regression(_x, _y)
  fit.show_summary()

  _x, _y = npp_ify(data, input_mean_variance=(1000, 1000))
  fit = flex.linear_regression(_x, _y)
  fit.show_summary()
Example #4
def test():
  numbers = variate(poisson_distribution(mean = 1000))
  data = flex.double()
  for j in range(1000):
    data.append(next(numbers))

  _x, _y = npp_ify(data)
  fit = flex.linear_regression(_x, _y)
  fit.show_summary()

  _x, _y = npp_ify(data, input_mean_variance=(1000, 1000))
  fit = flex.linear_regression(_x, _y)
  fit.show_summary()
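Both test functions above are excerpts and rely on module-level imports in their original files (flex, npp_ify and the Poisson variate generator, presumably from scitbx.random). Conceptually, npp_ify returns the two axes of a normal probability plot, so a regression slope close to 1 means the data scatter exactly as much as the supplied mean/variance predicts. A rough illustrative sketch of that idea, assuming the standard normal-probability-plot recipe rather than the actual xia2.Toolkit.NPP implementation:

from statistics import NormalDist
from scitbx.array_family import flex

def npp_ify_sketch(data, input_mean_variance):
    # standardise the observations with the supplied mean and variance,
    # sort them, and pair them with the expected normal order statistics
    mean, var = input_mean_variance
    scaled = (data - mean) / (var ** 0.5)
    scaled = scaled.select(flex.sort_permutation(scaled))
    n = data.size()
    expected = flex.double(
        [NormalDist().inv_cdf((j + 0.5) / n) for j in range(n)])
    return expected, scaled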
Example #5
  def compute_rg_from_data(self, q, i):
    q_sq = q * q
    ln_i = flex.log(i)
    cc_obj = flex.linear_regression(q_sq, ln_i)
    rg2 = -cc_obj.slope() * 3.0
    lni = cc_obj.y_intercept()
    return rg2, lni
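This is a Guinier fit: in the Guinier approximation ln I(q) ≈ ln I(0) - (Rg^2/3)*q^2, so the slope of ln I against q^2 gives Rg^2 = -3*slope and the intercept gives ln I(0). A hypothetical round-trip check on synthetic data (not part of the original project):

import math
from scitbx.array_family import flex

rg = 20.0  # radius of gyration used to synthesise ideal Guinier data
q = flex.double([0.005 * (j + 1) for j in range(20)])
i_obs = flex.double([math.exp(5.0 - (rg * rg / 3.0) * qq * qq) for qq in q])

fit = flex.linear_regression(q * q, flex.log(i_obs))
print(math.sqrt(-3.0 * fit.slope()))  # recovers ~20.0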
Example #6
  def estimate_cc_sig_fac(self):

    # A1.1. Estimation of sigma(CC) as a function of sample size.

    binner = self.intensities.setup_binner_counting_sorted(reflections_per_bin=200)

    a = flex.double()
    b = flex.double()
    for i in range(binner.n_bins_all()):
      count = binner.counts()[i]
      if count == 0:
        continue
      bin_isel = binner.array_indices(i)
      p = flex.random_permutation(count)
      p = p[:2 * (count // 2)] # ensure even count
      a.extend(self.intensities.data().select(bin_isel.select(p[:count//2])))
      b.extend(self.intensities.data().select(bin_isel.select(p[count//2:])))

    perm = flex.random_selection(a.size(), min(20000, a.size()))
    a = a.select(perm)
    b = b.select(perm)

    self.corr_unrelated = CorrelationCoefficientAccumulator(a, b)

    n_pairs = a.size()
    min_num_groups = 10 # minimum number of groups
    max_n_group = int(min(n_pairs/min_num_groups, 200)) # maximum number in group
    min_n_group = int(min(5, max_n_group)) # minimum number in group

    mean_ccs = flex.double()
    rms_ccs = flex.double()
    ns = flex.double()
    for n in range(min_n_group, max_n_group):
      ns.append(n)
      ccs = flex.double()
      for i in range(200):
        isel = flex.random_selection(a.size(), n)
        corr = CorrelationCoefficientAccumulator(a.select(isel), b.select(isel))
        ccs.append(corr.coefficient())

      mean_ccs.append(flex.mean(ccs))
      rms_ccs.append(flex.mean(flex.pow2(ccs))**0.5)

    x = 1/flex.pow(ns, 0.5)
    y = rms_ccs
    fit = flex.linear_regression(x, y)

    assert fit.is_well_defined()
    self.cc_sig_fac = fit.slope()

    if 0:
      from matplotlib import pyplot as plt
      plt.plot(x, y)
      plt.plot(
        plt.xlim(), [fit.slope() * x_ + fit.y_intercept() for x_ in plt.xlim()])
      plt.show()
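The fitted slope is used as cc_sig_fac, i.e. the constant in sigma(CC) ≈ cc_sig_fac / sqrt(N) (spelled out in the docstring of the fuller variant in Example #10 below). As a synthetic sanity check of that scaling, independent of the project code: for N uncorrelated pairs the rms correlation coefficient falls off roughly as 1/sqrt(N), so the fitted slope comes out close to 1.

import random
from scitbx.array_family import flex

random.seed(0)
ns = flex.double()
rms_ccs = flex.double()
for n in range(5, 101, 5):
    ccs = flex.double()
    for _ in range(200):
        a = flex.double([random.gauss(0, 1) for _ in range(n)])
        b = flex.double([random.gauss(0, 1) for _ in range(n)])
        ccs.append(flex.linear_correlation(a, b).coefficient())
    ns.append(n)
    rms_ccs.append(flex.mean(flex.pow2(ccs)) ** 0.5)

fit = flex.linear_regression(1 / flex.pow(ns, 0.5), rms_ccs)
print(fit.slope())  # close to 1 for uncorrelated Gaussian pairs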
Example #7
def get_r_free_stats (miller_array, test_flag_value) :
  from scitbx.array_family import flex
  array = get_r_free_as_bool(miller_array, test_flag_value)
  n_free = array.data().count(True)
  accu =  array.sort(by_value="resolution").r_free_flags_accumulation()
  lr = flex.linear_regression(accu.reflection_counts.as_double(),
                              accu.free_fractions)
  assert lr.is_well_defined()
  slope = lr.slope()
  y_ideal = accu.reflection_counts.as_double() * slope
  sse = 0
  n_bins = 0
  n_ref_last = 0
  sse = flex.sum(flex.pow(y_ideal - accu.free_fractions, 2))
  for x in accu.reflection_counts :
    if x > (n_ref_last + 1) :
      n_bins += 1
    n_ref_last = x
  return (n_bins, n_free, sse, accu)
Example #8
def get_r_free_stats(miller_array, test_flag_value):
    from scitbx.array_family import flex
    array = get_r_free_as_bool(miller_array, test_flag_value)
    n_free = array.data().count(True)
    accu = array.sort(by_value="resolution").r_free_flags_accumulation()
    lr = flex.linear_regression(accu.reflection_counts.as_double(),
                                accu.free_fractions)
    assert lr.is_well_defined()
    slope = lr.slope()
    y_ideal = accu.reflection_counts.as_double() * slope
    sse = 0
    n_bins = 0
    n_ref_last = 0
    sse = flex.sum(flex.pow(y_ideal - accu.free_fractions, 2))
    for x in accu.reflection_counts:
        if x > (n_ref_last + 1):
            n_bins += 1
        n_ref_last = x
    return (n_bins, n_free, sse, accu)
Example #9
def find_delta(rho_map, tol):
  """ Find delta as hinted on fig. 1 of ref. [1] in module charge_flipping """
  rho = rho_map.real_map_unpadded().as_1d()
  max_rho = flex.max(rho)
  rho /= max_rho
  sorting = flex.sort_permutation(rho)
  sorted_rho = rho.select(sorting)
  n = len(sorted_rho)
  p,q = n//4, 3*n//4
  indexes = flex.double_range(p,q)
  values = sorted_rho[p:q]
  c = flex.linear_correlation(indexes, values)
  assert c.is_well_defined() and c.coefficient() > 0.99
  r = flex.linear_regression(indexes, values)
  a,b = r.y_intercept(), r.slope()
  deviation = flex.abs(a + b*flex.double_range(n) - sorted_rho)
  non_linear_sel = deviation > tol
  low = flex.first_index(non_linear_sel, False)
  high = flex.last_index(non_linear_sel, False)
  assert non_linear_sel[low:high].count(False)/(high-low+1) > 0.99
  assert sorted_rho[low] < 0 and sorted_rho[high] > 0
  return min(sorted_rho[high], -sorted_rho[low]), max_rho
Example #10
    def _estimate_cc_sig_fac(self):
        """Estimation of sigma(CC) as a function of sample size.

        Estimate the error in the correlation coefficient, sigma(CC) by using
        pairs of reflections at similar resolutions that are not related by
        potential symmetry. Using pairs of unrelated reflections at similar
        resolutions, calculate sigma(CC) == rms(CC) for groups of size N = 3..200.
        The constant CCsigFac is obtained from a linear fit of
        sigma(CC) to 1/N^(1/2), i.e.:
            sigma(CC) = CCsigFac/N^(1/2)
        """

        max_bins = 500
        reflections_per_bin = max(
            200, int(math.ceil(self.intensities.size() / max_bins)))
        binner = self.intensities.setup_binner_counting_sorted(
            reflections_per_bin=reflections_per_bin)

        a = flex.double()
        b = flex.double()
        ma_tmp = self.intensities.customized_copy(
            crystal_symmetry=crystal.symmetry(
                space_group=self.lattice_group,
                unit_cell=self.intensities.unit_cell(),
                assert_is_compatible_unit_cell=False,
            )).map_to_asu()
        for i in range(binner.n_bins_all()):
            count = binner.counts()[i]
            if count == 0:
                continue
            bin_isel = binner.array_indices(i)
            p = flex.random_permutation(count)
            p = p[:2 * (count // 2)]  # ensure even count
            ma_a = ma_tmp.select(bin_isel.select(p[:count // 2]))
            ma_b = ma_tmp.select(bin_isel.select(p[count // 2:]))
            # only choose pairs of reflections that don't have the same indices
            # in the asu of the lattice group
            sel = ma_a.indices() != ma_b.indices()
            a.extend(ma_a.data().select(sel))
            b.extend(ma_b.data().select(sel))

        perm = flex.random_selection(a.size(), min(20000, a.size()))
        a = a.select(perm)
        b = b.select(perm)

        self.corr_unrelated = CorrelationCoefficientAccumulator(a, b)

        n_pairs = a.size()
        min_num_groups = 10  # minimum number of groups
        max_n_group = int(min(n_pairs / min_num_groups,
                              200))  # maximum number in group
        min_n_group = int(min(5, max_n_group))  # minimum number in group

        if (max_n_group - min_n_group) < 4:
            self.cc_sig_fac = 0
            return

        mean_ccs = flex.double()
        rms_ccs = flex.double()
        ns = flex.double()
        for n in range(min_n_group, max_n_group + 1):
            ns.append(n)
            ccs = flex.double()
            for i in range(200):
                isel = flex.random_selection(a.size(), n)
                corr = CorrelationCoefficientAccumulator(
                    a.select(isel), b.select(isel))
                ccs.append(corr.coefficient())

            mean_ccs.append(flex.mean(ccs))
            rms_ccs.append(flex.mean(flex.pow2(ccs))**0.5)

        x = 1 / flex.pow(ns, 0.5)
        y = rms_ccs
        fit = flex.linear_regression(x, y)

        if fit.is_well_defined():
            self.cc_sig_fac = fit.slope()
        else:
            self.cc_sig_fac = 0
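A hypothetical follow-on use of the fitted constant (the helper name is mine, not the project's), applying the relation sigma(CC) = cc_sig_fac / sqrt(N) stated in the docstring above:

import math

def estimated_sigma_cc(cc_sig_fac, n_pairs):
    # estimated standard error of a CC computed from n_pairs unrelated pairs
    return cc_sig_fac / math.sqrt(n_pairs)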
Example #11
def semisynthetic_variance_analysis(semisynthetic_integrated_data_files,
                                    value_column):
    import six.moves.cPickle as pickle
    import math
    from dials.array_family import flex
    from dials.util.add_hash import add_hash

    integrated_data_sets = [
        pickle.load(open(data_file, "rb"))
        for data_file in semisynthetic_integrated_data_files
    ]

    # check column, find variance
    integrated_data = integrated_data_sets[0]
    assert value_column in integrated_data
    variance_column = None
    for column in [
            "%s.variance" % value_column,
            value_column.replace("value", "variance"),
    ]:
        if column in integrated_data:
            variance_column = column
            break
    assert variance_column
    data = integrated_data[value_column]
    if hasattr(data, "parts"):
        multicolumn = len(data.parts())
    else:
        multicolumn = 0

    # first prepare the data files i.e. remove partials, keep only integrated
    # reflections, add the hash column, add weight column

    hash_set = None

    hashed_data_sets = []

    for integrated_data in integrated_data_sets:
        size0 = integrated_data.size()
        if "intensity" in value_column:
            sel = integrated_data.get_flags(integrated_data.flags.integrated)
            integrated_data = integrated_data.select(sel)
            sel = integrated_data["partiality"] > 0.99
            integrated_data = integrated_data.select(sel)
        elif "xyzobs" in value_column:
            sel = integrated_data.get_flags(integrated_data.flags.indexed)
            integrated_data = integrated_data.select(sel)
        integrated_data = add_hash(integrated_data)
        hashed_data_sets.append(integrated_data)
        if hash_set is None:
            hash_set = set(integrated_data["hash"])
        else:
            hash_set = hash_set.intersection(set(integrated_data["hash"]))
        size1 = integrated_data.size()

    duplicate = []
    for h in hash_set:
        # check for duplicates i.e. reflection at 0, 2pi
        for i in hashed_data_sets:
            sel = i["hash"] == h
            isel = sel.iselection()
            if len(isel) > 1:
                duplicate.append(h)

    for d in duplicate:
        hash_set.discard(d)

    # now analyse those reflections found to be in all data sets (here looking
    # at the profile fitted intensity and variance thereof)

    for h in hash_set:
        if not multicolumn:
            values = flex.double()
            variances = flex.double()
            for i in hashed_data_sets:
                sel = i["hash"] == h
                isel = sel.iselection()
                assert len(isel) == 1
                values.append(i[isel[0]][value_column])
                variances.append(i[isel[0]][variance_column])
            weighted_mean, weighted_variance = weighted_mean_variance(
                values, variances)
            expected, scaled = npp(values, (weighted_mean, weighted_variance))
            fit = flex.linear_regression(expected, scaled)
            # since I have everything needed to compute chi-square here...
            n = len(values)
            chi2 = (sum([((v - weighted_mean)**2) / weighted_variance
                         for v in values]) / n)
            print("%.3f %.3f %.3f" %
                  (weighted_mean / math.sqrt(weighted_variance), fit.slope(),
                   chi2))
        else:
            values = {}
            variances = {}
            for m in range(multicolumn):
                values[m] = flex.double()
                variances[m] = flex.double()
            for i in hashed_data_sets:
                sel = i["hash"] == h
                isel = sel.iselection()
                assert len(isel) == 1
                data = i[isel[0]][value_column]
                variance = i[isel[0]][variance_column]
                for m in range(multicolumn):
                    values[m].append(data[m])
                    variances[m].append(variance[m])
            result = ""
            for m in range(multicolumn):
                weighted_mean, weighted_variance = weighted_mean_variance(
                    values[m], variances[m])
                expected, scaled = npp(values[m],
                                       (weighted_mean, weighted_variance))
                fit = flex.linear_regression(expected, scaled)
                # since I have everything needed to compute chi-square here...
                n = len(values[m])
                chi2 = (sum([((v - weighted_mean)**2) / weighted_variance
                             for v in values[m]]) / n)
                result += "%f %.3f %.3f " % (
                    math.sqrt(weighted_variance),
                    fit.slope(),
                    chi2,
                )
            print(result)
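The printed fit.slope() and chi2 are both consistency checks on the quoted variances: when the scatter of the repeated measurements matches their stated errors, the NPP gradient and the chi-square per observation should each come out near 1. A quick synthetic illustration of the chi-square part (made-up numbers, plain Python, independent of the project code):

import random

random.seed(0)
sigma = 2.0
values = [random.gauss(10.0, sigma) for _ in range(1000)]
mean = sum(values) / len(values)
chi2 = sum((v - mean) ** 2 / sigma ** 2 for v in values) / len(values)
print(chi2)  # comes out close to 1.0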
Example #12
def semisynthetic_variance_analysis(semisynthetic_integrated_data_files,
                                    value_column):
  import cPickle as pickle
  import math
  from dials.array_family import flex
  from dials.util.add_hash import add_hash, dehash

  integrated_data_sets = [pickle.load(open(data_file, 'rb')) for
                          data_file in semisynthetic_integrated_data_files]

  # check column, find variance
  integrated_data = integrated_data_sets[0]
  assert value_column in integrated_data
  variance_column = None
  for column in ['%s.variance' % value_column,
                 value_column.replace('value', 'variance')]:
    if column in integrated_data:
      variance_column = column
      break
  assert(variance_column)
  data = integrated_data[value_column]
  if hasattr(data, 'parts'):
    multicolumn = len(data.parts())
  else:
    multicolumn = 0

  # first prepare the data files i.e. remove partials, keep only integrated
  # reflections, add the hash column, add weight column

  hash_set = None

  hashed_data_sets = []

  for integrated_data in integrated_data_sets:
    size0 = integrated_data.size()
    if 'intensity' in value_column:
      sel = integrated_data.get_flags(integrated_data.flags.integrated)
      integrated_data = integrated_data.select(sel)
      sel = integrated_data['partiality'] > 0.99
      integrated_data = integrated_data.select(sel)
    elif 'xyzobs' in value_column:
      sel = integrated_data.get_flags(integrated_data.flags.indexed)
      integrated_data = integrated_data.select(sel)
    integrated_data = add_hash(integrated_data)
    hashed_data_sets.append(integrated_data)
    if hash_set is None:
      hash_set = set(integrated_data['hash'])
    else:
      hash_set = hash_set.intersection(set(integrated_data['hash']))
    size1 = integrated_data.size()

  duplicate = []
  for h in hash_set:
    # check for duplicates i.e. reflection at 0, 2pi
    for i in hashed_data_sets:
      sel = i['hash'] == h
      isel = sel.iselection()
      if len(isel) > 1:
        duplicate.append(h)

  for d in duplicate:
    hash_set.discard(d)

  # now analyse those reflections found to be in all data sets (here looking
  # at the profile fitted intensity and variance thereof)

  for h in hash_set:
    if not multicolumn:
      values = flex.double()
      variances = flex.double()
      for i in hashed_data_sets:
        sel = i['hash'] == h
        isel = sel.iselection()
        assert(len(isel) == 1)
        values.append(i[isel[0]][value_column])
        variances.append(i[isel[0]][variance_column])
      weighted_mean, weighted_variance = weighted_mean_variance(values,
                                                                variances)
      expected, scaled = npp(values, (weighted_mean, weighted_variance))
      fit = flex.linear_regression(expected, scaled)
      # since I have everything needed to compute chi-square here...
      n = len(values)
      chi2 = sum([((v - weighted_mean) ** 2) / weighted_variance for v in values]) / n
      print '%.3f %.3f %.3f' % (weighted_mean / math.sqrt(weighted_variance), fit.slope(), chi2)
    else:
      values = { }
      variances = { }
      for m in range(multicolumn):
        values[m] = flex.double()
        variances[m] = flex.double()
      for i in hashed_data_sets:
        sel = i['hash'] == h
        isel = sel.iselection()
        assert(len(isel) == 1)
        data = i[isel[0]][value_column]
        variance = i[isel[0]][variance_column]
        for m in range(multicolumn):
          values[m].append(data[m])
          variances[m].append(variance[m])
      result = ''
      for m in range(multicolumn):
        weighted_mean, weighted_variance = weighted_mean_variance(values[m],
                                                                  variances[m])
        expected, scaled = npp(values[m], (weighted_mean, weighted_variance))
        fit = flex.linear_regression(expected, scaled)
        # since I have everything needed to compute chi-square here...
        n = len(values[m])
        chi2 = sum([((v - weighted_mean) ** 2) / weighted_variance for v in values[m]]) / n
        result += '%f %.3f %.3f ' % (math.sqrt(weighted_variance), fit.slope(), chi2)
      print result