def runge_phenomenon(self, n=41, nt=35, print_it=False):
    x_e = 2.0 * (flex.double(xrange(n)) / float(n - 1) - 0.5)
    y_e = 1 / (1 + x_e * x_e * 25)
    fit_e = chebyshev_lsq_fit.chebyshev_lsq_fit(
        nt,
        x_e,
        y_e,
    )
    fit_e = chebyshev_polynome(nt, fit_e.low_limit, fit_e.high_limit,
                               fit_e.coefs)

    x_c = chebyshev_lsq_fit.chebyshev_nodes(n, -1, 1, True)
    y_c = 1 / (1 + x_c * x_c * 25)
    fit_c = chebyshev_lsq_fit.chebyshev_lsq_fit(
        nt,
        x_c,
        y_c,
    )
    fit_c = chebyshev_polynome(nt, fit_c.low_limit, fit_c.high_limit,
                               fit_c.coefs)

    x_plot = 2.0 * (flex.double(xrange(3 * n)) / float(3 * n - 1) - 0.5)
    y_plot_e = fit_e.f(x_plot)
    y_plot_c = fit_c.f(x_plot)
    y_id = 1 / (1 + x_plot * x_plot * 25)
    if print_it:
        for x, y, yy, yyy in zip(x_plot, y_id, y_plot_e, y_plot_c):
            print x, y, yy, yyy
def another_example(np=41,nt=5):
  x = flex.double( range(np) )/(np-1)
  y = 0.99*flex.exp(-x*x*0.5)
  y = -flex.log(1.0/y-1)
  w = y*y/1.0
  d = (flex.random_double(np)-0.5)*w
  y_obs = y+d

  y = 1.0/( 1.0 + flex.exp(-y) )

  fit_w = chebyshev_lsq_fit.chebyshev_lsq_fit(nt,
                                              x,
                                              y_obs,
                                              w )
  fit_w_f = chebyshev_polynome(
    nt, fit_w.low_limit, fit_w.high_limit, fit_w.coefs)


  fit_nw = chebyshev_lsq_fit.chebyshev_lsq_fit(nt,
                                              x,
                                              y_obs)
  fit_nw_f = chebyshev_polynome(
    nt, fit_nw.low_limit, fit_nw.high_limit, fit_nw.coefs)
  print
  print "Coefficients from weighted lsq"
  print list( fit_w.coefs )
  print "Coefficients from non-weighted lsq"
  print list( fit_nw.coefs )
  assert flex.max( flex.abs(fit_nw.coefs-fit_w.coefs) ) > 0
def runge_phenomenon(self,n=41,nt=35,print_it=False):
  x_e = 2.0*(flex.double( xrange(n) )/float(n-1)-0.5)
  y_e = 1/(1+x_e*x_e*25)
  fit_e = chebyshev_lsq_fit.chebyshev_lsq_fit(nt,
                                              x_e,
                                              y_e,
                                              )
  fit_e = chebyshev_polynome(
    nt, fit_e.low_limit, fit_e.high_limit, fit_e.coefs)


  x_c = chebyshev_lsq_fit.chebyshev_nodes(n, -1, 1, True)
  y_c = 1/(1+x_c*x_c*25)
  fit_c = chebyshev_lsq_fit.chebyshev_lsq_fit(nt,
                                              x_c,
                                              y_c,
                                              )
  fit_c = chebyshev_polynome(
    nt, fit_c.low_limit, fit_c.high_limit, fit_c.coefs)


  x_plot = 2.0*(flex.double( xrange(3*n) )/float(3*n-1)-0.5)
  y_plot_e = fit_e.f( x_plot )
  y_plot_c = fit_c.f( x_plot )
  y_id =  1/(1+x_plot*x_plot*25)
  if print_it:
    for x,y,yy,yyy in zip(x_plot,y_id,y_plot_e,y_plot_c):
      print x,y,yy,yyy
Example #4
0
 def chebyshev_fit(self, x_obs, y_obs, w_obs, n_terms=None):
     """Return a weighted Chebyshev least-squares fit evaluated at x_obs.

     x_obs, y_obs, w_obs: observations and their weights, passed
       unchanged to scitbx.math.chebyshev_lsq_fit.
     n_terms: number of polynomial terms.  When None it is chosen by
       cross validation between 5 and 20 terms.

     The number of terms and the residual are reported through
     self.logger.info.
     """
     from scitbx.math import chebyshev_lsq_fit
     from scitbx.math import chebyshev_polynome

     if n_terms is None:
         # Cross validation is far slower than the fit itself.
         pick_n_terms = \
             chebyshev_lsq_fit.cross_validate_to_determine_number_of_terms
         n_terms = pick_n_terms(
             x_obs, y_obs, w_obs,
             min_terms=5, max_terms=20, n_goes=20, n_free=20)
     self.logger.info("Fitting with %i terms" % n_terms)

     lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(n_terms, x_obs, y_obs, w_obs)
     self.logger.info("Least Squares residual: %7.6f" % (lsq.f))

     polynome = chebyshev_polynome(
         n_terms, lsq.low_limit, lsq.high_limit, lsq.coefs)
     y_fitted = polynome.f(x_obs)

     show_debug_plots = 0  # switch on by hand to compare data and fit
     if show_debug_plots:
         from matplotlib import pyplot
         pyplot.clf()
         pyplot.plot(x_obs, y_obs)
         pyplot.plot(x_obs, y_fitted)
         pyplot.draw()
         pyplot.show()
     return y_fitted
def estimate_signal_to_noise(x, y):
  """Estimate and plot a signal-to-noise curve for the trace (x, y).

  As written, the function is disabled: its first statement is a bare
  ``raise``, so every call fails before any of the code below runs.

  The unreachable body smooths ``y`` (interpolate followed by
  savitzky_golay_filter), takes ``noise = y - smoothed``, divides ``y``
  by the square root of the mean squared noise over indices 50..199,
  and shows three pyplot figures.  It has no return statement.
  """
  # NOTE(review): bare raise outside an except block -- presumably a
  # deliberate kill switch; confirm before re-enabling the function.
  raise
  # "if 1" hard-wires the filter-based branch; the else branch (a
  # Chebyshev fit) is kept only as an alternative implementation.
  if 1:
    x, y = interpolate(x, y)
    #x, y_tr = fourier_filter(x, y)
    x, y_tr = savitzky_golay_filter(x, y)
    noise = y - y_tr
  else:

    from scitbx.math import chebyshev_polynome
    from scitbx.math import chebyshev_lsq_fit

    x_obs, y_obs = x, y
    # Unit weights everywhere except the first and last points, which
    # get a very large weight.
    w_obs = flex.double(y_obs.size(), 1)
    w_obs[0] = 1e16
    w_obs[-1] = 1e16
    ## determining the number of terms takes much, much longer than the fit
    n_terms = chebyshev_lsq_fit.cross_validate_to_determine_number_of_terms(
      x_obs, y_obs, w_obs,
      min_terms=2, max_terms=30,
      n_goes=20, n_free=20)
    #n_terms = 7
    print "n_terms:", n_terms
    fit = chebyshev_lsq_fit.chebyshev_lsq_fit(n_terms, x_obs, y_obs, w_obs)
    fit_funct = chebyshev_polynome(
      n_terms, fit.low_limit, fit.high_limit, fit.coefs)
    y_fitted = fit_funct.f(x)
    y_tr = y_fitted
    n = y_tr.size()  # not used afterwards
    noise = y - y_tr


  noise_sq = flex.pow2(noise)
  from xfel.command_line.view_pixel_histograms import sliding_average
  #sigma_sq = sliding_average(noise_sq, n=31)
  # sigma_sq is computed but only referenced from commented-out lines.
  sigma_sq = sliding_average(noise_sq, n=15)
  #sigma_sq = sliding_average(sigma_sq)
  #signal_to_noise = y/flex.sqrt(sigma_sq)
  import math
  # One global noise level taken from the fixed index window 50..199.
  signal_to_noise = y/math.sqrt(flex.mean(noise_sq[50:200]))
  #pyplot.plot(noise)
  #pyplot.plot(x,y)
  #pyplot.show()
  offset = 0.2 * flex.max(y)
  offset = 0  # overrides the line above, so the curves are not shifted
  # NOTE(review): pyplot is not imported inside this function --
  # presumably a module-level import; verify.
  pyplot.plot(x, y, linewidth=2)
  pyplot.plot(x, offset+y_tr, linewidth=2)
  pyplot.show()
  pyplot.plot(x, noise, linewidth=2)
  #pyplot.plot(x, flex.sqrt(sigma_sq), linewidth=2)
  #ax2 = pyplot.twinx()
  #ax2.plot(x, y)
  pyplot.show()
  # Only the first 375 points of the signal-to-noise curve are drawn.
  pyplot.plot(x[:375], signal_to_noise[:375])
  #pyplot.xlim(
  #ax2 = pyplot.twinx()
  #ax2.plot(x, y)
  pyplot.show()
def another_example(np=41, nt=5):
    x = flex.double(range(np)) / (np - 1)
    y = 0.99 * flex.exp(-x * x * 0.5)
    y = -flex.log(1.0 / y - 1)
    w = y * y / 1.0
    d = (flex.random_double(np) - 0.5) * w
    y_obs = y + d

    y = 1.0 / (1.0 + flex.exp(-y))

    fit_w = chebyshev_lsq_fit.chebyshev_lsq_fit(nt, x, y_obs, w)
    fit_w_f = chebyshev_polynome(nt, fit_w.low_limit, fit_w.high_limit,
                                 fit_w.coefs)

    fit_nw = chebyshev_lsq_fit.chebyshev_lsq_fit(nt, x, y_obs)
    fit_nw_f = chebyshev_polynome(nt, fit_nw.low_limit, fit_nw.high_limit,
                                  fit_nw.coefs)
    print
    print "Coefficients from weighted lsq"
    print list(fit_w.coefs)
    print "Coefficients from non-weighted lsq"
    print list(fit_nw.coefs)
    assert flex.max(flex.abs(fit_nw.coefs - fit_w.coefs)) > 0
 def chebyshev_fit(self, x_obs, y_obs, w_obs, n_terms=None):
   """Fit a Chebyshev polynomial to (x_obs, y_obs) and evaluate it.

   The weights w_obs are handed to scitbx.math.chebyshev_lsq_fit.  If
   n_terms is None, the number of terms is selected by cross validation
   in the range 5..20.  Progress is logged via self.logger.info.
   Returns the fitted polynomial evaluated at x_obs.
   """
   from scitbx.math import chebyshev_lsq_fit
   from scitbx.math import chebyshev_polynome
   if n_terms is None:
     # The cross validation dominates the run time, not the fit.
     cv_settings = dict(min_terms=5, max_terms=20, n_goes=20, n_free=20)
     n_terms = chebyshev_lsq_fit.cross_validate_to_determine_number_of_terms(
       x_obs, y_obs, w_obs, **cv_settings)
   self.logger.info("Fitting with %i terms" %n_terms)
   lsq_result = chebyshev_lsq_fit.chebyshev_lsq_fit(
     n_terms, x_obs, y_obs, w_obs)
   self.logger.info("Least Squares residual: %7.6f" %(lsq_result.f))
   evaluator = chebyshev_polynome(
     n_terms, lsq_result.low_limit, lsq_result.high_limit, lsq_result.coefs)
   y_fitted = evaluator.f(x_obs)
   if 0:
     # Disabled visual check of the fit against the observations.
     from matplotlib import pyplot
     pyplot.clf()
     for curve in (y_obs, y_fitted):
       pyplot.plot(x_obs, curve)
     pyplot.draw()
     pyplot.show()
   return y_fitted
def estimate_signal_to_noise(x, y):
    """Estimate and plot a signal-to-noise curve for the trace (x, y).

    As written, the function is disabled: its first statement is a bare
    ``raise``, so every call fails before any of the code below runs.

    The unreachable body smooths ``y`` (interpolate followed by
    savitzky_golay_filter), takes ``noise = y - smoothed``, divides
    ``y`` by the square root of the mean squared noise over indices
    50..199, and shows three pyplot figures.  It has no return statement.
    """
    # NOTE(review): bare raise outside an except block -- presumably a
    # deliberate kill switch; confirm before re-enabling the function.
    raise
    # "if 1" hard-wires the filter-based branch; the else branch (a
    # Chebyshev fit) is kept only as an alternative implementation.
    if 1:
        x, y = interpolate(x, y)
        #x, y_tr = fourier_filter(x, y)
        x, y_tr = savitzky_golay_filter(x, y)
        noise = y - y_tr
    else:

        from scitbx.math import chebyshev_polynome
        from scitbx.math import chebyshev_lsq_fit

        x_obs, y_obs = x, y
        # Unit weights everywhere except the first and last points,
        # which get a very large weight.
        w_obs = flex.double(y_obs.size(), 1)
        w_obs[0] = 1e16
        w_obs[-1] = 1e16
        ## determining the number of terms takes much, much longer than the fit
        n_terms = chebyshev_lsq_fit.cross_validate_to_determine_number_of_terms(
            x_obs,
            y_obs,
            w_obs,
            min_terms=2,
            max_terms=30,
            n_goes=20,
            n_free=20)
        #n_terms = 7
        print "n_terms:", n_terms
        fit = chebyshev_lsq_fit.chebyshev_lsq_fit(n_terms, x_obs, y_obs, w_obs)
        fit_funct = chebyshev_polynome(n_terms, fit.low_limit, fit.high_limit,
                                       fit.coefs)
        y_fitted = fit_funct.f(x)
        y_tr = y_fitted
        n = y_tr.size()  # not used afterwards
        noise = y - y_tr

    noise_sq = flex.pow2(noise)
    from xfel.command_line.view_pixel_histograms import sliding_average
    #sigma_sq = sliding_average(noise_sq, n=31)
    # sigma_sq is computed but only referenced from commented-out lines.
    sigma_sq = sliding_average(noise_sq, n=15)
    #sigma_sq = sliding_average(sigma_sq)
    #signal_to_noise = y/flex.sqrt(sigma_sq)
    import math
    # One global noise level taken from the fixed index window 50..199.
    signal_to_noise = y / math.sqrt(flex.mean(noise_sq[50:200]))
    #pyplot.plot(noise)
    #pyplot.plot(x,y)
    #pyplot.show()
    offset = 0.2 * flex.max(y)
    offset = 0  # overrides the line above, so the curves are not shifted
    # NOTE(review): pyplot is not imported inside this function --
    # presumably a module-level import; verify.
    pyplot.plot(x, y, linewidth=2)
    pyplot.plot(x, offset + y_tr, linewidth=2)
    pyplot.show()
    pyplot.plot(x, noise, linewidth=2)
    #pyplot.plot(x, flex.sqrt(sigma_sq), linewidth=2)
    #ax2 = pyplot.twinx()
    #ax2.plot(x, y)
    pyplot.show()
    # Only the first 375 points of the signal-to-noise curve are drawn.
    pyplot.plot(x[:375], signal_to_noise[:375])
    #pyplot.xlim(
    #ax2 = pyplot.twinx()
    #ax2.plot(x, y)
    pyplot.show()
def example():
    x_obs = (flex.double(range(100)) + 1.0) / 101.0
    y_ideal = flex.sin(x_obs * 6.0 * 3.1415) + flex.exp(x_obs)
    y_obs = y_ideal + (flex.random_double(size=x_obs.size()) - 0.5) * 0.5
    w_obs = flex.double(x_obs.size(), 1)
    print "Trying to determine the best number of terms "
    print " via cross validation techniques"
    print
    n_terms = chebyshev_lsq_fit.cross_validate_to_determine_number_of_terms(
        x_obs, y_obs, w_obs, min_terms=5, max_terms=20, n_goes=20, n_free=20)
    print "Fitting with", n_terms, "terms"
    print
    fit = chebyshev_lsq_fit.chebyshev_lsq_fit(n_terms, x_obs, y_obs)
    print "Least Squares residual: %7.6f" % (fit.f)
    print "  R2-value            : %7.6f" % (fit.f / flex.sum(y_obs * y_obs))
    print
    fit_funct = chebyshev_polynome(n_terms, fit.low_limit, fit.high_limit,
                                   fit.coefs)

    y_fitted = fit_funct.f(x_obs)
    abs_deviation = flex.max(flex.abs((y_ideal - y_fitted)))
    print "Maximum deviation between fitted and error free data:"
    print "    %4.3f" % (abs_deviation)
    abs_deviation = flex.mean(flex.abs((y_ideal - y_fitted)))
    print "Mean deviation between fitted and error free data:"
    print "    %4.3f" % (abs_deviation)
    print
    abs_deviation = flex.max(flex.abs((y_obs - y_fitted)))
    print "Maximum deviation between fitted and observed data:"
    print "    %4.3f" % (abs_deviation)
    abs_deviation = flex.mean(flex.abs((y_obs - y_fitted)))
    print "Mean deviation between fitted and observed data:"
    print "    %4.3f" % (abs_deviation)
    print
    print "Showing 10 points"
    print "   x    y_obs y_ideal y_fit"
    for ii in range(10):
        print "%6.3f %6.3f %6.3f %6.3f" \
              %(x_obs[ii*9], y_obs[ii*9], y_ideal[ii*9], y_fitted[ii*9])

    try:
        from iotbx import data_plots
    except ImportError:
        pass
    else:
        print "Preparing output for loggraph in a file called"
        print "   chebyshev.loggraph"
        chebyshev_plot = data_plots.plot_data(plot_title='Chebyshev fitting',
                                              x_label='x values',
                                              y_label='y values',
                                              x_data=x_obs,
                                              y_data=y_obs,
                                              y_legend='Observed y values',
                                              comments='Chebyshev fit')
        chebyshev_plot.add_data(y_data=y_ideal, y_legend='Error free y values')
        chebyshev_plot.add_data(y_data=y_fitted,
                                y_legend='Fitted chebyshev approximation')
        output_logfile = open('chebyshev.loggraph', 'w')
        f = StringIO()
        data_plots.plot_data_loggraph(chebyshev_plot, f)
        output_logfile.write(f.getvalue())
Example #10
0
  def __init__(self,
               miller_obs,
               miller_calc,
               r_free_flags,
               kernel_width_free_reflections=None,
               kernel_width_d_star_cubed=None,
               kernel_in_bin_centers=False,
               kernel_on_chebyshev_nodes=True,
               n_sampling_points=20,
               n_chebyshev_terms=10,
               use_sampling_sum_weights=False,
               make_checks_and_clean_up=True):
    """Estimate sigmaA at sampling points and smooth it with a Chebyshev fit.

    Exactly one of kernel_width_free_reflections and
    kernel_width_d_star_cubed must be given (asserted).

    Steps performed:
      * optionally map all arrays to the asu, match miller_calc to
        miller_obs and convert observed intensities to amplitudes;
      * kernel-normalise observed and calculated data and keep the
        reflections selected by r_free_flags (the flags are inverted
        when none of them is True);
      * estimate sigmaA at n_sampling_points values of d_star_cubed
        (bin centres, Chebyshev nodes or an equally spaced grid);
      * fit n_chebyshev_terms terms to -log(1/sigmaA - 1) and map the
        fit back to sigmaA for the sampling points and all reflections.

    Raises RuntimeError("No free reflections.") when the free selection
    is empty.  alpha, beta and fom_array are initialised to None.
    """
    # Exactly one kernel width specification is allowed.
    width_options = [kernel_width_free_reflections, kernel_width_d_star_cubed]
    assert width_options.count(None) == 1

    self.miller_obs = miller_obs
    self.miller_calc = abs(miller_calc)
    self.r_free_flags = r_free_flags
    self.kernel_width_free_reflections = kernel_width_free_reflections
    self.kernel_width_d_star_cubed = kernel_width_d_star_cubed
    self.n_chebyshev_terms = n_chebyshev_terms

    if make_checks_and_clean_up:
      self.miller_obs = self.miller_obs.map_to_asu()
      self.miller_calc = self.miller_calc.map_to_asu()
      self.r_free_flags = self.r_free_flags.map_to_asu()
      assert self.r_free_flags.indices().all_eq(self.miller_obs.indices())
      self.miller_calc = self.miller_calc.common_set(self.miller_obs)
      assert self.r_free_flags.indices().all_eq(self.miller_calc.indices())
      assert self.miller_obs.is_real_array()

      # Work with amplitudes from here on.
      if self.miller_obs.is_xray_intensity_array():
        self.miller_obs = self.miller_obs.f_sq_as_f()
      assert (self.miller_obs.observation_type() is None
              or self.miller_obs.is_xray_amplitude_array())

    if self.miller_calc.observation_type() is None:
      self.miller_calc = self.miller_calc.set_observation_type(
        self.miller_obs)

    # Normalised structure factors for observed and calculated data.
    obs_normalisation = absolute_scaling.kernel_normalisation(
      self.miller_obs, auto_kernel=True)
    self.normalized_obs_f = obs_normalisation
    self.normalized_obs = \
      obs_normalisation.normalised_miller_dev_eps.f_sq_as_f()

    calc_normalisation = absolute_scaling.kernel_normalisation(
      self.miller_calc, auto_kernel=True)
    self.normalized_calc_f = calc_normalisation
    self.normalized_calc = \
      calc_normalisation.normalised_miller_dev_eps.f_sq_as_f()

    # Select the free reflections; with no True flag at all, invert.
    if not self.r_free_flags.data().count(True):
      inverted_flags = ~self.r_free_flags.data()
      self.r_free_flags = self.r_free_flags.array(data=inverted_flags)

    self.free_norm_obs = self.normalized_obs.select(self.r_free_flags.data())
    self.free_norm_calc = self.normalized_calc.select(
      self.r_free_flags.data())

    if self.free_norm_obs.data().size() < 1:
      raise RuntimeError("No free reflections.")

    if self.kernel_width_d_star_cubed is None:
      self.kernel_width_d_star_cubed = \
        sigmaa_estimator_kernel_width_d_star_cubed(
          r_free_flags=self.r_free_flags,
          kernel_width_free_reflections=self.kernel_width_free_reflections)

    free_obs = self.free_norm_obs
    self.sigma_target_functor = ext.sigmaa_estimator(
      e_obs=free_obs.data(),
      e_calc=self.free_norm_calc.data(),
      centric=free_obs.centric_flags().data(),
      d_star_cubed=free_obs.d_star_cubed().data(),
      width=self.kernel_width_d_star_cubed)

    # Sampling points in d_star_cubed.
    d_star_cubed_overall = self.miller_obs.d_star_cubed().data()
    self.min_h = flex.min(d_star_cubed_overall)
    self.max_h = flex.max(d_star_cubed_overall)
    self.h_array = None
    if not kernel_in_bin_centers:
      # Widen the range slightly unless bin centres are used.
      self.min_h *= 0.99
      self.max_h *= 1.01
    h_span = self.max_h - self.min_h
    if kernel_in_bin_centers:
      odd_numbers = flex.double(range(1, n_sampling_points*2, 2))
      self.h_array = odd_numbers*h_span/(n_sampling_points*2) + self.min_h
    elif kernel_on_chebyshev_nodes:
      self.h_array = chebyshev_lsq_fit.chebyshev_nodes(
        n=n_sampling_points,
        low=self.min_h,
        high=self.max_h,
        include_limits=True)
    else:
      steps = flex.double(range(n_sampling_points))
      self.h_array = steps*h_span/float(n_sampling_points-1.0) + self.min_h
    assert self.h_array.size() == n_sampling_points

    n_points = self.h_array.size()
    self.sigmaa_array = flex.double()
    self.sigmaa_array.reserve(n_points)
    self.sum_weights = flex.double()
    self.sum_weights.reserve(n_points)

    # Point estimate of sigmaA and the kernel weight sum at each h.
    for h in self.h_array:
      point_estimate = sigmaa_point_estimator(self.sigma_target_functor, h)
      self.sigmaa_array.append(point_estimate.sigmaa)
      weight_sum_at_h = self.sigma_target_functor.sum_weights(d_star_cubed=h)
      self.sum_weights.append(weight_sum_at_h)

    # Fit a smooth function to the reparametrised values.
    reparam_sa = -flex.log(1.0/self.sigmaa_array - 1.0)
    w_obs = flex.sqrt(self.sum_weights) if use_sampling_sum_weights else None
    fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_terms=self.n_chebyshev_terms,
      x_obs=self.h_array,
      y_obs=reparam_sa,
      w_obs=w_obs)

    # NOTE(review): the polynome uses self.min_h/self.max_h as limits
    # instead of fit_lsq.low_limit/high_limit; confirm that these agree
    # when kernel_in_bin_centers is set.
    cheb_pol = chebyshev_polynome(
      self.n_chebyshev_terms, self.min_h, self.max_h, fit_lsq.coefs)

    def to_sigmaa(values):
      # Inverse of the reparametrisation used for the fit.
      return 1.0/(1.0 + flex.exp(-values))

    self.sigmaa_fitted = to_sigmaa(cheb_pol.f(self.h_array))
    self.sigmaa_miller_array = to_sigmaa(cheb_pol.f(d_star_cubed_overall))
    assert flex.min(self.sigmaa_miller_array) >= 0
    assert flex.max(self.sigmaa_miller_array) <= 1
    self.sigmaa_miller_array = self.miller_obs.array(
      data=self.sigmaa_miller_array)

    self.alpha = None
    self.beta = None
    self.fom_array = None
    self.fom_array = None
  def __init__(self,
               miller_array,
               kernel_width=None,
               n_bins=23,
               n_term=13,
               d_star_sq_low=None,
               d_star_sq_high=None,
               auto_kernel=False,
               number_of_sorted_reflections_for_auto_kernel=50):
    """Normalise intensities with a kernel-averaged, Chebyshev-smoothed mean.

    miller_array: real array of x-ray amplitudes or intensities;
      amplitudes are converted with f_as_f_sq().
    kernel_width: kernel width; must be None when auto_kernel is used.
    n_bins: number of d_star_sq sampling points (must be > 1).
    n_term: number of Chebyshev terms of the smoothing fits.
    d_star_sq_low, d_star_sq_high: limits of the sampling range;
      default to the minimum/maximum 1/d^2 of the data.
    auto_kernel: False, True or an integer.  When not False, the kernel
      width is the distance from d_star_sq_low to the d_star_sq of the
      N-th reflection in sorted order, where N is
      number_of_sorted_reflections_for_auto_kernel for True, otherwise
      int(auto_kernel).

    Raises RuntimeError for non-real arrays or an unknown observation
    type, and Sorry when fewer than half of the sampling points have a
    mean intensity above 1e-16.

    Results are stored on the instance, among them
    normalizer_for_miller_array, normalised_miller and
    normalised_miller_dev_eps (the latter additionally divided by the
    epsilons).
    """
    ## Autokernel is either False, true or a specific integer
    if kernel_width is None:
      assert (auto_kernel is not False)
    if auto_kernel is not False:
      assert (kernel_width is None)
    assert miller_array.size()>0
    ## intensity arrays please
    work_array = None
    if not miller_array.is_real_array():
      raise RuntimeError("Please provide real arrays only")
      ## I might have to change this upper condition
    if miller_array.is_xray_amplitude_array():
      work_array = miller_array.f_as_f_sq()
    if miller_array.is_xray_intensity_array():
      work_array = miller_array.deep_copy()
      work_array = work_array.set_observation_type(miller_array)
    ## If type is not intensity or amplitude
    ## raise an exception please
    if not miller_array.is_xray_intensity_array():
      if not miller_array.is_xray_amplitude_array():
        raise RuntimeError("Observation type unknown")
    ## declare some shorthands
    I_obs = work_array.data()
    epsilons = work_array.epsilons().data().as_double()
    d_star_sq_hkl = work_array.d_spacings().data()
    d_star_sq_hkl = 1.0/(d_star_sq_hkl*d_star_sq_hkl)
    ## Set up some limits
    if d_star_sq_low is None:
      d_star_sq_low = flex.min(d_star_sq_hkl)
    if d_star_sq_high is None:
      d_star_sq_high = flex.max(d_star_sq_hkl)
    ## A feeble attempt to determine an appropriate kernel width
    ## that seems to work reasonable in practice
    self.kernel_width=kernel_width
    if auto_kernel is not False:
      ## get the d_star_sq_array and sort it
      sort_permut = flex.sort_permutation(d_star_sq_hkl)
      ##
      # "== True" is kept on purpose: auto_kernel=1 selects the default
      # number, exactly as before.
      if auto_kernel==True:
        number=number_of_sorted_reflections_for_auto_kernel
      else:
        number=int(auto_kernel)
      # Clamp to the last valid index.  The comparison must be ">=":
      # number == size would index one past the end of sort_permut.
      if number >= d_star_sq_hkl.size():
        number = d_star_sq_hkl.size()-1
      self.kernel_width = d_star_sq_hkl[sort_permut[number]]-d_star_sq_low
      assert self.kernel_width > 0
    ## Making the d_star_sq_array
    assert (n_bins>1) ## assure that there are more than 1 bins for interpolation
    self.d_star_sq_array = chebyshev_lsq_fit.chebyshev_nodes(
      n=n_bins,
      low=d_star_sq_low,
      high=d_star_sq_high,
      include_limits=True)

    ## Now get the average intensity please
    ##
    ## This step can be reasonably time consuming
    self.mean_I_array = scaling.kernel_normalisation(
      d_star_sq_hkl = d_star_sq_hkl,
      I_hkl = I_obs,
      epsilon = epsilons,
      d_star_sq_array = self.d_star_sq_array,
      kernel_width = self.kernel_width
      )
    self.var_I_array = scaling.kernel_normalisation(
      d_star_sq_hkl = d_star_sq_hkl,
      I_hkl = I_obs*I_obs,
      epsilon = epsilons*epsilons,
      d_star_sq_array = self.d_star_sq_array,
      kernel_width = self.kernel_width
      )
    self.var_I_array = self.var_I_array - self.mean_I_array*self.mean_I_array
    # NOTE(review): the chained assignment below replaces the variance
    # computed just above with the kernel weight sum, so var_I_array and
    # var_norm are derived from the weight sum.  Left unchanged because
    # downstream results depend on it -- confirm whether this is intended.
    self.weight_sum = self.var_I_array = scaling.kernel_normalisation(
      d_star_sq_hkl = d_star_sq_hkl,
      I_hkl = I_obs*0.0+1.0,
      epsilon = epsilons*0.0+1.0,
      d_star_sq_array = self.d_star_sq_array,
      kernel_width = self.kernel_width
      )
    eps = 1e-16 # XXX Maybe this should be larger?
    self.bin_selection = (self.mean_I_array > eps)
    sel_pos = self.bin_selection.iselection()
    # FIXME rare bug: this crashes when the majority of the data are zero,
    # e.g. because resolution limit was set too high and F/I filled in with 0.
    # it would be good to catch such cases in advance by inspecting the binned
    # values, and raise a different error message.
    assert sel_pos.size() > 0
    if (sel_pos.size() < self.mean_I_array.size() / 2) :
      raise Sorry("Analysis could not be continued because more than half "+
        "of the data have values below 1e-16.  This usually indicates either "+
        "an inappropriately high resolution cutoff, or an error in the data "+
        "file which artificially creates a higher resolution limit.")
    # Keep only sampling points with a positive mean so the logarithms
    # below are defined.
    self.mean_I_array = self.mean_I_array.select(sel_pos)
    self.d_star_sq_array = self.d_star_sq_array.select(sel_pos)
    self.var_I_array = flex.log( self.var_I_array.select( sel_pos ) )
    self.weight_sum = self.weight_sum.select(sel_pos)
    self.mean_I_array = flex.log( self.mean_I_array )
    ## Fit a chebyshev polynome please
    normalizer_fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_term,
      self.d_star_sq_array,
      self.mean_I_array )
    self.normalizer = chebyshev_polynome(
      n_term,
      d_star_sq_low,
      d_star_sq_high,
      normalizer_fit_lsq.coefs)
    var_lsq_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_term,
      self.d_star_sq_array,
      self.var_I_array )
    self.var_norm = chebyshev_polynome(
      n_term,
      d_star_sq_low,
      d_star_sq_high,
      var_lsq_fit.coefs)
    ws_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_term,
      self.d_star_sq_array,
      self.weight_sum )
    self.weight_sum = chebyshev_polynome(
      n_term,
      d_star_sq_low,
      d_star_sq_high,
      ws_fit.coefs)

    ## The data will now be normalised using the
    ## chebyshev polynome we have just obtained
    self.mean_I_array = flex.exp( self.mean_I_array)
    self.normalizer_for_miller_array =  flex.exp( self.normalizer.f(d_star_sq_hkl) )
    self.var_I_array = flex.exp( self.var_I_array )
    self.var_norm = flex.exp( self.var_norm.f(d_star_sq_hkl) )
    self.weight_sum = flex.exp( self.weight_sum.f(d_star_sq_hkl))
    self.normalised_miller = None
    self.normalised_miller_dev_eps = None
    # Sigmas, when present, are scaled by the same factors as the data.
    if work_array.sigmas() is not None:
      self.normalised_miller = work_array.customized_copy(
        data = work_array.data()/self.normalizer_for_miller_array,
        sigmas = work_array.sigmas()/self.normalizer_for_miller_array
        ).set_observation_type(work_array)
      self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
        data = self.normalised_miller.data()/epsilons,
        sigmas = self.normalised_miller.sigmas()/epsilons)\
        .set_observation_type(work_array)
    else:
      self.normalised_miller = work_array.customized_copy(
        data = work_array.data()/self.normalizer_for_miller_array
        ).set_observation_type(work_array)
      self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
        data = self.normalised_miller.data()/epsilons)\
        .set_observation_type(work_array)
    def __init__(
        self,
        miller_obs,
        miller_calc,
        r_free_flags,
        kernel_width_free_reflections=None,
        kernel_width_d_star_cubed=None,
        kernel_in_bin_centers=False,
        kernel_on_chebyshev_nodes=True,
        n_sampling_points=20,
        n_chebyshev_terms=10,
        use_sampling_sum_weights=False,
        make_checks_and_clean_up=True,
    ):
        """Estimate sigmaA at sampling points and smooth it with a Chebyshev fit.

        Exactly one of kernel_width_free_reflections and
        kernel_width_d_star_cubed must be given (asserted).

        Steps performed:
          * optionally map all arrays to the asu, match miller_calc to
            miller_obs and convert observed intensities to amplitudes;
          * kernel-normalise observed and calculated data and keep the
            reflections selected by r_free_flags (the flags are inverted
            when none of them is True);
          * estimate sigmaA at n_sampling_points values of d_star_cubed
            (bin centres, Chebyshev nodes or an equally spaced grid);
          * fit n_chebyshev_terms terms to -log(1/sigmaA - 1) and map
            the fit back to sigmaA for the sampling points and for all
            reflections.

        Raises RuntimeError("No free reflections.") when the free
        selection is empty.  alpha, beta and fom_array start as None.
        """
        # Exactly one kernel width specification is allowed.
        width_options = [kernel_width_free_reflections, kernel_width_d_star_cubed]
        assert width_options.count(None) == 1

        self.miller_obs = miller_obs
        self.miller_calc = abs(miller_calc)
        self.r_free_flags = r_free_flags
        self.kernel_width_free_reflections = kernel_width_free_reflections
        self.kernel_width_d_star_cubed = kernel_width_d_star_cubed
        self.n_chebyshev_terms = n_chebyshev_terms

        if make_checks_and_clean_up:
            self.miller_obs = self.miller_obs.map_to_asu()
            self.miller_calc = self.miller_calc.map_to_asu()
            self.r_free_flags = self.r_free_flags.map_to_asu()
            assert self.r_free_flags.indices().all_eq(self.miller_obs.indices())
            self.miller_calc = self.miller_calc.common_set(self.miller_obs)
            assert self.r_free_flags.indices().all_eq(self.miller_calc.indices())
            assert self.miller_obs.is_real_array()

            # Work with amplitudes from here on.
            if self.miller_obs.is_xray_intensity_array():
                self.miller_obs = self.miller_obs.f_sq_as_f()
            assert (
                self.miller_obs.observation_type() is None
                or self.miller_obs.is_xray_amplitude_array()
            )

        if self.miller_calc.observation_type() is None:
            self.miller_calc = self.miller_calc.set_observation_type(self.miller_obs)

        # Normalised structure factors for observed and calculated data.
        obs_normalisation = absolute_scaling.kernel_normalisation(
            self.miller_obs, auto_kernel=True
        )
        self.normalized_obs_f = obs_normalisation
        self.normalized_obs = obs_normalisation.normalised_miller_dev_eps.f_sq_as_f()

        calc_normalisation = absolute_scaling.kernel_normalisation(
            self.miller_calc, auto_kernel=True
        )
        self.normalized_calc_f = calc_normalisation
        self.normalized_calc = calc_normalisation.normalised_miller_dev_eps.f_sq_as_f()

        # Select the free reflections; with no True flag at all, invert.
        if not self.r_free_flags.data().count(True):
            inverted_flags = ~self.r_free_flags.data()
            self.r_free_flags = self.r_free_flags.array(data=inverted_flags)

        self.free_norm_obs = self.normalized_obs.select(self.r_free_flags.data())
        self.free_norm_calc = self.normalized_calc.select(self.r_free_flags.data())

        if self.free_norm_obs.data().size() < 1:
            raise RuntimeError("No free reflections.")

        if self.kernel_width_d_star_cubed is None:
            self.kernel_width_d_star_cubed = sigmaa_estimator_kernel_width_d_star_cubed(
                r_free_flags=self.r_free_flags,
                kernel_width_free_reflections=self.kernel_width_free_reflections,
            )

        free_obs = self.free_norm_obs
        self.sigma_target_functor = ext.sigmaa_estimator(
            e_obs=free_obs.data(),
            e_calc=self.free_norm_calc.data(),
            centric=free_obs.centric_flags().data(),
            d_star_cubed=free_obs.d_star_cubed().data(),
            width=self.kernel_width_d_star_cubed,
        )

        # Sampling points in d_star_cubed.
        d_star_cubed_overall = self.miller_obs.d_star_cubed().data()
        self.min_h = flex.min(d_star_cubed_overall)
        self.max_h = flex.max(d_star_cubed_overall)
        self.h_array = None
        if not kernel_in_bin_centers:
            # Widen the range slightly unless bin centres are used.
            self.min_h *= 0.99
            self.max_h *= 1.01
        h_span = self.max_h - self.min_h
        if kernel_in_bin_centers:
            odd_numbers = flex.double(xrange(1, n_sampling_points * 2, 2))
            self.h_array = odd_numbers * h_span / (n_sampling_points * 2) + self.min_h
        elif kernel_on_chebyshev_nodes:
            self.h_array = chebyshev_lsq_fit.chebyshev_nodes(
                n=n_sampling_points, low=self.min_h, high=self.max_h, include_limits=True
            )
        else:
            steps = flex.double(range(n_sampling_points))
            self.h_array = steps * h_span / float(n_sampling_points - 1.0) + self.min_h
        assert self.h_array.size() == n_sampling_points

        n_points = self.h_array.size()
        self.sigmaa_array = flex.double()
        self.sigmaa_array.reserve(n_points)
        self.sum_weights = flex.double()
        self.sum_weights.reserve(n_points)

        # Point estimate of sigmaA and the kernel weight sum at each h.
        for h in self.h_array:
            point_estimate = sigmaa_point_estimator(self.sigma_target_functor, h)
            self.sigmaa_array.append(point_estimate.sigmaa)
            weight_sum_at_h = self.sigma_target_functor.sum_weights(d_star_cubed=h)
            self.sum_weights.append(weight_sum_at_h)

        # Fit a smooth function to the reparametrised values.
        reparam_sa = -flex.log(1.0 / self.sigmaa_array - 1.0)
        w_obs = flex.sqrt(self.sum_weights) if use_sampling_sum_weights else None
        fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
            n_terms=self.n_chebyshev_terms,
            x_obs=self.h_array,
            y_obs=reparam_sa,
            w_obs=w_obs,
        )

        # NOTE(review): the polynome uses self.min_h/self.max_h as limits
        # instead of fit_lsq.low_limit/high_limit; confirm that these
        # agree when kernel_in_bin_centers is set.
        cheb_pol = chebyshev_polynome(
            self.n_chebyshev_terms, self.min_h, self.max_h, fit_lsq.coefs
        )

        def to_sigmaa(values):
            # Inverse of the reparametrisation used for the fit.
            return 1.0 / (1.0 + flex.exp(-values))

        self.sigmaa_fitted = to_sigmaa(cheb_pol.f(self.h_array))
        self.sigmaa_miller_array = to_sigmaa(cheb_pol.f(d_star_cubed_overall))
        assert flex.min(self.sigmaa_miller_array) >= 0
        assert flex.max(self.sigmaa_miller_array) <= 1
        self.sigmaa_miller_array = self.miller_obs.array(data=self.sigmaa_miller_array)

        self.alpha = None
        self.beta = None
        self.fom_array = None
    def __init__(self,
                 miller_array,
                 kernel_width=None,
                 n_bins=23,
                 n_term=13,
                 d_star_sq_low=None,
                 d_star_sq_high=None,
                 auto_kernel=False,
                 number_of_sorted_reflections_for_auto_kernel=50):
        """Normalise intensities by a smooth, resolution dependent mean.

        The mean intensity is estimated with scaling.kernel_normalisation
        on n_bins Chebyshev nodes spanning [d_star_sq_low, d_star_sq_high],
        its logarithm is fitted with an n_term Chebyshev series, and the
        exponentiated fit, evaluated at every reflection, is used to divide
        the data (and the sigmas, when present).

        Parameters:
          miller_array: non-empty real array whose observation type is
            X-ray amplitude or X-ray intensity. Amplitudes are converted
            with f_as_f_sq(); intensities are deep-copied.
          kernel_width: width handed to scaling.kernel_normalisation.
            Must be None when auto_kernel is used, and must be given
            otherwise.
          n_bins: number of Chebyshev nodes (must be > 1).
          n_term: number of terms passed to chebyshev_lsq_fit.
          d_star_sq_low, d_star_sq_high: limits in 1/d^2; default to the
            minimum / maximum found in the data.
          auto_kernel: False, True, or an integer. When not False the
            kernel width becomes the 1/d^2 distance between d_star_sq_low
            and the reflection at that (0-based) position in the list
            sorted on 1/d^2; True selects
            number_of_sorted_reflections_for_auto_kernel.
          number_of_sorted_reflections_for_auto_kernel: see auto_kernel.

        Raises:
          RuntimeError: the array is not real, or its observation type is
            neither amplitude nor intensity.
          Sorry: fewer than half of the nodes have a mean intensity above
            1e-16.
          AssertionError: inconsistent kernel_width / auto_kernel, empty
            input, n_bins <= 1, a non-positive automatic kernel width, or
            no node with a mean intensity above 1e-16.

        Main results: self.normalizer, self.normalizer_for_miller_array,
        self.normalised_miller and self.normalised_miller_dev_eps (the
        latter additionally divided by the epsilons).
        """
        ## Autokernel is either False, True or a specific integer
        if kernel_width is None:
            assert (auto_kernel is not False)
        if auto_kernel is not False:
            assert (kernel_width is None)
        assert miller_array.size() > 0
        ## intensity arrays please
        work_array = None
        if not miller_array.is_real_array():
            raise RuntimeError("Please provide real arrays only")
        ## I might have to change this upper condition
        if miller_array.is_xray_amplitude_array():
            work_array = miller_array.f_as_f_sq()
        if miller_array.is_xray_intensity_array():
            work_array = miller_array.deep_copy()
            work_array = work_array.set_observation_type(miller_array)
        ## If type is not intensity or amplitude
        ## raise an exception please
        if not miller_array.is_xray_intensity_array():
            if not miller_array.is_xray_amplitude_array():
                raise RuntimeError("Observation type unknown")
        ## declare some shorthands
        I_obs = work_array.data()
        epsilons = work_array.epsilons().data().as_double()
        ## d_spacings() yields d; convert to 1/d^2
        d_star_sq_hkl = work_array.d_spacings().data()
        d_star_sq_hkl = 1.0 / (d_star_sq_hkl * d_star_sq_hkl)
        ## Set up some limits
        if d_star_sq_low is None:
            d_star_sq_low = flex.min(d_star_sq_hkl)
        if d_star_sq_high is None:
            d_star_sq_high = flex.max(d_star_sq_hkl)
        ## A feeble attempt to determine an appropriate kernel width
        ## that seems to work reasonable in practice
        self.kernel_width = kernel_width
        if auto_kernel is not False:
            ## get the d_star_sq_array and sort it
            sort_permut = flex.sort_permutation(d_star_sq_hkl)
            ## NOTE: '== True' is kept on purpose; it also matches the
            ## integer 1, exactly as before.
            if auto_kernel == True:
                number = number_of_sorted_reflections_for_auto_kernel
            else:
                number = int(auto_kernel)
            ## Clamp to the last valid index. The comparison has to be
            ## '>=': with number == size the lookup below would run one
            ## element past the end of sort_permut.
            n_reflections = d_star_sq_hkl.size()
            if number >= n_reflections:
                number = n_reflections - 1
            self.kernel_width = d_star_sq_hkl[
                sort_permut[number]] - d_star_sq_low
            assert self.kernel_width > 0
        ## Making the d_star_sq_array
        ## assure that there is more than 1 bin for interpolation
        assert (n_bins > 1)
        self.d_star_sq_array = chebyshev_lsq_fit.chebyshev_nodes(
            n=n_bins,
            low=d_star_sq_low,
            high=d_star_sq_high,
            include_limits=True)

        ## Now get the average intensity please
        ##
        ## This step can be reasonably time consuming
        self.mean_I_array = scaling.kernel_normalisation(
            d_star_sq_hkl=d_star_sq_hkl,
            I_hkl=I_obs,
            epsilon=epsilons,
            d_star_sq_array=self.d_star_sq_array,
            kernel_width=self.kernel_width)
        self.var_I_array = scaling.kernel_normalisation(
            d_star_sq_hkl=d_star_sq_hkl,
            I_hkl=I_obs * I_obs,
            epsilon=epsilons * epsilons,
            d_star_sq_array=self.d_star_sq_array,
            kernel_width=self.kernel_width)
        self.var_I_array = self.var_I_array - self.mean_I_array * self.mean_I_array
        ## NOTE(review): the chained assignment below rebinds
        ## self.var_I_array to the weight-sum result, discarding the
        ## variance estimate computed just above. This looks like a
        ## copy/paste slip, but it is left untouched here: dropping it
        ## would feed the real variance into flex.log further down, and
        ## it cannot be confirmed from this code that those values are
        ## always positive. Please confirm the intent before changing.
        self.weight_sum = self.var_I_array = scaling.kernel_normalisation(
            d_star_sq_hkl=d_star_sq_hkl,
            I_hkl=I_obs * 0.0 + 1.0,
            epsilon=epsilons * 0.0 + 1.0,
            d_star_sq_array=self.d_star_sq_array,
            kernel_width=self.kernel_width)
        eps = 1e-16  # XXX Maybe this should be larger?
        ## Only nodes with a positive mean can be log-transformed
        self.bin_selection = (self.mean_I_array > eps)
        sel_pos = self.bin_selection.iselection()
        # FIXME rare bug: this crashes when the majority of the data are zero,
        # e.g. because resolution limit was set too high and F/I filled in with 0.
        # it would be good to catch such cases in advance by inspecting the binned
        # values, and raise a different error message.
        assert sel_pos.size() > 0
        if (sel_pos.size() < self.mean_I_array.size() / 2):
            raise Sorry(
                "Analysis could not be continued because more than half " +
                "of the data have values below 1e-16.  This usually indicates either "
                +
                "an inappropriately high resolution cutoff, or an error in the data "
                + "file which artificially creates a higher resolution limit.")
        self.mean_I_array = self.mean_I_array.select(sel_pos)
        self.d_star_sq_array = self.d_star_sq_array.select(sel_pos)
        self.var_I_array = flex.log(self.var_I_array.select(sel_pos))
        self.weight_sum = self.weight_sum.select(sel_pos)
        self.mean_I_array = flex.log(self.mean_I_array)
        ## Fit a chebyshev polynome please (mean and variance in log space)
        normalizer_fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
            n_term, self.d_star_sq_array, self.mean_I_array)
        self.normalizer = chebyshev_polynome(n_term, d_star_sq_low,
                                             d_star_sq_high,
                                             normalizer_fit_lsq.coefs)
        var_lsq_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
            n_term, self.d_star_sq_array, self.var_I_array)
        self.var_norm = chebyshev_polynome(n_term, d_star_sq_low,
                                           d_star_sq_high, var_lsq_fit.coefs)
        ## NOTE(review): unlike the mean and the variance, weight_sum is
        ## fitted without taking its logarithm, yet the fit is passed
        ## through flex.exp below. Confirm that this is intended.
        ws_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(n_term,
                                                     self.d_star_sq_array,
                                                     self.weight_sum)
        self.weight_sum = chebyshev_polynome(n_term, d_star_sq_low,
                                             d_star_sq_high, ws_fit.coefs)

        ## The data will now be normalised using the
        ## chebyshev polynome we have just obtained
        self.mean_I_array = flex.exp(self.mean_I_array)
        self.normalizer_for_miller_array = flex.exp(
            self.normalizer.f(d_star_sq_hkl))
        self.var_I_array = flex.exp(self.var_I_array)
        ## From here on var_norm and weight_sum hold per-reflection
        ## values instead of the fitted polynomial objects.
        self.var_norm = flex.exp(self.var_norm.f(d_star_sq_hkl))
        self.weight_sum = flex.exp(self.weight_sum.f(d_star_sq_hkl))
        self.normalised_miller = None
        self.normalised_miller_dev_eps = None
        if work_array.sigmas() is not None:
            ## Scale data and sigmas by the same factor
            self.normalised_miller = work_array.customized_copy(
                data=work_array.data() / self.normalizer_for_miller_array,
                sigmas=work_array.sigmas() /
                self.normalizer_for_miller_array).set_observation_type(
                    work_array)
            self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
              data = self.normalised_miller.data()/epsilons,
              sigmas = self.normalised_miller.sigmas()/epsilons)\
              .set_observation_type(work_array)
        else:
            self.normalised_miller = work_array.customized_copy(
                data=work_array.data() /
                self.normalizer_for_miller_array).set_observation_type(
                    work_array)
            self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
              data = self.normalised_miller.data()/epsilons)\
              .set_observation_type(work_array)
def example():
  x_obs = (flex.double(range(100))+1.0)/101.0
  y_ideal = flex.sin(x_obs*6.0*3.1415) + flex.exp(x_obs)
  y_obs = y_ideal + (flex.random_double(size=x_obs.size())-0.5)*0.5
  w_obs = flex.double(x_obs.size(),1)
  print "Trying to determine the best number of terms "
  print " via cross validation techniques"
  print
  n_terms = chebyshev_lsq_fit.cross_validate_to_determine_number_of_terms(
    x_obs,y_obs,w_obs,
    min_terms=5 ,max_terms=20,
    n_goes=20,n_free=20)
  print "Fitting with", n_terms, "terms"
  print
  fit = chebyshev_lsq_fit.chebyshev_lsq_fit(n_terms,x_obs,y_obs)
  print "Least Squares residual: %7.6f" %(fit.f)
  print "  R2-value            : %7.6f" %(fit.f/flex.sum(y_obs*y_obs))
  print
  fit_funct = chebyshev_polynome(
    n_terms, fit.low_limit, fit.high_limit, fit.coefs)

  y_fitted = fit_funct.f(x_obs)
  abs_deviation = flex.max(
    flex.abs( (y_ideal- y_fitted) ) )
  print "Maximum deviation between fitted and error free data:"
  print "    %4.3f" %(abs_deviation)
  abs_deviation = flex.mean(
    flex.abs( (y_ideal- y_fitted) ) )
  print "Mean deviation between fitted and error free data:"
  print "    %4.3f" %(abs_deviation)
  print
  abs_deviation = flex.max(
    flex.abs( (y_obs- y_fitted) ) )
  print "Maximum deviation between fitted and observed data:"
  print "    %4.3f" %(abs_deviation)
  abs_deviation = flex.mean(
    flex.abs( (y_obs- y_fitted) ) )
  print "Mean deviation between fitted and observed data:"
  print "    %4.3f" %(abs_deviation)
  print
  print "Showing 10 points"
  print "   x    y_obs y_ideal y_fit"
  for ii in range(10):
    print "%6.3f %6.3f %6.3f %6.3f" \
          %(x_obs[ii*9], y_obs[ii*9], y_ideal[ii*9], y_fitted[ii*9])

  try:
    from iotbx import data_plots
  except ImportError:
    pass
  else:
    print "Preparing output for loggraph in a file called"
    print "   chebyshev.loggraph"
    chebyshev_plot = data_plots.plot_data(plot_title='Chebyshev fitting',
                                          x_label = 'x values',
                                          y_label = 'y values',
                                          x_data = x_obs,
                                          y_data = y_obs,
                                          y_legend = 'Observed y values',
                                          comments = 'Chebyshev fit')
    chebyshev_plot.add_data(y_data=y_ideal,
                            y_legend='Error free y values')
    chebyshev_plot.add_data(y_data=y_fitted,
                            y_legend='Fitted chebyshev approximation')
    output_logfile=open('chebyshev.loggraph','w')
    f = StringIO()
    data_plots.plot_data_loggraph(chebyshev_plot,f)
    output_logfile.write(f.getvalue())