Example #1
 def log_p_obs_given_gamma(self, gamma):
   dof = self.degrees_of_freedom
   x_gamma = (gamma * self.delta_fc2.data() - self.delta_fo2.data()) \
           / self.delta_fo2.sigmas()
   if self.probability_plot_slope is not None:
     x_gamma /= self.probability_plot_slope
   return -(1+dof)/2 * flex.sum(flex.log(flex.pow2(x_gamma) + dof))
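The return value above is a Student-t style log-likelihood in x_gamma with dof degrees of freedom. For reference, a minimal NumPy sketch of the same expression, with plain arrays standing in for the class attributes (all names here are illustrative):

import numpy as np

def log_p_obs_given_gamma_np(gamma, delta_fc, delta_fo, sigmas, dof, slope=None):
    # scaled difference between gamma-weighted delta-Fcalc and delta-Fobs
    x = (gamma * delta_fc - delta_fo) / sigmas
    if slope is not None:
        x /= slope  # optional normal-probability-plot slope correction
    # Student-t style log-likelihood, summed over all reflections
    return -(1 + dof) / 2 * np.sum(np.log(x ** 2 + dof))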
Example #2
  def _calc_residuals(va, vb, pa, pb, sa, sb):

    mtch_indcs = va.miller_array.match_indices(vb.miller_array,
                                               assert_is_similar_symmetry=False)

    va_selection = mtch_indcs.pair_selection(0)
    vb_selection = mtch_indcs.pair_selection(1)

    sp_a = pa.select(va_selection) * sa.select(va_selection)
    sp_b = pb.select(vb_selection) * sb.select(vb_selection)

    ia_over_ib = va.miller_array.data().select(va_selection) / \
                 vb.miller_array.data().select(vb_selection)

    residuals = (flex.log(sp_a) - flex.log(sp_b) - flex.log(ia_over_ib))
    residuals = residuals.as_numpy_array()
    #logging.debug("Mean Residual: {}".format(np.mean(residuals)))
    return residuals[~np.isnan(residuals)]
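The match_indices step pairs reflections common to both Miller arrays before the residuals are formed. Assuming the six arrays are already paired, the core arithmetic reduces to this NumPy sketch (names illustrative):

import numpy as np

def calc_residuals_np(ia, ib, pa, pb, sa, sb):
    # log-space residuals between matched intensities (ia, ib),
    # given partialities (pa, pb) and scales (sa, sb)
    residuals = np.log(pa * sa) - np.log(pb * sb) - np.log(ia / ib)
    return residuals[~np.isnan(residuals)]  # drop NaNs from non-positive inputs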
 def sigmaa_model_error(self):
   x = 0.25*flex.pow( self.h_array, 2.0/3.0 )  # h is in d*^3 (= 1/d^3), so 0.25*h^(2/3) = (sin(theta)/lambda)^2
   y = flex.log( self.sigmaa_fitted )
   #compute the slope please
   result = flex.linear_regression( x, y )
   result = -(result.slope()/math.pi*3)
   if result < 0:
     result = None
   else:
     result = math.sqrt( result )
   return result
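The regression above runs on x = 0.25*h^(2/3), i.e. (sin(theta)/lambda)^2 when h is in d*^3 units, against y = ln(sigmaA). A self-contained sketch of the flex.linear_regression call pattern on invented data:

from scitbx.array_family import flex

# invented data: y = ln(sigmaA) sampled against x = (sin(theta)/lambda)^2
x = flex.double([0.01, 0.02, 0.03, 0.04])
y = flex.double([-0.05, -0.11, -0.16, -0.22])
fit = flex.linear_regression(x, y)
if fit.is_well_defined():
    print(fit.slope(), fit.y_intercept())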
def log_fit(x, y, degree=5):
    """Fit the values log(y(x)) then return exp() to this fit.

    x, y should be iterables containing floats of the same size. The order is the order
    of polynomial to use for this fit. This will be useful for e.g. I/sigma."""

    fit = curve_fitting.univariate_polynomial_fit(x,
                                                  flex.log(y),
                                                  degree=degree,
                                                  max_iterations=100)
    f = curve_fitting.univariate_polynomial(*fit.params)
    return flex.exp(f(x))
def log_inv_fit(x, y, degree=5):
    """Fit the values log(1 / y(x)) then return the inverse of this fit.

    x, y should be iterables, the order of the polynomial for the transformed
    fit needs to be specified. This will be useful for e.g. Rmerge."""

    fit = curve_fitting.univariate_polynomial_fit(x,
                                                  flex.log(1 / y),
                                                  degree=degree,
                                                  max_iterations=100)
    f = curve_fitting.univariate_polynomial(*fit.params)
    return 1 / flex.exp(f(x))
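Both helpers take the log before fitting, so they assume strictly positive y values. A minimal usage sketch with invented numbers, assuming scitbx-style flex arrays as input:

from scitbx.array_family import flex

# invented resolution-dependent I/sigma values; must be > 0 for log_fit
x = flex.double([0.1, 0.2, 0.3, 0.4, 0.5])
y = flex.double([25.0, 18.0, 9.5, 4.2, 1.8])
smoothed = log_fit(x, y, degree=3)  # same length as x, strictly positive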
Example #6
    def __init__(self, f_obs, asu_contents, e_statistics=False):
        assert f_obs.is_real_array()
        self.info = f_obs.info()
        f_obs_selected = f_obs.select(f_obs.data() > 0)
        f_obs_selected.use_binning_of(f_obs)
        # compute <fobs^2> in resolution shells
        self.mean_fobs_sq = f_obs_selected.mean_sq(use_binning=True, use_multiplicities=True).data[1:-1]
        n_none = self.mean_fobs_sq.count(None)
        if n_none > 0:
            error_message = "wilson_plot error: %d empty bin%s:" % plural_s(n_none)
            if self.info is not None:
                error_message += "\n  Info: " + str(self.info)
            error_message += "\n  Number of bins: %d" % len(self.mean_fobs_sq)
            error_message += "\n  Number of f_obs > 0: %d" % (f_obs_selected.indices().size())
            error_message += "\n  Number of f_obs <= 0: %d" % (f_obs.indices().size() - f_obs_selected.indices().size())
            raise RuntimeError(error_message)
        self.mean_fobs_sq = flex.double(self.mean_fobs_sq)
        # compute <s^2> = <(sin(theta)/lambda)^2> in resolution shells
        stol_sq = f_obs_selected.sin_theta_over_lambda_sq()
        stol_sq.use_binner_of(f_obs_selected)
        self.mean_stol_sq = flex.double(stol_sq.mean(use_binning=True, use_multiplicities=True).data[1:-1])
        # cache scattering factor info
        gaussians = {}
        for chemical_type in asu_contents.keys():
            gaussians[chemical_type] = eltbx.xray_scattering.wk1995(chemical_type).fetch()
        # compute expected f_calc^2 in resolution shells
        self.expected_f_sq = flex.double()
        for stol_sq in self.mean_stol_sq:
            sum_fj_sq = 0
            for chemical_type, n_atoms in asu_contents.items():
                f0 = gaussians[chemical_type].at_stol_sq(stol_sq)
                sum_fj_sq += f0 * f0 * n_atoms
            self.expected_f_sq.append(sum_fj_sq)
        self.expected_f_sq *= f_obs_selected.space_group().order_z() * f_obs_selected.space_group().n_ltr()
        # fit to straight line
        self.x = self.mean_stol_sq
        self.y = flex.log(self.mean_fobs_sq / self.expected_f_sq)
        fit = flex.linear_regression(self.x, self.y)
        assert fit.is_well_defined()
        self.fit_y_intercept = fit.y_intercept()
        self.fit_slope = fit.slope()
        self.wilson_intensity_scale_factor = math.exp(self.fit_y_intercept)  # intensity scale factor
        self.wilson_k = math.sqrt(self.wilson_intensity_scale_factor)  # conversion to amplitude scale factor
        self.wilson_b = -self.fit_slope / 2
        self.fit_correlation = flex.linear_correlation(self.x, self.y).coefficient()

        if e_statistics:
            normalised = f_obs_selected.normalised_amplitudes(asu_contents, self)
            self.normalised_f_obs = normalised.array()
            self.mean_e_sq_minus_1 = normalised.mean_e_sq_minus_1()
            self.percent_e_sq_gt_2 = normalised.percent_e_sq_gt_2()
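The straight-line fit above is the classic Wilson plot: ln(<Fobs^2> / sum_j f_j^2) is linear in (sin(theta)/lambda)^2 with intercept ln K and slope -2B. A NumPy sketch of the same fit on invented shell averages:

import numpy as np

# invented per-shell values: <(sin(theta)/lambda)^2> and <Fobs^2>/expected_f_sq
stol_sq = np.array([0.01, 0.03, 0.05, 0.08, 0.12])
ratio = np.array([2.5, 1.9, 1.3, 0.8, 0.4])
slope, intercept = np.polyfit(stol_sq, np.log(ratio), 1)
wilson_intensity_scale = np.exp(intercept)  # K, intensity scale factor
wilson_k = np.sqrt(wilson_intensity_scale)  # amplitude scale factor
wilson_b = -slope / 2.0                     # isotropic B estimate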
    def do_something_clever(self, obs, sobs, calc, mock):
        # first get the sort order
        # sort on the calculated data please
        sort_order = flex.sort_permutation(calc)
        inverse_sort_order = sort_order.inverse_permutation()

        sorted_obs = obs.select(sort_order)
        sorted_sobs = sobs.select(sort_order)
        sorted_calc = calc.select(sort_order)
        sorted_mock = mock.select(sort_order)

        log_calc = flex.log(sorted_mock)
        deltas = flex.log(sorted_obs) - flex.log(sorted_calc)

        old_deltas = deltas.deep_copy()

        # make bins on the basis of the order
        bin_size = float(sorted_obs.size()) / self.n_e_bins
        bin_size = int(bin_size) + 1
        ebin = flex.int()
        count = 0
        for ii in range(sorted_obs.size()):
            if ii % bin_size == 0:
                count += 1
            ebin.append(count - 1)

        # the bins have been set up, now we can reorder stuff
        for ibin in range(self.n_e_bins):
            this_bin_selection = flex.bool(ebin == ibin)
            tmp_n = (this_bin_selection).count(True)
            permute = flex.sort_permutation(flex.random_double(tmp_n))

            #select and swap
            selected_deltas = deltas.select(this_bin_selection)
            selected_deltas = selected_deltas.select(permute)
            selected_sobs = sorted_sobs.select(this_bin_selection)
            selected_sobs = selected_sobs.select(permute)

            # sanity check that the selected deltas are not very weird:
            # a safeguard to prevent the introduction of outliers
            mean_delta = flex.mean(selected_deltas)
            std_delta = math.sqrt(
                flex.mean(selected_deltas * selected_deltas) -
                mean_delta * mean_delta)
            outliers = flex.bool(
                flex.abs(selected_deltas - mean_delta) > self.thres *
                std_delta)
            #print list( flex.abs(selected_deltas-mean_delta)/std_delta )
            #print list( outliers )

            if (outliers).count(True) > 0:
                non_out_delta = selected_deltas.select(~outliers)
                tmp_permut = flex.sort_permutation(
                    flex.random_double((~outliers).count(True)))
                tmp_delta = non_out_delta.select(tmp_permut)
                tmp_delta = tmp_delta[0:(outliers).count(True)]
                selected_deltas = selected_deltas.set_selected(
                    outliers.iselection(), tmp_delta)

            #set the deltas back please
            deltas = deltas.set_selected(this_bin_selection, selected_deltas)
            sorted_sobs = sorted_sobs.set_selected(this_bin_selection,
                                                   selected_sobs)

        #the deltas have been swapped, apply things back please
        log_calc = log_calc + deltas
        log_calc = flex.exp(log_calc)

        #now we have to get things back in proper order again thank you
        new_fobs = log_calc.select(inverse_sort_order)
        new_sobs = sorted_sobs.select(inverse_sort_order)
        return new_fobs, new_sobs
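Stripped of the outlier guard and the separate mock array, the routine above is a rank-binned shuffle of log residuals. A simplified NumPy sketch (assuming mock == calc and skipping the outlier replacement):

import numpy as np

def permute_deltas_in_bins(obs, calc, n_bins, rng=np.random.default_rng()):
    # sort by calc, shuffle log(obs/calc) within bins of similar rank,
    # then rebuild the 'observations' and restore the original order
    order = np.argsort(calc)
    deltas = np.log(obs[order]) - np.log(calc[order])
    for chunk in np.array_split(np.arange(order.size), n_bins):
        deltas[chunk] = rng.permutation(deltas[chunk])
    new_obs = np.exp(np.log(calc[order]) + deltas)
    return new_obs[np.argsort(order)]  # argsort(order) inverts the permutation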
Example #8
    def run(self, args, command_name, out=sys.stdout):
        command_line = (iotbx_option_parser(
            usage="%s [options]" % command_name,
            description='Example: %s data.mtz data.mtz ref_model.pdb' %
            command_name).option(
                None,
                "--show_defaults",
                action="store_true",
                help="Show list of parameters.")).process(args=args)

        cif_file = None
        processed_args = utils.process_command_line_args(
            args=args, log=sys.stdout, master_params=master_phil)
        params = processed_args.params
        if (params is None): params = master_phil
        self.params = params.extract().ensemble_probability
        pdb_file_names = processed_args.pdb_file_names
        if len(pdb_file_names) != 1:
            raise Sorry("Only one PDB structure may be used")
        pdb_file = file_reader.any_file(pdb_file_names[0])
        self.log = multi_out()
        self.log.register(label="stdout", file_object=sys.stdout)
        self.log.register(label="log_buffer",
                          file_object=StringIO(),
                          atexit_send_to=None)
        sys.stderr = self.log
        log_file = open(
            pdb_file_names[0].split('/')[-1].replace('.pdb', '') +
            '_pensemble.log', "w")

        self.log.replace_stringio(old_label="log_buffer",
                                  new_label="log",
                                  new_file_object=log_file)
        utils.print_header(command_name, out=self.log)
        params.show(out=self.log)
        #
        f_obs = None
        r_free_flags = None
        reflection_files = processed_args.reflection_files

        if self.params.fobs_vs_fcalc_post_nll:
            if len(reflection_files) == 0:
                raise Sorry(
                    "Fobs from input MTZ required for fobs_vs_fcalc_post_nll")

        if len(reflection_files) > 0:
            crystal_symmetry = processed_args.crystal_symmetry
            print('Reflection file : ',
                  processed_args.reflection_file_names[0],
                  file=self.log)
            utils.print_header("Model and data statistics", out=self.log)
            rfs = reflection_file_server(
                crystal_symmetry=crystal_symmetry,
                reflection_files=processed_args.reflection_files,
                log=self.log)

            parameters = extract_xtal_data.data_and_flags_master_params(
            ).extract()
            determine_data_and_flags_result = extract_xtal_data.run(
                reflection_file_server=rfs,
                parameters=parameters,
                data_parameter_scope="refinement.input.xray_data",
                flags_parameter_scope="refinement.input.xray_data.r_free_flags",
                data_description="X-ray data",
                keep_going=True,
                log=self.log)
            f_obs = determine_data_and_flags_result.f_obs
            number_of_reflections = f_obs.indices().size()
            r_free_flags = determine_data_and_flags_result.r_free_flags
            test_flag_value = determine_data_and_flags_result.test_flag_value
            if (r_free_flags is None):
                r_free_flags = f_obs.array(
                    data=flex.bool(f_obs.data().size(), False))

        # process PDB
        pdb_file.assert_file_type("pdb")
        #
        pdb_in = hierarchy.input(file_name=pdb_file.file_name)
        ens_pdb_hierarchy = pdb_in.construct_hierarchy()
        ens_pdb_hierarchy.atoms().reset_i_seq()
        ens_pdb_xrs_s = pdb_in.input.xray_structures_simple()
        number_structures = len(ens_pdb_xrs_s)
        print('Number of structures in ensemble : ',
              number_structures,
              file=self.log)

        # Calculate sigmas from input map only
        if self.params.assign_sigma_from_map and self.params.ensemble_sigma_map_input is not None:
            # process MTZ
            input_file = file_reader.any_file(
                self.params.ensemble_sigma_map_input)
            if input_file.file_type == "hkl":
                if input_file.file_object.file_type() != "ccp4_mtz":
                    raise Sorry("Only MTZ format accepted for map input")
                else:
                    mtz_file = input_file
            else:
                raise Sorry("Only MTZ format accepted for map input")
            miller_arrays = mtz_file.file_server.miller_arrays
            map_coeffs_1 = miller_arrays[0]
            #
            xrs_list = []
            for n, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
                # get sigma levels from ensemble fc for each structure
                xrs = get_map_sigma(ens_pdb_hierarchy=ens_pdb_hierarchy,
                                    ens_pdb_xrs=ens_pdb_xrs,
                                    map_coeffs_1=map_coeffs_1,
                                    residue_detail=self.params.residue_detail,
                                    ignore_hd=self.params.ignore_hd,
                                    log=self.log)
                xrs_list.append(xrs)
            # write ensemble pdb file, occupancies as sigma level
            filename = pdb_file_names[0].split('/')[-1].replace(
                '.pdb',
                '') + '_vs_' + self.params.ensemble_sigma_map_input.replace(
                    '.mtz', '') + '_pensemble.pdb'
            write_ensemble_pdb(filename=filename,
                               xrs_list=xrs_list,
                               ens_pdb_hierarchy=ens_pdb_hierarchy)

        # Do full analysis vs Fobs
        else:
            model_map_coeffs = []
            fmodel = None
            # Get <fcalc>
            for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
                ens_pdb_xrs.set_occupancies(1.0)
                if model == 0:
                    # If mtz not supplied get fobs from xray structure...
                    # Use input Fobs for scoring against nll
                    if self.params.fobs_vs_fcalc_post_nll:
                        dummy_fobs = f_obs
                    else:
                        if f_obs is None:
                            if self.params.fcalc_high_resolution is None:
                                raise Sorry(
                                    "Please supply high resolution limit or input mtz file."
                                )
                            dummy_dmin = self.params.fcalc_high_resolution
                            dummy_dmax = self.params.fcalc_low_resolution
                        else:
                            print(
                                'Supplied mtz used to determine high and low resolution cutoffs',
                                file=self.log)
                            dummy_dmax, dummy_dmin = f_obs.d_max_min()
                        #
                        dummy_fobs = abs(
                            ens_pdb_xrs.structure_factors(
                                d_min=dummy_dmin).f_calc())
                        dummy_fobs.set_observation_type_xray_amplitude()
                        # If mtz supplied, free flags are overwritten to prevent array size error
                        r_free_flags = dummy_fobs.array(
                            data=flex.bool(dummy_fobs.data().size(), False))
                    #
                    fmodel = utils.fmodel_simple(
                        scattering_table="wk1995",
                        xray_structures=[ens_pdb_xrs],
                        f_obs=dummy_fobs,
                        target_name='ls',
                        bulk_solvent_and_scaling=False,
                        r_free_flags=r_free_flags)
                    f_calc_ave = fmodel.f_calc().array(
                        data=fmodel.f_calc().data() * 0).deep_copy()
                    # XXX Important to ensure scale is identical for each model and <model>
                    fmodel.set_scale_switch = 1.0
                    f_calc_ave_total = fmodel.f_calc().data().deep_copy()
                else:
                    fmodel.update_xray_structure(xray_structure=ens_pdb_xrs,
                                                 update_f_calc=True,
                                                 update_f_mask=False)
                    f_calc_ave_total += fmodel.f_calc().data().deep_copy()
                print('Model :', model + 1, file=self.log)
                print("\nStructure vs real Fobs (no bulk solvent or scaling)",
                      file=self.log)
                print('Rwork          : %5.4f ' % fmodel.r_work(),
                      file=self.log)
                print('Rfree          : %5.4f ' % fmodel.r_free(),
                      file=self.log)
                print('K1             : %5.4f ' % fmodel.scale_k1(),
                      file=self.log)
                fcalc_edm = fmodel.electron_density_map()
                fcalc_map_coeffs = fcalc_edm.map_coefficients(map_type='Fc')
                fcalc_mtz_dataset = fcalc_map_coeffs.as_mtz_dataset(
                    column_root_label='Fc')
                if self.params.output_model_and_model_ave_mtz:
                    fcalc_mtz_dataset.mtz_object().write(
                        file_name=str(model + 1) + "_Fc.mtz")
                model_map_coeffs.append(fcalc_map_coeffs.deep_copy())

            fmodel.update(f_calc=f_calc_ave.array(f_calc_ave_total /
                                                  number_structures))
            print("\nEnsemble vs real Fobs (no bulk solvent or scaling)",
                  file=self.log)
            print('Rwork          : %5.4f ' % fmodel.r_work(), file=self.log)
            print('Rfree          : %5.4f ' % fmodel.r_free(), file=self.log)
            print('K1             : %5.4f ' % fmodel.scale_k1(), file=self.log)

            # Get <Fcalc> map
            fcalc_ave_edm = fmodel.electron_density_map()
            fcalc_ave_map_coeffs = fcalc_ave_edm.map_coefficients(
                map_type='Fc').deep_copy()
            fcalc_ave_mtz_dataset = fcalc_ave_map_coeffs.as_mtz_dataset(
                column_root_label='Fc')
            if self.params.output_model_and_model_ave_mtz:
                fcalc_ave_mtz_dataset.mtz_object().write(file_name="aveFc.mtz")
            fcalc_ave_map_coeffs = fcalc_ave_map_coeffs.fft_map()
            fcalc_ave_map_coeffs.apply_volume_scaling()
            fcalc_ave_map_data = fcalc_ave_map_coeffs.real_map_unpadded()
            fcalc_ave_map_stats = maptbx.statistics(fcalc_ave_map_data)

            print("<Fcalc> Map Stats :", file=self.log)
            fcalc_ave_map_stats.show_summary(f=self.log)
            offset = fcalc_ave_map_stats.min()
            model_neg_ll = []

            number_previous_scatters = 0

            # Run through structure list again and get probability
            xrs_list = []
            for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
                if self.params.verbose:
                    print('\n\nModel                   : ',
                          model + 1,
                          file=self.log)
                # Get model atom sigmas vs Fcalc
                fcalc_map = model_map_coeffs[model].fft_map()
                fcalc_map.apply_volume_scaling()
                fcalc_map_data = fcalc_map.real_map_unpadded()
                fcalc_map_stats = maptbx.statistics(fcalc_map_data)
                if self.params.verbose:
                    print("Fcalc map stats         :", file=self.log)
                    fcalc_map_stats.show_summary(f=self.log)

                xrs = get_map_sigma(
                    ens_pdb_hierarchy=ens_pdb_hierarchy,
                    ens_pdb_xrs=ens_pdb_xrs,
                    fft_map_1=fcalc_map,
                    model_i=model,
                    residue_detail=self.params.residue_detail,
                    ignore_hd=self.params.ignore_hd,
                    number_previous_scatters=number_previous_scatters,
                    log=self.log)
                fcalc_sigmas = xrs.scatterers().extract_occupancies()
                del fcalc_map
                # Get model atom sigmas vs <Fcalc>
                xrs = get_map_sigma(
                    ens_pdb_hierarchy=ens_pdb_hierarchy,
                    ens_pdb_xrs=ens_pdb_xrs,
                    fft_map_1=fcalc_ave_map_coeffs,
                    model_i=model,
                    residue_detail=self.params.residue_detail,
                    ignore_hd=self.params.ignore_hd,
                    number_previous_scatters=number_previous_scatters,
                    log=self.log)

                ### For testing other residue averaging options
                #print xrs.residue_selections

                fcalc_ave_sigmas = xrs.scatterers().extract_occupancies()
                # Probability of model given <model>
                prob = fcalc_ave_sigmas / fcalc_sigmas
                # XXX debug option
                if False:
                    for n, p in enumerate(prob):
                        print(' {0:5d} {1:5.3f}'.format(n, p), file=self.log)
                # Set probability between 0 and 1
                # XXX Make Histogram / more stats
                prob_lss_zero = flex.bool(prob <= 0)
                prob_grt_one = flex.bool(prob > 1)
                prob.set_selected(prob_lss_zero, 0.001)
                prob.set_selected(prob_grt_one, 1.0)
                xrs.set_occupancies(prob)
                xrs_list.append(xrs)
                sum_neg_ll = sum(-flex.log(prob))
                model_neg_ll.append((sum_neg_ll, model))
                if self.params.verbose:
                    print('Model probability stats :', file=self.log)
                    print(prob.min_max_mean().show(), file=self.log)
                    print('  Count < 0.0 : ',
                          prob_lss_zero.count(True),
                          file=self.log)
                    print('  Count > 1.0 : ',
                          prob_grt_one.count(True),
                          file=self.log)

                # For averaging by residue
                number_previous_scatters += ens_pdb_xrs.sites_cart().size()

            # write ensemble pdb file, occupancies as sigma level
            write_ensemble_pdb(
                filename=pdb_file_names[0].split('/')[-1].replace('.pdb', '') +
                '_pensemble.pdb',
                xrs_list=xrs_list,
                ens_pdb_hierarchy=ens_pdb_hierarchy)

            # XXX Test ordering models by nll
            # XXX Test removing nth percentile atoms
            if self.params.sort_ensemble_by_nll_score or self.params.fobs_vs_fcalc_post_nll:
                for percentile in [1.0, 0.975, 0.95, 0.9, 0.8, 0.6, 0.2]:
                    model_neg_ll = sorted(model_neg_ll)
                    f_calc_ave_total_reordered = None
                    print_list = []
                    for i_neg_ll in model_neg_ll:
                        xrs = xrs_list[i_neg_ll[1]]
                        nll_occ = xrs.scatterers().extract_occupancies()

                        # Set q=0 nth percentile atoms
                        sorted_nll_occ = sorted(nll_occ, reverse=True)
                        number_atoms = len(sorted_nll_occ)
                        percentile_prob_cutoff = sorted_nll_occ[
                            int(number_atoms * percentile) - 1]
                        cutoff_selections = flex.bool(
                            nll_occ < percentile_prob_cutoff)
                        cutoff_nll_occ = flex.double(nll_occ.size(),
                                                     1.0).set_selected(
                                                         cutoff_selections,
                                                         0.0)
                        #XXX Debug
                        if False:
                            print('\nDebug')
                            for x in range(len(cutoff_selections)):
                                print(cutoff_selections[x], nll_occ[x],
                                      cutoff_nll_occ[x])
                            print(percentile)
                            print(percentile_prob_cutoff)
                            print(cutoff_selections.count(True))
                            print(cutoff_selections.size())
                            print(cutoff_nll_occ.count(0.0))
                            print('Count q = 1           : ',
                                  cutoff_nll_occ.count(1.0))
                            print('Count scatterers size : ',
                                  cutoff_nll_occ.size())

                        xrs.set_occupancies(cutoff_nll_occ)
                        fmodel.update_xray_structure(xray_structure=xrs,
                                                     update_f_calc=True,
                                                     update_f_mask=True)

                        if f_calc_ave_total_reordered is None:
                            f_calc_ave_total_reordered = fmodel.f_calc().data(
                            ).deep_copy()
                            f_mask_ave_total_reordered = fmodel.f_masks(
                            )[0].data().deep_copy()
                            cntr = 1
                        else:
                            f_calc_ave_total_reordered += fmodel.f_calc().data(
                            ).deep_copy()
                            f_mask_ave_total_reordered += fmodel.f_masks(
                            )[0].data().deep_copy()
                            cntr += 1
                        fmodel.update(
                            f_calc=f_calc_ave.array(
                                f_calc_ave_total_reordered / cntr).deep_copy(),
                            f_mask=f_calc_ave.array(
                                f_mask_ave_total_reordered / cntr).deep_copy())

                        # Update solvent and scale
                        # XXX Will need to apply_back_trace on latest version
                        fmodel.set_scale_switch = 0
                        fmodel.update_all_scales()

                        # Reset occupancies for output
                        xrs.set_occupancies(nll_occ)
                        # k1 updated vs Fobs
                        if self.params.fobs_vs_fcalc_post_nll:
                            print_list.append([
                                cntr, i_neg_ll[0], i_neg_ll[1],
                                fmodel.r_work(),
                                fmodel.r_free()
                            ])

                    # Order models by nll and print summary
                    print(
                        '\nModels ranked by nll <Fcalc> R-factors recalculated',
                        file=self.log)
                    print('Percentile cutoff : {0:5.3f}'.format(percentile),
                          file=self.log)
                    xrs_list_sorted_nll = []
                    print('      |      NLL     <Rw>     <Rf>    Ens Model',
                          file=self.log)
                    for info in print_list:
                        print(' {0:4d} | {1:8.1f} {2:8.4f} {3:8.4f} {4:12d}'.
                              format(
                                  info[0],
                                  info[1],
                                  info[3],
                                  info[4],
                                  info[2] + 1,
                              ),
                              file=self.log)
                        xrs_list_sorted_nll.append(xrs_list[info[2]])

                # Output nll ordered ensemble

                write_ensemble_pdb(
                    filename='nll_ordered_' +
                    pdb_file_names[0].split('/')[-1].replace('.pdb', '') +
                    '_pensemble.pdb',
                    xrs_list=xrs_list_sorted_nll,
                    ens_pdb_hierarchy=ens_pdb_hierarchy)
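The per-model score driving the ranking above is a clamped negative log-likelihood over atom probabilities. In isolation the scoring step is just (invented values):

import numpy as np

# prob = sigma(model vs <model>) / sigma(model vs model), clamped into (0, 1]
prob = np.clip(np.array([1.2, 0.8, -0.1, 0.5]), 0.001, 1.0)
sum_neg_ll = -np.log(prob).sum()  # lower is better; used to rank ensemble models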
Example #9
    def __init__(self, f_obs, asu_contents, e_statistics=False):
        assert f_obs.is_real_array()
        self.info = f_obs.info()
        f_obs_selected = f_obs.select(f_obs.data() > 0)
        f_obs_selected.use_binning_of(f_obs)
        # compute <fobs^2> in resolution shells
        self.mean_fobs_sq = f_obs_selected.mean_sq(
            use_binning=True, use_multiplicities=True).data[1:-1]
        n_none = self.mean_fobs_sq.count(None)
        if (n_none > 0):
            error_message = "wilson_plot error: %d empty bin%s:" % plural_s(
                n_none)
            if (self.info is not None):
                error_message += "\n  Info: " + str(self.info)
            error_message += "\n  Number of bins: %d" % len(self.mean_fobs_sq)
            error_message += "\n  Number of f_obs > 0: %d" % (
                f_obs_selected.indices().size())
            error_message += "\n  Number of f_obs <= 0: %d" % (
                f_obs.indices().size() - f_obs_selected.indices().size())
            raise RuntimeError(error_message)
        self.mean_fobs_sq = flex.double(self.mean_fobs_sq)
        # compute <s^2> = <(sin(theta)/lambda)^2> in resolution shells
        stol_sq = f_obs_selected.sin_theta_over_lambda_sq()
        stol_sq.use_binner_of(f_obs_selected)
        self.mean_stol_sq = flex.double(
            stol_sq.mean(use_binning=True, use_multiplicities=True).data[1:-1])
        # cache scattering factor info
        gaussians = {}
        for chemical_type in asu_contents.keys():
            gaussians[chemical_type] = eltbx.xray_scattering.wk1995(
                chemical_type).fetch()
        # compute expected f_calc^2 in resolution shells
        self.expected_f_sq = flex.double()
        for stol_sq in self.mean_stol_sq:
            sum_fj_sq = 0
            for chemical_type, n_atoms in asu_contents.items():
                f0 = gaussians[chemical_type].at_stol_sq(stol_sq)
                sum_fj_sq += f0 * f0 * n_atoms
            self.expected_f_sq.append(sum_fj_sq)
        self.expected_f_sq *= f_obs_selected.space_group().order_z() \
                            * f_obs_selected.space_group().n_ltr()
        # fit to straight line
        self.x = self.mean_stol_sq
        self.y = flex.log(self.mean_fobs_sq / self.expected_f_sq)
        fit = flex.linear_regression(self.x, self.y)
        assert fit.is_well_defined()
        self.fit_y_intercept = fit.y_intercept()
        self.fit_slope = fit.slope()
        self.wilson_intensity_scale_factor = math.exp(
            self.fit_y_intercept)  # intensity scale factor
        self.wilson_k = math.sqrt(self.wilson_intensity_scale_factor
                                  )  # conversion to amplitude scale factor
        self.wilson_b = -self.fit_slope / 2
        self.fit_correlation = flex.linear_correlation(self.x,
                                                       self.y).coefficient()

        if e_statistics:
            normalised = f_obs_selected.normalised_amplitudes(
                asu_contents, self)
            self.normalised_f_obs = normalised.array()
            self.mean_e_sq_minus_1 = normalised.mean_e_sq_minus_1()
            self.percent_e_sq_gt_2 = normalised.percent_e_sq_gt_2()
Example #10
  def __init__(self,
               miller_obs,
               miller_calc,
               r_free_flags,
               kernel_width_free_reflections=None,
               kernel_width_d_star_cubed=None,
               kernel_in_bin_centers=False,
               kernel_on_chebyshev_nodes=True,
               n_sampling_points=20,
               n_chebyshev_terms=10,
               use_sampling_sum_weights=False,
               make_checks_and_clean_up=True):
    assert [kernel_width_free_reflections, kernel_width_d_star_cubed].count(None) == 1

    self.miller_obs = miller_obs
    self.miller_calc = abs(miller_calc)
    self.r_free_flags = r_free_flags
    self.kernel_width_free_reflections = kernel_width_free_reflections
    self.kernel_width_d_star_cubed = kernel_width_d_star_cubed
    self.n_chebyshev_terms = n_chebyshev_terms

    if make_checks_and_clean_up:
      self.miller_obs = self.miller_obs.map_to_asu()
      self.miller_calc = self.miller_calc.map_to_asu()
      self.r_free_flags = self.r_free_flags.map_to_asu()
      assert self.r_free_flags.indices().all_eq(
        self.miller_obs.indices() )
      self.miller_calc = self.miller_calc.common_set(
        self.miller_obs )
      assert self.r_free_flags.indices().all_eq(
        self.miller_calc.indices() )
      assert self.miller_obs.is_real_array()

      if self.miller_obs.is_xray_intensity_array():
        self.miller_obs = self.miller_obs.f_sq_as_f()
      assert self.miller_obs.observation_type() is None or \
             self.miller_obs.is_xray_amplitude_array()

    if self.miller_calc.observation_type() is None:
      self.miller_calc = self.miller_calc.set_observation_type(
        self.miller_obs)

    # get normalized data please
    self.normalized_obs_f = absolute_scaling.kernel_normalisation(
      self.miller_obs, auto_kernel=True)
    self.normalized_obs = self.normalized_obs_f.normalised_miller_dev_eps.f_sq_as_f()

    self.normalized_calc_f = absolute_scaling.kernel_normalisation(
      self.miller_calc, auto_kernel=True)
    self.normalized_calc = self.normalized_calc_f.normalised_miller_dev_eps.f_sq_as_f()

    # get the 'free data'

    if(self.r_free_flags.data().count(True) == 0):
      self.r_free_flags = self.r_free_flags.array(
        data = ~self.r_free_flags.data())

    self.free_norm_obs = self.normalized_obs.select( self.r_free_flags.data() )
    self.free_norm_calc = self.normalized_calc.select( self.r_free_flags.data() )

    if self.free_norm_obs.data().size() <= 0:
      raise RuntimeError("No free reflections.")

    if (self.kernel_width_d_star_cubed is None):
      self.kernel_width_d_star_cubed=sigmaa_estimator_kernel_width_d_star_cubed(
        r_free_flags=self.r_free_flags,
        kernel_width_free_reflections=self.kernel_width_free_reflections)

    self.sigma_target_functor = ext.sigmaa_estimator(
      e_obs     = self.free_norm_obs.data(),
      e_calc    = self.free_norm_calc.data(),
      centric   = self.free_norm_obs.centric_flags().data(),
      d_star_cubed = self.free_norm_obs.d_star_cubed().data() ,
      width=self.kernel_width_d_star_cubed)

    d_star_cubed_overall = self.miller_obs.d_star_cubed().data()
    self.min_h = flex.min( d_star_cubed_overall )
    self.max_h = flex.max( d_star_cubed_overall )
    self.h_array = None
    if (kernel_in_bin_centers):
      self.h_array = flex.double( range(1,n_sampling_points*2,2) )*(
        self.max_h-self.min_h)/(n_sampling_points*2)+self.min_h
    else:
      self.min_h *= 0.99
      self.max_h *= 1.01
      if kernel_on_chebyshev_nodes:
        self.h_array = chebyshev_lsq_fit.chebyshev_nodes(
          n=n_sampling_points,
          low=self.min_h,
          high=self.max_h,
          include_limits=True)
      else:
        self.h_array = flex.double( range(n_sampling_points) )*(
          self.max_h-self.min_h)/float(n_sampling_points-1.0)+self.min_h
    assert self.h_array.size() == n_sampling_points
    self.sigmaa_array = flex.double()
    self.sigmaa_array.reserve(self.h_array.size())
    self.sum_weights = flex.double()
    self.sum_weights.reserve(self.h_array.size())

    for h in self.h_array:
      estimator = sigmaa_point_estimator(self.sigma_target_functor, h)
      self.sigmaa_array.append( estimator.sigmaa )
      self.sum_weights.append(
        self.sigma_target_functor.sum_weights(d_star_cubed=h))

    # fit a smooth function
    reparam_sa = -flex.log( 1.0/self.sigmaa_array -1.0 )
    if (use_sampling_sum_weights):
      w_obs = flex.sqrt(self.sum_weights)
    else:
      w_obs = None
    fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_terms=self.n_chebyshev_terms,
      x_obs=self.h_array,
      y_obs=reparam_sa,
      w_obs=w_obs)

    cheb_pol = chebyshev_polynome(
        self.n_chebyshev_terms,
        self.min_h,
        self.max_h,
        fit_lsq.coefs)
    def reverse_reparam(values): return 1.0/(1.0 + flex.exp(-values))
    self.sigmaa_fitted = reverse_reparam(cheb_pol.f(self.h_array))
    self.sigmaa_miller_array = reverse_reparam(cheb_pol.f(d_star_cubed_overall))
    assert flex.min(self.sigmaa_miller_array) >= 0
    assert flex.max(self.sigmaa_miller_array) <= 1
    self.sigmaa_miller_array = self.miller_obs.array(data=self.sigmaa_miller_array)

    self.alpha = None
    self.beta = None
    self.fom_array = None
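The Chebyshev fit above is carried out in a logit-transformed coordinate so that the smoothed sigmaA stays strictly inside (0, 1). A NumPy sketch of the transform pair:

import numpy as np

sigmaa = np.array([0.2, 0.5, 0.9])
reparam = -np.log(1.0 / sigmaa - 1.0)       # logit: (0,1) -> (-inf, inf); fit here
recovered = 1.0 / (1.0 + np.exp(-reparam))  # inverse transform after the fit
assert np.allclose(recovered, sigmaa)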
  def __init__(self,
               miller_array,
               kernel_width=None,
               n_bins=23,
               n_term=13,
               d_star_sq_low=None,
               d_star_sq_high=None,
               auto_kernel=False,
               number_of_sorted_reflections_for_auto_kernel=50):
    ## auto_kernel is either False, True, or a specific integer
    if kernel_width is None:
      assert (auto_kernel is not False)
    if auto_kernel is not False:
      assert (kernel_width is None)
    assert miller_array.size()>0
    ## intensity arrays please
    work_array = None
    if not miller_array.is_real_array():
      raise RuntimeError("Please provide real arrays only")
      ## I might have to change this upper condition
    if miller_array.is_xray_amplitude_array():
      work_array = miller_array.f_as_f_sq()
    if miller_array.is_xray_intensity_array():
      work_array = miller_array.deep_copy()
      work_array = work_array.set_observation_type(miller_array)
    ## If type is not intensity or amplitude
    ## raise an exception please
    if not miller_array.is_xray_intensity_array():
      if not miller_array.is_xray_amplitude_array():
        raise RuntimeError("Observation type unknown")
    ## declare some shorthands
    I_obs = work_array.data()
    epsilons = work_array.epsilons().data().as_double()
    d_star_sq_hkl = work_array.d_spacings().data()
    d_star_sq_hkl = 1.0/(d_star_sq_hkl*d_star_sq_hkl)
    ## Set up some limits
    if d_star_sq_low is None:
      d_star_sq_low = flex.min(d_star_sq_hkl)
    if d_star_sq_high is None:
      d_star_sq_high = flex.max(d_star_sq_hkl)
    ## A feeble attempt to determine an appropriate kernel width
    ## that seems to work reasonably in practice
    self.kernel_width=kernel_width
    if auto_kernel is not False:
      ## get the d_star_sq_array and sort it
      sort_permut = flex.sort_permutation(d_star_sq_hkl)
      ##
      if auto_kernel is True:
        number=number_of_sorted_reflections_for_auto_kernel
      else:
        number=int(auto_kernel)
      if number > d_star_sq_hkl.size():
        number = d_star_sq_hkl.size()-1
      self.kernel_width = d_star_sq_hkl[sort_permut[number]]-d_star_sq_low
      assert self.kernel_width > 0
    ## Making the d_star_sq_array
    assert (n_bins>1) ## ensure that there is more than one bin for interpolation
    self.d_star_sq_array = chebyshev_lsq_fit.chebyshev_nodes(
      n=n_bins,
      low=d_star_sq_low,
      high=d_star_sq_high,
      include_limits=True)

    ## Now get the average intensity please
    ##
    ## This step can be reasonably time consuming
    self.mean_I_array = scaling.kernel_normalisation(
      d_star_sq_hkl = d_star_sq_hkl,
      I_hkl = I_obs,
      epsilon = epsilons,
      d_star_sq_array = self.d_star_sq_array,
      kernel_width = self.kernel_width
      )
    self.var_I_array = scaling.kernel_normalisation(
      d_star_sq_hkl = d_star_sq_hkl,
      I_hkl = I_obs*I_obs,
      epsilon = epsilons*epsilons,
      d_star_sq_array = self.d_star_sq_array,
      kernel_width = self.kernel_width
      )
    self.var_I_array = self.var_I_array - self.mean_I_array*self.mean_I_array
    ## NB: the double assignment below also overwrites var_I_array with the weight sums
    self.weight_sum = self.var_I_array = scaling.kernel_normalisation(
      d_star_sq_hkl = d_star_sq_hkl,
      I_hkl = I_obs*0.0+1.0,
      epsilon = epsilons*0.0+1.0,
      d_star_sq_array = self.d_star_sq_array,
      kernel_width = self.kernel_width
      )
    eps = 1e-16 # XXX Maybe this should be larger?
    self.bin_selection = (self.mean_I_array > eps)
    sel_pos = self.bin_selection.iselection()
    # FIXME rare bug: this crashes when the majority of the data are zero,
    # e.g. because resolution limit was set too high and F/I filled in with 0.
    # it would be good to catch such cases in advance by inspecting the binned
    # values, and raise a different error message.
    assert sel_pos.size() > 0
    if (sel_pos.size() < self.mean_I_array.size() / 2) :
      raise Sorry("Analysis could not be continued because more than half "+
        "of the data have values below 1e-16.  This usually indicates either "+
        "an inappropriately high resolution cutoff, or an error in the data "+
        "file which artificially creates a higher resolution limit.")
    self.mean_I_array = self.mean_I_array.select(sel_pos)
    self.d_star_sq_array = self.d_star_sq_array.select(sel_pos)
    self.var_I_array = flex.log( self.var_I_array.select( sel_pos ) )
    self.weight_sum = self.weight_sum.select(sel_pos)
    self.mean_I_array = flex.log( self.mean_I_array )
    ## Fit a Chebyshev polynomial please
    normalizer_fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_term,
      self.d_star_sq_array,
      self.mean_I_array )
    self.normalizer = chebyshev_polynome(
      n_term,
      d_star_sq_low,
      d_star_sq_high,
      normalizer_fit_lsq.coefs)
    var_lsq_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_term,
      self.d_star_sq_array,
      self.var_I_array )
    self.var_norm = chebyshev_polynome(
      n_term,
      d_star_sq_low,
      d_star_sq_high,
      var_lsq_fit.coefs)
    ws_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
      n_term,
      self.d_star_sq_array,
      self.weight_sum )
    self.weight_sum = chebyshev_polynome(
      n_term,
      d_star_sq_low,
      d_star_sq_high,
      ws_fit.coefs)

    ## The data will now be normalised using the
    ## Chebyshev polynomial we have just obtained
    self.mean_I_array = flex.exp( self.mean_I_array)
    self.normalizer_for_miller_array =  flex.exp( self.normalizer.f(d_star_sq_hkl) )
    self.var_I_array = flex.exp( self.var_I_array )
    self.var_norm = flex.exp( self.var_norm.f(d_star_sq_hkl) )
    self.weight_sum = flex.exp( self.weight_sum.f(d_star_sq_hkl))
    self.normalised_miller = None
    self.normalised_miller_dev_eps = None
    if work_array.sigmas() is not None:
      self.normalised_miller = work_array.customized_copy(
        data = work_array.data()/self.normalizer_for_miller_array,
        sigmas = work_array.sigmas()/self.normalizer_for_miller_array
        ).set_observation_type(work_array)
      self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
        data = self.normalised_miller.data()/epsilons,
        sigmas = self.normalised_miller.sigmas()/epsilons)\
        .set_observation_type(work_array)
    else:
      self.normalised_miller = work_array.customized_copy(
        data = work_array.data()/self.normalizer_for_miller_array
        ).set_observation_type(work_array)
      self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
        data = self.normalised_miller.data()/epsilons)\
        .set_observation_type(work_array)
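The smoothed <I/epsilon> values on the Chebyshev grid come from the C++ scaling.kernel_normalisation; the general idea is a kernel-weighted running mean in d*^2, roughly as in this NumPy sketch (Gaussian kernel assumed here purely for illustration):

import numpy as np

def kernel_mean_intensity(d_star_sq_hkl, i_obs, epsilons, grid, width):
    # kernel-weighted mean of I/epsilon evaluated at each grid point in d*^2
    out = np.empty(grid.size)
    for k, s in enumerate(grid):
        w = np.exp(-((d_star_sq_hkl - s) ** 2) / (2.0 * width ** 2))
        out[k] = np.sum(w * i_obs / epsilons) / np.sum(w)
    return out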
    def __init__(
        self,
        miller_obs,
        miller_calc,
        r_free_flags,
        kernel_width_free_reflections=None,
        kernel_width_d_star_cubed=None,
        kernel_in_bin_centers=False,
        kernel_on_chebyshev_nodes=True,
        n_sampling_points=20,
        n_chebyshev_terms=10,
        use_sampling_sum_weights=False,
        make_checks_and_clean_up=True,
    ):
        assert [kernel_width_free_reflections, kernel_width_d_star_cubed].count(None) == 1

        self.miller_obs = miller_obs
        self.miller_calc = abs(miller_calc)
        self.r_free_flags = r_free_flags
        self.kernel_width_free_reflections = kernel_width_free_reflections
        self.kernel_width_d_star_cubed = kernel_width_d_star_cubed
        self.n_chebyshev_terms = n_chebyshev_terms

        if make_checks_and_clean_up:
            self.miller_obs = self.miller_obs.map_to_asu()
            self.miller_calc = self.miller_calc.map_to_asu()
            self.r_free_flags = self.r_free_flags.map_to_asu()
            assert self.r_free_flags.indices().all_eq(self.miller_obs.indices())
            self.miller_calc = self.miller_calc.common_set(self.miller_obs)
            assert self.r_free_flags.indices().all_eq(self.miller_calc.indices())
            assert self.miller_obs.is_real_array()

            if self.miller_obs.is_xray_intensity_array():
                self.miller_obs = self.miller_obs.f_sq_as_f()
            assert self.miller_obs.observation_type() is None or self.miller_obs.is_xray_amplitude_array()

        if self.miller_calc.observation_type() is None:
            self.miller_calc = self.miller_calc.set_observation_type(self.miller_obs)

        # get normalized data please
        self.normalized_obs_f = absolute_scaling.kernel_normalisation(self.miller_obs, auto_kernel=True)
        self.normalized_obs = self.normalized_obs_f.normalised_miller_dev_eps.f_sq_as_f()

        self.normalized_calc_f = absolute_scaling.kernel_normalisation(self.miller_calc, auto_kernel=True)
        self.normalized_calc = self.normalized_calc_f.normalised_miller_dev_eps.f_sq_as_f()

        # get the 'free data'

        if self.r_free_flags.data().count(True) == 0:
            self.r_free_flags = self.r_free_flags.array(data=~self.r_free_flags.data())

        self.free_norm_obs = self.normalized_obs.select(self.r_free_flags.data())
        self.free_norm_calc = self.normalized_calc.select(self.r_free_flags.data())

        if self.free_norm_obs.data().size() <= 0:
            raise RuntimeError("No free reflections.")

        if self.kernel_width_d_star_cubed is None:
            self.kernel_width_d_star_cubed = sigmaa_estimator_kernel_width_d_star_cubed(
                r_free_flags=self.r_free_flags, kernel_width_free_reflections=self.kernel_width_free_reflections
            )

        self.sigma_target_functor = ext.sigmaa_estimator(
            e_obs=self.free_norm_obs.data(),
            e_calc=self.free_norm_calc.data(),
            centric=self.free_norm_obs.centric_flags().data(),
            d_star_cubed=self.free_norm_obs.d_star_cubed().data(),
            width=self.kernel_width_d_star_cubed,
        )

        d_star_cubed_overall = self.miller_obs.d_star_cubed().data()
        self.min_h = flex.min(d_star_cubed_overall)
        self.max_h = flex.max(d_star_cubed_overall)
        self.h_array = None
        if kernel_in_bin_centers:
            self.h_array = (
                flex.double(range(1, n_sampling_points * 2, 2)) * (self.max_h - self.min_h) / (n_sampling_points * 2)
                + self.min_h
            )
        else:
            self.min_h *= 0.99
            self.max_h *= 1.01
            if kernel_on_chebyshev_nodes:
                self.h_array = chebyshev_lsq_fit.chebyshev_nodes(
                    n=n_sampling_points, low=self.min_h, high=self.max_h, include_limits=True
                )
            else:
                self.h_array = (
                    flex.double(range(n_sampling_points)) * (self.max_h - self.min_h) / float(n_sampling_points - 1.0)
                    + self.min_h
                )
        assert self.h_array.size() == n_sampling_points
        self.sigmaa_array = flex.double()
        self.sigmaa_array.reserve(self.h_array.size())
        self.sum_weights = flex.double()
        self.sum_weights.reserve(self.h_array.size())

        for h in self.h_array:
            estimator = sigmaa_point_estimator(self.sigma_target_functor, h)
            self.sigmaa_array.append(estimator.sigmaa)
            self.sum_weights.append(self.sigma_target_functor.sum_weights(d_star_cubed=h))

        # fit a smooth function
        reparam_sa = -flex.log(1.0 / self.sigmaa_array - 1.0)
        if use_sampling_sum_weights:
            w_obs = flex.sqrt(self.sum_weights)
        else:
            w_obs = None
        fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
            n_terms=self.n_chebyshev_terms, x_obs=self.h_array, y_obs=reparam_sa, w_obs=w_obs
        )

        cheb_pol = chebyshev_polynome(self.n_chebyshev_terms, self.min_h, self.max_h, fit_lsq.coefs)

        def reverse_reparam(values):
            return 1.0 / (1.0 + flex.exp(-values))

        self.sigmaa_fitted = reverse_reparam(cheb_pol.f(self.h_array))
        self.sigmaa_miller_array = reverse_reparam(cheb_pol.f(d_star_cubed_overall))
        assert flex.min(self.sigmaa_miller_array) >= 0
        assert flex.max(self.sigmaa_miller_array) <= 1
        self.sigmaa_miller_array = self.miller_obs.array(data=self.sigmaa_miller_array)

        self.alpha = None
        self.beta = None
        self.fom_array = None
Example #13
    def __init__(self,
                 miller_array,
                 kernel_width=None,
                 n_bins=23,
                 n_term=13,
                 d_star_sq_low=None,
                 d_star_sq_high=None,
                 auto_kernel=False,
                 number_of_sorted_reflections_for_auto_kernel=50):
        ## auto_kernel is either False, True, or a specific integer
        if kernel_width is None:
            assert (auto_kernel is not False)
        if auto_kernel is not False:
            assert (kernel_width is None)
        assert miller_array.size() > 0
        ## intensity arrays please
        work_array = None
        if not miller_array.is_real_array():
            raise RuntimeError("Please provide real arrays only")
            ## I might have to change this upper condition
        if miller_array.is_xray_amplitude_array():
            work_array = miller_array.f_as_f_sq()
        if miller_array.is_xray_intensity_array():
            work_array = miller_array.deep_copy()
            work_array = work_array.set_observation_type(miller_array)
        ## If type is not intensity or amplitude
        ## raise an exception please
        if not miller_array.is_xray_intensity_array():
            if not miller_array.is_xray_amplitude_array():
                raise RuntimeError("Observation type unknown")
        ## declare some shorthands
        I_obs = work_array.data()
        epsilons = work_array.epsilons().data().as_double()
        d_star_sq_hkl = work_array.d_spacings().data()
        d_star_sq_hkl = 1.0 / (d_star_sq_hkl * d_star_sq_hkl)
        ## Set up some limits
        if d_star_sq_low is None:
            d_star_sq_low = flex.min(d_star_sq_hkl)
        if d_star_sq_high is None:
            d_star_sq_high = flex.max(d_star_sq_hkl)
        ## A feeble attempt to determine an appropriate kernel width
        ## that seems to work reasonably in practice
        self.kernel_width = kernel_width
        if auto_kernel is not False:
            ## get the d_star_sq_array and sort it
            sort_permut = flex.sort_permutation(d_star_sq_hkl)
            ##
            if auto_kernel is True:
                number = number_of_sorted_reflections_for_auto_kernel
            else:
                number = int(auto_kernel)
            if number > d_star_sq_hkl.size():
                number = d_star_sq_hkl.size() - 1
            self.kernel_width = d_star_sq_hkl[
                sort_permut[number]] - d_star_sq_low
            assert self.kernel_width > 0
        ## Making the d_star_sq_array
        assert (n_bins > 1)  ## ensure that there is more than one bin for interpolation
        self.d_star_sq_array = chebyshev_lsq_fit.chebyshev_nodes(
            n=n_bins,
            low=d_star_sq_low,
            high=d_star_sq_high,
            include_limits=True)

        ## Now get the average intensity please
        ##
        ## This step can be reasonably time consuming
        self.mean_I_array = scaling.kernel_normalisation(
            d_star_sq_hkl=d_star_sq_hkl,
            I_hkl=I_obs,
            epsilon=epsilons,
            d_star_sq_array=self.d_star_sq_array,
            kernel_width=self.kernel_width)
        self.var_I_array = scaling.kernel_normalisation(
            d_star_sq_hkl=d_star_sq_hkl,
            I_hkl=I_obs * I_obs,
            epsilon=epsilons * epsilons,
            d_star_sq_array=self.d_star_sq_array,
            kernel_width=self.kernel_width)
        self.var_I_array = self.var_I_array - self.mean_I_array * self.mean_I_array
        ## NB: the double assignment below also overwrites var_I_array with the weight sums
        self.weight_sum = self.var_I_array = scaling.kernel_normalisation(
            d_star_sq_hkl=d_star_sq_hkl,
            I_hkl=I_obs * 0.0 + 1.0,
            epsilon=epsilons * 0.0 + 1.0,
            d_star_sq_array=self.d_star_sq_array,
            kernel_width=self.kernel_width)
        eps = 1e-16  # XXX Maybe this should be larger?
        self.bin_selection = (self.mean_I_array > eps)
        sel_pos = self.bin_selection.iselection()
        # FIXME rare bug: this crashes when the majority of the data are zero,
        # e.g. because resolution limit was set too high and F/I filled in with 0.
        # it would be good to catch such cases in advance by inspecting the binned
        # values, and raise a different error message.
        assert sel_pos.size() > 0
        if (sel_pos.size() < self.mean_I_array.size() / 2):
            raise Sorry(
                "Analysis could not be continued because more than half " +
                "of the data have values below 1e-16.  This usually indicates either "
                +
                "an inappropriately high resolution cutoff, or an error in the data "
                + "file which artificially creates a higher resolution limit.")
        self.mean_I_array = self.mean_I_array.select(sel_pos)
        self.d_star_sq_array = self.d_star_sq_array.select(sel_pos)
        self.var_I_array = flex.log(self.var_I_array.select(sel_pos))
        self.weight_sum = self.weight_sum.select(sel_pos)
        self.mean_I_array = flex.log(self.mean_I_array)
        ## Fit a Chebyshev polynomial please
        normalizer_fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
            n_term, self.d_star_sq_array, self.mean_I_array)
        self.normalizer = chebyshev_polynome(n_term, d_star_sq_low,
                                             d_star_sq_high,
                                             normalizer_fit_lsq.coefs)
        var_lsq_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
            n_term, self.d_star_sq_array, self.var_I_array)
        self.var_norm = chebyshev_polynome(n_term, d_star_sq_low,
                                           d_star_sq_high, var_lsq_fit.coefs)
        ws_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(n_term,
                                                     self.d_star_sq_array,
                                                     self.weight_sum)
        self.weight_sum = chebyshev_polynome(n_term, d_star_sq_low,
                                             d_star_sq_high, ws_fit.coefs)

        ## The data will now be normalised using the
        ## Chebyshev polynomial we have just obtained
        self.mean_I_array = flex.exp(self.mean_I_array)
        self.normalizer_for_miller_array = flex.exp(
            self.normalizer.f(d_star_sq_hkl))
        self.var_I_array = flex.exp(self.var_I_array)
        self.var_norm = flex.exp(self.var_norm.f(d_star_sq_hkl))
        self.weight_sum = flex.exp(self.weight_sum.f(d_star_sq_hkl))
        self.normalised_miller = None
        self.normalised_miller_dev_eps = None
        if work_array.sigmas() is not None:
            self.normalised_miller = work_array.customized_copy(
                data=work_array.data() / self.normalizer_for_miller_array,
                sigmas=work_array.sigmas() /
                self.normalizer_for_miller_array).set_observation_type(
                    work_array)
            self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
              data = self.normalised_miller.data()/epsilons,
              sigmas = self.normalised_miller.sigmas()/epsilons)\
              .set_observation_type(work_array)
        else:
            self.normalised_miller = work_array.customized_copy(
                data=work_array.data() /
                self.normalizer_for_miller_array).set_observation_type(
                    work_array)
            self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
              data = self.normalised_miller.data()/epsilons)\
              .set_observation_type(work_array)
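        ## A quick sanity check one might add here (a sketch, not in the
        ## original code): for intensity data, values normalised by the
        ## smooth <I> curve should average to roughly one.
        # assert abs(flex.mean(self.normalised_miller.data()) - 1.0) < 0.5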
  def do_something_clever(self,obs,sobs,calc,mock):
    # First get the sort order: sort on the calculated data.
    sort_order = flex.sort_permutation( calc )
    inverse_sort_order = sort_order.inverse_permutation()

    sorted_obs  = obs.select(sort_order)
    sorted_sobs = sobs.select(sort_order)
    sorted_calc = calc.select(sort_order)
    sorted_mock = mock.select(sort_order)

    log_calc = flex.log(sorted_mock)
    deltas   = flex.log(sorted_obs) - flex.log(sorted_calc)
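    # deltas are the per-reflection log residuals, log(Fobs) - log(Fcalc);
    # note that log_calc is built from the mock data, so the shuffled
    # residuals are re-applied on top of the mock amplitudes below.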

    old_deltas = deltas.deep_copy()

    # make bins on the basis of the order
    bin_size = float(sorted_obs.size())/self.n_e_bins
    bin_size = int(bin_size) + 1
    ebin = flex.int()
    count=0
    for ii in xrange( sorted_obs.size() ):
      if ii%bin_size==0:
        count+=1
      ebin.append( count-1 )
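    # ebin now labels each reflection (in ascending order of the sorted
    # calculated data) with a bin index in [0, n_e_bins - 1].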

    # The bins have been set up; now reorder the contents of each bin.
    for ibin in xrange(self.n_e_bins):
      this_bin_selection = flex.bool( ebin == ibin )
      tmp_n = (this_bin_selection).count(True)
      permute = flex.sort_permutation( flex.random_double( tmp_n ) )

      #select and swap
      selected_deltas = deltas.select( this_bin_selection )
      selected_deltas = selected_deltas.select( permute )
      selected_sobs   = sorted_sobs.select( this_bin_selection )
      selected_sobs   = selected_sobs.select( permute )


      # Sanity check: make sure the selected deltas are not wildly off,
      # a safeguard to prevent the introduction of outliers.
      mean_delta = flex.mean( selected_deltas )
      std_delta  = math.sqrt( flex.mean( selected_deltas*selected_deltas ) - mean_delta*mean_delta )
      outliers = flex.bool( flex.abs(selected_deltas-mean_delta)>self.thres*std_delta )
      #print list( flex.abs(selected_deltas-mean_delta)/std_delta )
      #print list( outliers )

      if (outliers).count(True) > 0 :
        non_out_delta   = selected_deltas.select( ~outliers )
        tmp_permut      = flex.sort_permutation( flex.random_double( (~outliers).count(True)  ) )
        tmp_delta       = non_out_delta.select( tmp_permut )
        tmp_delta       = tmp_delta[0:(outliers).count(True)]
        selected_deltas = selected_deltas.set_selected( outliers.iselection(), tmp_delta )
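        # Outlying deltas have been replaced by values drawn at random
        # from the non-outlier deltas of the same bin.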


      # Put the permuted deltas back into the full array.
      deltas = deltas.set_selected(this_bin_selection, selected_deltas)
      sorted_sobs = sorted_sobs.set_selected(this_bin_selection, selected_sobs)

    # The deltas have been shuffled; apply them to the (log) mock data.
    new_data = flex.exp(log_calc + deltas)

    # Restore the original ordering.
    new_fobs = new_data.select(inverse_sort_order)
    new_sobs = sorted_sobs.select(inverse_sort_order)
    return new_fobs, new_sobs
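  ## The core move above in isolation (a sketch with hypothetical inputs,
  ## not part of the original class): permuting log residuals within a bin
  ## preserves that bin's error distribution while decoupling the errors
  ## from individual reflections.
  #   deltas  = flex.log(obs) - flex.log(calc)   # obs, calc: flex.double
  #   permute = flex.sort_permutation(flex.random_double(deltas.size()))
  #   new_obs = flex.exp(flex.log(mock) + deltas.select(permute))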
  def run(self, args, command_name, out=sys.stdout):
    command_line = (iotbx_option_parser(
      usage="%s [options]" % command_name,
      description='Example: %s data.mtz data.mtz ref_model.pdb'%command_name)
      .option(None, "--show_defaults",
        action="store_true",
        help="Show list of parameters.")
      ).process(args=args)

    cif_file = None
    processed_args = utils.process_command_line_args(
                       args          = args,
                       log           = sys.stdout,
                       master_params = master_phil)
    params = processed_args.params
    if(params is None): params = master_phil
    self.params = params.extract().ensemble_probability
    pdb_file_names = processed_args.pdb_file_names
    if len(pdb_file_names) != 1 :
      raise Sorry("Only one PDB structure may be used")
    pdb_file = file_reader.any_file(pdb_file_names[0])
    self.log = multi_out()
    self.log.register(label="stdout", file_object=sys.stdout)
    self.log.register(
      label="log_buffer",
      file_object=StringIO(),
      atexit_send_to=None)
    sys.stderr = self.log
    log_file = open(
      pdb_file_names[0].split('/')[-1].replace('.pdb','') + '_pensemble.log',
      "w")

    self.log.replace_stringio(
        old_label="log_buffer",
        new_label="log",
        new_file_object=log_file)
    utils.print_header(command_name, out = self.log)
    params.show(out = self.log)
    #
    f_obs = None
    r_free_flags = None
    reflection_files = processed_args.reflection_files

    if self.params.fobs_vs_fcalc_post_nll:
      if len(reflection_files) == 0:
        raise Sorry("Fobs from input MTZ required for fobs_vs_fcalc_post_nll")

    if len(reflection_files) > 0:
      crystal_symmetry = processed_args.crystal_symmetry
      print >> self.log, 'Reflection file : ', processed_args.reflection_file_names[0]
      utils.print_header("Model and data statistics", out = self.log)
      rfs = reflection_file_server(
        crystal_symmetry = crystal_symmetry,
        reflection_files = processed_args.reflection_files,
        log              = self.log)

      parameters = utils.data_and_flags_master_params().extract()
      determine_data_and_flags_result = utils.determine_data_and_flags(
        reflection_file_server  = rfs,
        parameters              = parameters,
        data_parameter_scope    = "refinement.input.xray_data",
        flags_parameter_scope   = "refinement.input.xray_data.r_free_flags",
        data_description        = "X-ray data",
        keep_going              = True,
        log                     = self.log)
      f_obs = determine_data_and_flags_result.f_obs
      number_of_reflections = f_obs.indices().size()
      r_free_flags = determine_data_and_flags_result.r_free_flags
      test_flag_value = determine_data_and_flags_result.test_flag_value
      if(r_free_flags is None):
        r_free_flags=f_obs.array(data=flex.bool(f_obs.data().size(), False))
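        # With no test set in the input, an all-False flag array is used,
        # i.e. every reflection is treated as part of the working set.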

    # process PDB
    pdb_file.assert_file_type("pdb")
    #
    pdb_in = hierarchy.input(file_name=pdb_file.file_name)
    ens_pdb_hierarchy = pdb_in.construct_hierarchy()
    ens_pdb_hierarchy.atoms().reset_i_seq()
    ens_pdb_xrs_s = pdb_in.input.xray_structures_simple()
    number_structures = len(ens_pdb_xrs_s)
    print >> self.log, 'Number of structures in ensemble : ', number_structures

    # Calculate sigmas from input map only
    if self.params.assign_sigma_from_map and self.params.ensemble_sigma_map_input is not None:
      # process MTZ
      input_file = file_reader.any_file(self.params.ensemble_sigma_map_input)
      if input_file.file_type == "hkl":
        if input_file.file_object.file_type() != "ccp4_mtz":
          raise Sorry("Only MTZ format accepted for map input")
        else:
          mtz_file = input_file
      else:
        raise Sorry("Only MTZ format accepted for map input")
      miller_arrays = mtz_file.file_server.miller_arrays
      map_coeffs_1 = miller_arrays[0]
      #
      xrs_list = []
      for n, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
        # get sigma levels from ensemble fc for each structure
        xrs = get_map_sigma(ens_pdb_hierarchy = ens_pdb_hierarchy,
                          ens_pdb_xrs       = ens_pdb_xrs,
                          map_coeffs_1      = map_coeffs_1,
                          residue_detail    = self.params.residue_detail,
                          ignore_hd         = self.params.ignore_hd,
                          log               = self.log)
        xrs_list.append(xrs)
      # write ensemble pdb file, occupancies as sigma level
      filename = (pdb_file_names[0].split('/')[-1].replace('.pdb','')
                  + '_vs_' + self.params.ensemble_sigma_map_input.replace('.mtz','')
                  + '_pensemble.pdb')
      write_ensemble_pdb(filename = filename,
                         xrs_list = xrs_list,
                         ens_pdb_hierarchy = ens_pdb_hierarchy
                         )

    # Do full analysis vs Fobs
    else:
      model_map_coeffs = []
      fmodel = None
      # Get <fcalc>
      for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
        ens_pdb_xrs.set_occupancies(1.0)
        if model == 0:
          # If an mtz was not supplied, Fobs is simulated from the xray
          # structure; otherwise the input Fobs is used for NLL scoring.
          if self.params.fobs_vs_fcalc_post_nll:
            dummy_fobs = f_obs
          else:
            if f_obs is None:
              if self.params.fcalc_high_resolution is None:
                raise Sorry("Please supply high resolution limit or input mtz file.")
              dummy_dmin = self.params.fcalc_high_resolution
              dummy_dmax = self.params.fcalc_low_resolution
            else:
              print >> self.log, 'Supplied mtz used to determine high and low resolution cutoffs'
              dummy_dmax, dummy_dmin = f_obs.d_max_min()
            #
            dummy_fobs = abs(ens_pdb_xrs.structure_factors(d_min = dummy_dmin).f_calc())
            dummy_fobs.set_observation_type_xray_amplitude()
            # If mtz supplied, free flags are overwritten to prevent array size error
            r_free_flags = dummy_fobs.array(data=flex.bool(dummy_fobs.data().size(),False))
          #
          fmodel = utils.fmodel_simple(
                     scattering_table         = "wk1995",
                     xray_structures          = [ens_pdb_xrs],
                     f_obs                    = dummy_fobs,
                     target_name              = 'ls',
                     bulk_solvent_and_scaling = False,
                     r_free_flags             = r_free_flags
                     )
          f_calc_ave = fmodel.f_calc().array(data = fmodel.f_calc().data()*0).deep_copy()
          # XXX Important to ensure scale is identical for each model and <model>
          fmodel.set_scale_switch = 1.0
          f_calc_ave_total = fmodel.f_calc().data().deep_copy()
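          # Running sum of each model's Fcalc; divided by the number of
          # structures below to form the ensemble average <Fcalc>.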
        else:
          fmodel.update_xray_structure(xray_structure  = ens_pdb_xrs,
                                       update_f_calc   = True,
                                       update_f_mask   = False)
          f_calc_ave_total += fmodel.f_calc().data().deep_copy()
        print >> self.log, 'Model :', model+1
        print >> self.log, "\nStructure vs real Fobs (no bulk solvent or scaling)"
        print >> self.log, 'Rwork          : %5.4f '%fmodel.r_work()
        print >> self.log, 'Rfree          : %5.4f '%fmodel.r_free()
        print >> self.log, 'K1             : %5.4f '%fmodel.scale_k1()
        fcalc_edm        = fmodel.electron_density_map()
        fcalc_map_coeffs = fcalc_edm.map_coefficients(map_type = 'Fc')
        fcalc_mtz_dataset = fcalc_map_coeffs.as_mtz_dataset(column_root_label ='Fc')
        if self.params.output_model_and_model_ave_mtz:
          fcalc_mtz_dataset.mtz_object().write(file_name = str(model+1)+"_Fc.mtz")
        model_map_coeffs.append(fcalc_map_coeffs.deep_copy())

      fmodel.update(f_calc = f_calc_ave.array(f_calc_ave_total / number_structures))
      print >> self.log, "\nEnsemble vs real Fobs (no bulk solvent or scaling)"
      print >> self.log, 'Rwork          : %5.4f '%fmodel.r_work()
      print >> self.log, 'Rfree          : %5.4f '%fmodel.r_free()
      print >> self.log, 'K1             : %5.4f '%fmodel.scale_k1()

      # Get <Fcalc> map
      fcalc_ave_edm        = fmodel.electron_density_map()
      fcalc_ave_map_coeffs = fcalc_ave_edm.map_coefficients(map_type = 'Fc').deep_copy()
      fcalc_ave_mtz_dataset = fcalc_ave_map_coeffs.as_mtz_dataset(column_root_label ='Fc')
      if self.params.output_model_and_model_ave_mtz:
        fcalc_ave_mtz_dataset.mtz_object().write(file_name = "aveFc.mtz")
      fcalc_ave_map_coeffs = fcalc_ave_map_coeffs.fft_map()
      fcalc_ave_map_coeffs.apply_volume_scaling()
      fcalc_ave_map_data   = fcalc_ave_map_coeffs.real_map_unpadded()
      fcalc_ave_map_stats  = maptbx.statistics(fcalc_ave_map_data)

      print >> self.log, "<Fcalc> Map Stats :"
      fcalc_ave_map_stats.show_summary(f = self.log)
      offset = fcalc_ave_map_stats.min()
      model_neg_ll = []

      number_previous_scatters = 0

      # Run through structure list again and get probability
      xrs_list = []
      for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
        if self.params.verbose:
          print >> self.log, '\n\nModel                   : ', model+1
        # Get model atom sigmas vs Fcalc
        fcalc_map = model_map_coeffs[model].fft_map()
        fcalc_map.apply_volume_scaling()
        fcalc_map_data  = fcalc_map.real_map_unpadded()
        fcalc_map_stats  = maptbx.statistics(fcalc_map_data)
        if self.params.verbose:
          print >> self.log, "Fcalc map stats         :"
        fcalc_map_stats.show_summary(f = self.log)

        xrs = get_map_sigma(ens_pdb_hierarchy = ens_pdb_hierarchy,
                            ens_pdb_xrs       = ens_pdb_xrs,
                            fft_map_1         = fcalc_map,
                            model_i           = model,
                            residue_detail    = self.params.residue_detail,
                            ignore_hd         = self.params.ignore_hd,
                            number_previous_scatters = number_previous_scatters,
                            log               = self.log)
        fcalc_sigmas = xrs.scatterers().extract_occupancies()
        del fcalc_map
        # Get model atom sigmas vs <Fcalc>
        xrs = get_map_sigma(ens_pdb_hierarchy = ens_pdb_hierarchy,
                            ens_pdb_xrs       = ens_pdb_xrs,
                            fft_map_1         = fcalc_ave_map_coeffs,
                            model_i           = model,
                            residue_detail    = self.params.residue_detail,
                            ignore_hd         = self.params.ignore_hd,
                            number_previous_scatters = number_previous_scatters,
                            log               = self.log)

        ### For testing other residue averaging options
        #print xrs.residue_selections

        fcalc_ave_sigmas = xrs.scatterers().extract_occupancies()
        # Probability of model given <model>
        prob = fcalc_ave_sigmas / fcalc_sigmas
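        # Ratio of an atom's density in the ensemble-average map to its
        # density in its own model's map; used as a per-atom probability
        # of the model given <model>.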
        # XXX debug option
        if False:
          for n,p in enumerate(prob):
            print >> self.log, ' {0:5d} {1:5.3f}'.format(n,p)
        # Clamp probability to the interval (0, 1]
        # XXX Make histogram / more stats
        prob_lss_zero = flex.bool(prob <= 0)
        prob_grt_one = flex.bool(prob > 1)
        prob.set_selected(prob_lss_zero, 0.001)
        prob.set_selected(prob_grt_one, 1.0)
        xrs.set_occupancies(prob)
        xrs_list.append(xrs)
        sum_neg_ll = sum(-flex.log(prob))
        model_neg_ll.append((sum_neg_ll, model))
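        # Per-model score: negative log-likelihood summed over atoms;
        # lower values indicate better agreement with the ensemble average.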
        if self.params.verbose:
          print >> self.log, 'Model probability stats :'
          print >> self.log, prob.min_max_mean().show()
          print >> self.log, '  Count < 0.0 : ', prob_lss_zero.count(True)
          print >> self.log, '  Count > 1.0 : ', prob_grt_one.count(True)

        # For averaging by residue
        number_previous_scatters += ens_pdb_xrs.sites_cart().size()

      # write ensemble pdb file, occupancies as sigma level
      write_ensemble_pdb(
        filename = pdb_file_names[0].split('/')[-1].replace('.pdb','') + '_pensemble.pdb',
        xrs_list = xrs_list,
        ens_pdb_hierarchy = ens_pdb_hierarchy)

      # XXX Test ordering models by nll
      # XXX Test removing nth percentile atoms
      if self.params.sort_ensemble_by_nll_score or self.params.fobs_vs_fcalc_post_nll:
        for percentile in [1.0,0.975,0.95,0.9,0.8,0.6,0.2]:
          model_neg_ll = sorted(model_neg_ll)
          f_calc_ave_total_reordered = None
          print_list = []
          for i_neg_ll in model_neg_ll:
            xrs = xrs_list[i_neg_ll[1]]
            nll_occ = xrs.scatterers().extract_occupancies()

            # Set q=0 nth percentile atoms
            sorted_nll_occ = sorted(nll_occ, reverse=True)
            number_atoms = len(sorted_nll_occ)
            percentile_prob_cutoff = sorted_nll_occ[int(number_atoms * percentile)-1]
            cutoff_selections = flex.bool(nll_occ < percentile_prob_cutoff)
            cutoff_nll_occ = flex.double(nll_occ.size(), 1.0).set_selected(cutoff_selections, 0.0)
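            # Atoms whose probability falls below this model's percentile
            # cutoff get occupancy 0, removing them from the Fcalc sum.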
            #XXX Debug
            if False:
              print '\nDebug'
              for x in xrange(len(cutoff_selections)):
                print cutoff_selections[x], nll_occ[x], cutoff_nll_occ[x]
              print percentile
              print percentile_prob_cutoff
              print cutoff_selections.count(True)
              print cutoff_selections.size()
              print cutoff_nll_occ.count(0.0)
              print 'Count q = 1           : ', cutoff_nll_occ.count(1.0)
              print 'Count scatterers size : ', cutoff_nll_occ.size()

            xrs.set_occupancies(cutoff_nll_occ)
            fmodel.update_xray_structure(xray_structure  = xrs,
                                         update_f_calc   = True,
                                         update_f_mask   = True)

            if f_calc_ave_total_reordered is None:
              f_calc_ave_total_reordered = fmodel.f_calc().data().deep_copy()
              f_mask_ave_total_reordered = fmodel.f_masks()[0].data().deep_copy()
              cntr = 1
            else:
              f_calc_ave_total_reordered += fmodel.f_calc().data().deep_copy()
              f_mask_ave_total_reordered += fmodel.f_masks()[0].data().deep_copy()
              cntr+=1
            fmodel.update(f_calc = f_calc_ave.array(f_calc_ave_total_reordered / cntr).deep_copy(),
                          f_mask = f_calc_ave.array(f_mask_ave_total_reordered / cntr).deep_copy()
                          )
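            # fmodel now carries the running average of Fcalc and Fmask
            # over the cntr best-scoring models processed so far.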

            # Update solvent and scale
            # XXX Will need to apply_back_trace on latest version
            fmodel.set_scale_switch = 0
            fmodel.update_all_scales()

            # Reset occupancies for output
            xrs.set_occupancies(nll_occ)
            # k1 updated vs Fobs
            if self.params.fobs_vs_fcalc_post_nll:
              print_list.append([cntr, i_neg_ll[0], i_neg_ll[1], fmodel.r_work(), fmodel.r_free()])

          # Order models by nll and print summary
          print >> self.log, '\nModels ranked by NLL; <Fcalc> R-factors recalculated'
          print >> self.log, 'Percentile cutoff : {0:5.3f}'.format(percentile)
          xrs_list_sorted_nll = []
          print >> self.log, '      |      NLL     <Rw>     <Rf>    Ens Model'
          for info in print_list:
            print >> self.log, ' {0:4d} | {1:8.1f} {2:8.4f} {3:8.4f} {4:12d}'.format(
              info[0],
              info[1],
              info[3],
              info[4],
              info[2]+1,
              )
            xrs_list_sorted_nll.append(xrs_list[info[2]])

        # Output nll ordered ensemble

        write_ensemble_pdb(
          filename = ('nll_ordered_' + pdb_file_names[0].split('/')[-1].replace('.pdb','')
                      + '_pensemble.pdb'),
          xrs_list = xrs_list_sorted_nll,
          ens_pdb_hierarchy = ens_pdb_hierarchy)