def log_p_obs_given_gamma(self, gamma):
    dof = self.degrees_of_freedom
    x_gamma = (gamma * self.delta_fc2.data() - self.delta_fo2.data()) \
        / self.delta_fo2.sigmas()
    if self.probability_plot_slope is not None:
        x_gamma /= self.probability_plot_slope
    return -(1 + dof) / 2 * flex.sum(flex.log(flex.pow2(x_gamma) + dof))
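# A minimal numpy sketch (illustrative, not part of the original class) of
# the quantity returned above: up to an additive constant it is the log of a
# Student-t likelihood with `dof` degrees of freedom evaluated at the
# sigma-normalised residuals x.  The function name and arguments here are
# assumptions for the example only.
import numpy as np

def t_log_likelihood(x, dof):
    # sum over reflections of log p(x_i), dropping the x-independent
    # normalisation constant of the t-distribution
    return -(1.0 + dof) / 2.0 * np.sum(np.log(x * x + dof))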
def _calc_residuals(va, vb, pa, pb, sa, sb):
    mtch_indcs = va.miller_array.match_indices(
        vb.miller_array, assert_is_similar_symmetry=False)
    va_selection = mtch_indcs.pair_selection(0)
    vb_selection = mtch_indcs.pair_selection(1)
    sp_a = pa.select(va_selection) * sa.select(va_selection)
    sp_b = pb.select(vb_selection) * sb.select(vb_selection)
    ia_over_ib = va.miller_array.data().select(va_selection) / \
        vb.miller_array.data().select(vb_selection)
    residuals = flex.log(sp_a) - flex.log(sp_b) - flex.log(ia_over_ib)
    residuals = residuals.as_numpy_array()
    #logging.debug("Mean Residual: {}".format(np.mean(residuals)))
    return residuals[~np.isnan(residuals)]
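# Stand-alone numpy sketch of the residual computed above: for reflections
# common to two datasets, compare the log of the scale*partiality products
# against the log of the intensity ratio.  Array names are assumptions for
# illustration; NaNs (e.g. from non-positive intensities) are dropped as in
# the original.
import numpy as np

def log_scale_residuals(scale_a, part_a, scale_b, part_b, i_a, i_b):
    res = (np.log(scale_a * part_a) - np.log(scale_b * part_b)
           - np.log(i_a / i_b))
    return res[~np.isnan(res)]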
def sigmaa_model_error(self):
    x = 0.25 * flex.pow(self.h_array, 2.0 / 3.0)  # h was in d*^-3 !!!
    y = flex.log(self.sigmaa_fitted)
    # compute the slope please
    fit = flex.linear_regression(x, y)
    result = -(fit.slope() / math.pi * 3)
    if result < 0:
        result = None
    else:
        result = math.sqrt(result)
    return result
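# Hedged numpy illustration of the fit performed above: regress
# y = log(sigmaa) against x = 0.25 * h**(2/3) (i.e. 0.25 * d*^2, since h is
# in units of d*^3) and convert the slope into a model-error estimate.  The
# pi scaling simply mirrors the code above; names are illustrative only.
import numpy as np

def model_error_from_sigmaa(h_array, sigmaa_fitted):
    x = 0.25 * np.power(h_array, 2.0 / 3.0)
    y = np.log(sigmaa_fitted)
    slope = np.polyfit(x, y, 1)[0]      # leading coefficient = slope
    value = -(slope / np.pi * 3)
    return np.sqrt(value) if value >= 0 else None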
def log_fit(x, y, degree=5):
    """Fit log(y(x)) with a polynomial, then return exp() of the fit.

    x, y should be iterables containing floats of the same size.  degree
    is the degree of the polynomial to use for this fit.  This will be
    useful for e.g. I/sigma."""
    fit = curve_fitting.univariate_polynomial_fit(
        x, flex.log(y), degree=degree, max_iterations=100)
    f = curve_fitting.univariate_polynomial(*fit.params)
    return flex.exp(f(x))
def log_inv_fit(x, y, degree=5):
    """Fit log(1 / y(x)) with a polynomial, then return the inverse of
    exp() of the fit.

    x, y should be iterables containing floats of the same size.  degree
    is the degree of the polynomial used for the transformed fit.  This
    will be useful for e.g. Rmerge."""
    fit = curve_fitting.univariate_polynomial_fit(
        x, flex.log(1 / y), degree=degree, max_iterations=100)
    f = curve_fitting.univariate_polynomial(*fit.params)
    return 1 / flex.exp(f(x))
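# A self-contained numpy equivalent (a sketch, not the curve_fitting API
# used above) of the log-space fitting idea behind log_fit and log_inv_fit:
# fitting a polynomial to log(y) keeps the back-transformed curve strictly
# positive.
import numpy as np

def log_fit_np(x, y, degree=5):
    coeffs = np.polyfit(x, np.log(y), degree)   # fit in log space
    return np.exp(np.polyval(coeffs, x))        # back-transform

def log_inv_fit_np(x, y, degree=5):
    coeffs = np.polyfit(x, np.log(1.0 / y), degree)
    return 1.0 / np.exp(np.polyval(coeffs, x))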
def __init__(self, f_obs, asu_contents, e_statistics=False):
    assert f_obs.is_real_array()
    self.info = f_obs.info()
    f_obs_selected = f_obs.select(f_obs.data() > 0)
    f_obs_selected.use_binning_of(f_obs)
    # compute <fobs^2> in resolution shells
    self.mean_fobs_sq = f_obs_selected.mean_sq(
        use_binning=True, use_multiplicities=True).data[1:-1]
    n_none = self.mean_fobs_sq.count(None)
    if n_none > 0:
        error_message = "wilson_plot error: %d empty bin%s:" % plural_s(n_none)
        if self.info is not None:
            error_message += "\n  Info: " + str(self.info)
        error_message += "\n  Number of bins: %d" % len(self.mean_fobs_sq)
        error_message += "\n  Number of f_obs > 0: %d" % (
            f_obs_selected.indices().size())
        error_message += "\n  Number of f_obs <= 0: %d" % (
            f_obs.indices().size() - f_obs_selected.indices().size())
        raise RuntimeError(error_message)
    self.mean_fobs_sq = flex.double(self.mean_fobs_sq)
    # compute <s^2> = <(sin(theta)/lambda)^2> in resolution shells
    stol_sq = f_obs_selected.sin_theta_over_lambda_sq()
    stol_sq.use_binner_of(f_obs_selected)
    self.mean_stol_sq = flex.double(
        stol_sq.mean(use_binning=True, use_multiplicities=True).data[1:-1])
    # cache scattering factor info
    gaussians = {}
    for chemical_type in asu_contents.keys():
        gaussians[chemical_type] = eltbx.xray_scattering.wk1995(
            chemical_type).fetch()
    # compute expected f_calc^2 in resolution shells
    self.expected_f_sq = flex.double()
    for stol_sq in self.mean_stol_sq:
        sum_fj_sq = 0
        for chemical_type, n_atoms in asu_contents.items():
            f0 = gaussians[chemical_type].at_stol_sq(stol_sq)
            sum_fj_sq += f0 * f0 * n_atoms
        self.expected_f_sq.append(sum_fj_sq)
    self.expected_f_sq *= f_obs_selected.space_group().order_z() \
        * f_obs_selected.space_group().n_ltr()
    # fit to straight line
    self.x = self.mean_stol_sq
    self.y = flex.log(self.mean_fobs_sq / self.expected_f_sq)
    fit = flex.linear_regression(self.x, self.y)
    assert fit.is_well_defined()
    self.fit_y_intercept = fit.y_intercept()
    self.fit_slope = fit.slope()
    # intensity scale factor
    self.wilson_intensity_scale_factor = math.exp(self.fit_y_intercept)
    # conversion to amplitude scale factor
    self.wilson_k = math.sqrt(self.wilson_intensity_scale_factor)
    self.wilson_b = -self.fit_slope / 2
    self.fit_correlation = flex.linear_correlation(self.x, self.y).coefficient()
    if e_statistics:
        normalised = f_obs_selected.normalised_amplitudes(asu_contents, self)
        self.normalised_f_obs = normalised.array()
        self.mean_e_sq_minus_1 = normalised.mean_e_sq_minus_1()
        self.percent_e_sq_gt_2 = normalised.percent_e_sq_gt_2()
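# Minimal numpy sketch of the straight-line Wilson fit done above:
# ln(<Fobs^2> / sum fj^2) = ln(k^2) - 2*B*(sin(theta)/lambda)^2, so
# slope = -2B and intercept = ln(k^2).  Inputs are per-shell averages;
# names are illustrative assumptions, not the wilson_plot API.
import math
import numpy as np

def wilson_fit(mean_stol_sq, mean_fobs_sq, expected_f_sq):
    y = np.log(mean_fobs_sq / expected_f_sq)
    slope, intercept = np.polyfit(mean_stol_sq, y, 1)
    wilson_b = -slope / 2.0                    # Wilson B factor
    wilson_k = math.sqrt(math.exp(intercept))  # amplitude scale factor
    return wilson_b, wilson_k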
def do_something_clever(self, obs, sobs, calc, mock):
    # first get the sort order
    # sort on the calculated data please
    sort_order = flex.sort_permutation(calc)
    inverse_sort_order = sort_order.inverse_permutation()
    sorted_obs = obs.select(sort_order)
    sorted_sobs = sobs.select(sort_order)
    sorted_calc = calc.select(sort_order)
    sorted_mock = mock.select(sort_order)
    log_calc = flex.log(sorted_mock)
    deltas = flex.log(sorted_obs) - flex.log(sorted_calc)
    old_deltas = deltas.deep_copy()
    # make bins on the basis of the order
    bin_size = float(sorted_obs.size()) / self.n_e_bins
    bin_size = int(bin_size) + 1
    ebin = flex.int()
    count = 0
    for ii in range(sorted_obs.size()):
        if ii % bin_size == 0:
            count += 1
        ebin.append(count - 1)
    # the bins have been set up, now we can reorder stuff
    for ibin in range(self.n_e_bins):
        this_bin_selection = flex.bool(ebin == ibin)
        tmp_n = this_bin_selection.count(True)
        permute = flex.sort_permutation(flex.random_double(tmp_n))
        # select and swap
        selected_deltas = deltas.select(this_bin_selection)
        selected_deltas = selected_deltas.select(permute)
        selected_sobs = sorted_sobs.select(this_bin_selection)
        selected_sobs = selected_sobs.select(permute)
        # we have to make a sanity check so that the selected deltas are
        # not very weird; a safeguard to prevent the introduction of outliers
        mean_delta = flex.mean(selected_deltas)
        std_delta = math.sqrt(
            flex.mean(selected_deltas * selected_deltas)
            - mean_delta * mean_delta)
        outliers = flex.bool(
            flex.abs(selected_deltas - mean_delta) > self.thres * std_delta)
        #print list( flex.abs(selected_deltas-mean_delta)/std_delta )
        #print list( outliers )
        if outliers.count(True) > 0:
            non_out_delta = selected_deltas.select(~outliers)
            tmp_permut = flex.sort_permutation(
                flex.random_double((~outliers).count(True)))
            tmp_delta = non_out_delta.select(tmp_permut)
            tmp_delta = tmp_delta[0:outliers.count(True)]
            selected_deltas = selected_deltas.set_selected(
                outliers.iselection(), tmp_delta)
        # set the deltas back please
        deltas = deltas.set_selected(this_bin_selection, selected_deltas)
        sorted_sobs = sorted_sobs.set_selected(this_bin_selection, selected_sobs)
    # the deltas have been swapped, apply things back please
    log_calc = log_calc + deltas
    log_calc = flex.exp(log_calc)
    # now we have to get things back in proper order again thank you
    new_fobs = log_calc.select(inverse_sort_order)
    new_sobs = sorted_sobs.select(inverse_sort_order)
    return new_fobs, new_sobs
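# Hedged numpy sketch of the bin-wise shuffling above: permute log-intensity
# residuals within bins of consecutive (sorted) reflections, and replace
# outliers (beyond thres * sigma) with randomly chosen inliers before the
# shuffled residuals are applied to the mock data.  Simplified; the RNG,
# replacement-with-replacement, and argument names are assumptions.
import numpy as np

def shuffle_deltas_in_bins(deltas, n_bins, thres=3.0, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    out = deltas.copy()
    bin_size = len(deltas) // n_bins + 1
    for start in range(0, len(deltas), bin_size):
        sel = rng.permutation(out[start:start + bin_size])
        mean, std = sel.mean(), sel.std()
        outliers = np.abs(sel - mean) > thres * std
        if outliers.any() and (~outliers).any():
            sel[outliers] = rng.choice(sel[~outliers], outliers.sum())
        out[start:start + bin_size] = sel
    return out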
def run(self, args, command_name, out=sys.stdout):
    command_line = (iotbx_option_parser(
        usage="%s [options]" % command_name,
        description='Example: %s data.mtz data.mtz ref_model.pdb' % command_name)
        .option(None, "--show_defaults",
                action="store_true",
                help="Show list of parameters.")
        ).process(args=args)
    cif_file = None
    processed_args = utils.process_command_line_args(
        args=args, log=sys.stdout, master_params=master_phil)
    params = processed_args.params
    if params is None:
        params = master_phil
    self.params = params.extract().ensemble_probability
    pdb_file_names = processed_args.pdb_file_names
    if len(pdb_file_names) != 1:
        raise Sorry("Only one PDB structure may be used")
    pdb_file = file_reader.any_file(pdb_file_names[0])
    self.log = multi_out()
    self.log.register(label="stdout", file_object=sys.stdout)
    self.log.register(
        label="log_buffer",
        file_object=StringIO(),
        atexit_send_to=None)
    sys.stderr = self.log
    log_file = open(
        pdb_file_names[0].split('/')[-1].replace('.pdb', '') + '_pensemble.log',
        "w")
    self.log.replace_stringio(
        old_label="log_buffer",
        new_label="log",
        new_file_object=log_file)
    utils.print_header(command_name, out=self.log)
    params.show(out=self.log)
    #
    f_obs = None
    r_free_flags = None
    reflection_files = processed_args.reflection_files
    if self.params.fobs_vs_fcalc_post_nll:
        if len(reflection_files) == 0:
            raise Sorry("Fobs from input MTZ required for fobs_vs_fcalc_post_nll")
    if len(reflection_files) > 0:
        crystal_symmetry = processed_args.crystal_symmetry
        print('Reflection file : ',
              processed_args.reflection_file_names[0], file=self.log)
        utils.print_header("Model and data statistics", out=self.log)
        rfs = reflection_file_server(
            crystal_symmetry=crystal_symmetry,
            reflection_files=processed_args.reflection_files,
            log=self.log)
        parameters = extract_xtal_data.data_and_flags_master_params().extract()
        determine_data_and_flags_result = extract_xtal_data.run(
            reflection_file_server=rfs,
            parameters=parameters,
            data_parameter_scope="refinement.input.xray_data",
            flags_parameter_scope="refinement.input.xray_data.r_free_flags",
            data_description="X-ray data",
            keep_going=True,
            log=self.log)
        f_obs = determine_data_and_flags_result.f_obs
        number_of_reflections = f_obs.indices().size()
        r_free_flags = determine_data_and_flags_result.r_free_flags
        test_flag_value = determine_data_and_flags_result.test_flag_value
        if r_free_flags is None:
            r_free_flags = f_obs.array(
                data=flex.bool(f_obs.data().size(), False))

    # process PDB
    pdb_file.assert_file_type("pdb")
    pdb_in = hierarchy.input(file_name=pdb_file.file_name)
    ens_pdb_hierarchy = pdb_in.construct_hierarchy()
    ens_pdb_hierarchy.atoms().reset_i_seq()
    ens_pdb_xrs_s = pdb_in.input.xray_structures_simple()
    number_structures = len(ens_pdb_xrs_s)
    print('Number of structures in ensemble : ', number_structures,
          file=self.log)

    # Calculate sigmas from input map only
    if (self.params.assign_sigma_from_map
            and self.params.ensemble_sigma_map_input is not None):
        # process MTZ
        input_file = file_reader.any_file(self.params.ensemble_sigma_map_input)
        if input_file.file_type == "hkl":
            if input_file.file_object.file_type() != "ccp4_mtz":
                raise Sorry("Only MTZ format accepted for map input")
            else:
                mtz_file = input_file
        else:
            raise Sorry("Only MTZ format accepted for map input")
        miller_arrays = mtz_file.file_server.miller_arrays
        map_coeffs_1 = miller_arrays[0]
        xrs_list = []
        for n, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
            # get sigma levels from ensemble fc for each structure
            xrs = get_map_sigma(
                ens_pdb_hierarchy=ens_pdb_hierarchy,
                ens_pdb_xrs=ens_pdb_xrs,
                map_coeffs_1=map_coeffs_1,
                residue_detail=self.params.residue_detail,
                ignore_hd=self.params.ignore_hd,
                log=self.log)
            xrs_list.append(xrs)
        # write ensemble pdb file, occupancies as sigma level
        filename = (pdb_file_names[0].split('/')[-1].replace('.pdb', '')
                    + '_vs_'
                    + self.params.ensemble_sigma_map_input.replace('.mtz', '')
                    + '_pensemble.pdb')
        write_ensemble_pdb(
            filename=filename,
            xrs_list=xrs_list,
            ens_pdb_hierarchy=ens_pdb_hierarchy)

    # Do full analysis vs Fobs
    else:
        model_map_coeffs = []
        fmodel = None
        # Get <fcalc>
        for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
            ens_pdb_xrs.set_occupancies(1.0)
            if model == 0:
                # If mtz not supplied get fobs from xray structure...
                # Use input Fobs for scoring against nll
                if self.params.fobs_vs_fcalc_post_nll:
                    dummy_fobs = f_obs
                else:
                    if f_obs is None:
                        if self.params.fcalc_high_resolution is None:
                            raise Sorry("Please supply high resolution limit"
                                        " or input mtz file.")
                        dummy_dmin = self.params.fcalc_high_resolution
                        dummy_dmax = self.params.fcalc_low_resolution
                    else:
                        print('Supplied mtz used to determine high and low'
                              ' resolution cutoffs', file=self.log)
                        dummy_dmax, dummy_dmin = f_obs.d_max_min()
                    dummy_fobs = abs(
                        ens_pdb_xrs.structure_factors(d_min=dummy_dmin).f_calc())
                    dummy_fobs.set_observation_type_xray_amplitude()
                    # If mtz supplied, free flags are overwritten to prevent
                    # an array size error
                    r_free_flags = dummy_fobs.array(
                        data=flex.bool(dummy_fobs.data().size(), False))
                fmodel = utils.fmodel_simple(
                    scattering_table="wk1995",
                    xray_structures=[ens_pdb_xrs],
                    f_obs=dummy_fobs,
                    target_name='ls',
                    bulk_solvent_and_scaling=False,
                    r_free_flags=r_free_flags)
                f_calc_ave = fmodel.f_calc().array(
                    data=fmodel.f_calc().data() * 0).deep_copy()
                # XXX Important to ensure scale is identical for each model
                # and <model>
                fmodel.set_scale_switch = 1.0
                f_calc_ave_total = fmodel.f_calc().data().deep_copy()
            else:
                fmodel.update_xray_structure(
                    xray_structure=ens_pdb_xrs,
                    update_f_calc=True,
                    update_f_mask=False)
                f_calc_ave_total += fmodel.f_calc().data().deep_copy()
            print('Model :', model + 1, file=self.log)
            print("\nStructure vs real Fobs (no bulk solvent or scaling)",
                  file=self.log)
            print('Rwork : %5.4f ' % fmodel.r_work(), file=self.log)
            print('Rfree : %5.4f ' % fmodel.r_free(), file=self.log)
            print('K1 : %5.4f ' % fmodel.scale_k1(), file=self.log)
            fcalc_edm = fmodel.electron_density_map()
            fcalc_map_coeffs = fcalc_edm.map_coefficients(map_type='Fc')
            fcalc_mtz_dataset = fcalc_map_coeffs.as_mtz_dataset(
                column_root_label='Fc')
            if self.params.output_model_and_model_ave_mtz:
                fcalc_mtz_dataset.mtz_object().write(
                    file_name=str(model + 1) + "_Fc.mtz")
            model_map_coeffs.append(fcalc_map_coeffs.deep_copy())

        fmodel.update(f_calc=f_calc_ave.array(f_calc_ave_total / number_structures))
        print("\nEnsemble vs real Fobs (no bulk solvent or scaling)",
              file=self.log)
        print('Rwork : %5.4f ' % fmodel.r_work(), file=self.log)
        print('Rfree : %5.4f ' % fmodel.r_free(), file=self.log)
        print('K1 : %5.4f ' % fmodel.scale_k1(), file=self.log)

        # Get <Fcalc> map
        fcalc_ave_edm = fmodel.electron_density_map()
        fcalc_ave_map_coeffs = fcalc_ave_edm.map_coefficients(
            map_type='Fc').deep_copy()
        fcalc_ave_mtz_dataset = fcalc_ave_map_coeffs.as_mtz_dataset(
            column_root_label='Fc')
        if self.params.output_model_and_model_ave_mtz:
            fcalc_ave_mtz_dataset.mtz_object().write(file_name="aveFc.mtz")
        fcalc_ave_map_coeffs = fcalc_ave_map_coeffs.fft_map()
        fcalc_ave_map_coeffs.apply_volume_scaling()
        fcalc_ave_map_data = fcalc_ave_map_coeffs.real_map_unpadded()
        fcalc_ave_map_stats = maptbx.statistics(fcalc_ave_map_data)
        print("<Fcalc> Map Stats :", file=self.log)
        fcalc_ave_map_stats.show_summary(f=self.log)
        offset = fcalc_ave_map_stats.min()
        model_neg_ll = []
        number_previous_scatters = 0

        # Run through structure list again and get probability
        xrs_list = []
        for model, ens_pdb_xrs in enumerate(ens_pdb_xrs_s):
            if self.params.verbose:
                print('\n\nModel : ', model + 1, file=self.log)
            # Get model atom sigmas vs Fcalc
            fcalc_map = model_map_coeffs[model].fft_map()
            fcalc_map.apply_volume_scaling()
            fcalc_map_data = fcalc_map.real_map_unpadded()
            fcalc_map_stats = maptbx.statistics(fcalc_map_data)
            if self.params.verbose:
                print("Fcalc map stats :", file=self.log)
                fcalc_map_stats.show_summary(f=self.log)
            xrs = get_map_sigma(
                ens_pdb_hierarchy=ens_pdb_hierarchy,
                ens_pdb_xrs=ens_pdb_xrs,
                fft_map_1=fcalc_map,
                model_i=model,
                residue_detail=self.params.residue_detail,
                ignore_hd=self.params.ignore_hd,
                number_previous_scatters=number_previous_scatters,
                log=self.log)
            fcalc_sigmas = xrs.scatterers().extract_occupancies()
            del fcalc_map
            # Get model atom sigmas vs <Fcalc>
            xrs = get_map_sigma(
                ens_pdb_hierarchy=ens_pdb_hierarchy,
                ens_pdb_xrs=ens_pdb_xrs,
                fft_map_1=fcalc_ave_map_coeffs,
                model_i=model,
                residue_detail=self.params.residue_detail,
                ignore_hd=self.params.ignore_hd,
                number_previous_scatters=number_previous_scatters,
                log=self.log)
            ### For testing other residue averaging options
            #print xrs.residue_selections
            fcalc_ave_sigmas = xrs.scatterers().extract_occupancies()
            # Probability of model given <model>
            prob = fcalc_ave_sigmas / fcalc_sigmas
            # XXX debug option
            if False:
                for n, p in enumerate(prob):
                    print(' {0:5d} {1:5.3f}'.format(n, p), file=self.log)
            # Set probability between 0 and 1
            # XXX Make Histogram / more stats
            prob_lss_zero = flex.bool(prob <= 0)
            prob_grt_one = flex.bool(prob > 1)
            prob.set_selected(prob_lss_zero, 0.001)
            prob.set_selected(prob_grt_one, 1.0)
            xrs.set_occupancies(prob)
            xrs_list.append(xrs)
            sum_neg_ll = sum(-flex.log(prob))
            model_neg_ll.append((sum_neg_ll, model))
            if self.params.verbose:
                print('Model probability stats :', file=self.log)
                print(prob.min_max_mean().show(), file=self.log)
                print('  Count < 0.0 : ', prob_lss_zero.count(True),
                      file=self.log)
                print('  Count > 1.0 : ', prob_grt_one.count(True),
                      file=self.log)
            # For averaging by residue
            number_previous_scatters += ens_pdb_xrs.sites_cart().size()

        # write ensemble pdb file, occupancies as sigma level
        write_ensemble_pdb(
            filename=pdb_file_names[0].split('/')[-1].replace('.pdb', '')
            + '_pensemble.pdb',
            xrs_list=xrs_list,
            ens_pdb_hierarchy=ens_pdb_hierarchy)

        # XXX Test ordering models by nll
        # XXX Test removing nth percentile atoms
        if (self.params.sort_ensemble_by_nll_score
                or self.params.fobs_vs_fcalc_post_nll):
            for percentile in [1.0, 0.975, 0.95, 0.9, 0.8, 0.6, 0.2]:
                model_neg_ll = sorted(model_neg_ll)
                f_calc_ave_total_reordered = None
                print_list = []
                for i_neg_ll in model_neg_ll:
                    xrs = xrs_list[i_neg_ll[1]]
                    nll_occ = xrs.scatterers().extract_occupancies()
                    # Set q=0 nth percentile atoms
                    sorted_nll_occ = sorted(nll_occ, reverse=True)
                    number_atoms = len(sorted_nll_occ)
                    percentile_prob_cutoff = sorted_nll_occ[
                        int(number_atoms * percentile) - 1]
                    cutoff_selections = flex.bool(
                        nll_occ < percentile_prob_cutoff)
                    cutoff_nll_occ = flex.double(
                        nll_occ.size(), 1.0).set_selected(cutoff_selections, 0.0)
                    # XXX Debug
                    if False:
                        print('\nDebug')
                        for x in range(len(cutoff_selections)):
                            print(cutoff_selections[x], nll_occ[x],
                                  cutoff_nll_occ[x])
                        print(percentile)
                        print(percentile_prob_cutoff)
                        print(cutoff_selections.count(True))
                        print(cutoff_selections.size())
                        print(cutoff_nll_occ.count(0.0))
                        print('Count q = 1 : ', cutoff_nll_occ.count(1.0))
                        print('Count scatterers size : ', cutoff_nll_occ.size())
                    xrs.set_occupancies(cutoff_nll_occ)
                    fmodel.update_xray_structure(
                        xray_structure=xrs,
                        update_f_calc=True,
                        update_f_mask=True)
                    if f_calc_ave_total_reordered is None:
                        f_calc_ave_total_reordered = \
                            fmodel.f_calc().data().deep_copy()
                        f_mask_ave_total_reordered = \
                            fmodel.f_masks()[0].data().deep_copy()
                        cntr = 1
                    else:
                        f_calc_ave_total_reordered += \
                            fmodel.f_calc().data().deep_copy()
                        f_mask_ave_total_reordered += \
                            fmodel.f_masks()[0].data().deep_copy()
                        cntr += 1
                    fmodel.update(
                        f_calc=f_calc_ave.array(
                            f_calc_ave_total_reordered / cntr).deep_copy(),
                        f_mask=f_calc_ave.array(
                            f_mask_ave_total_reordered / cntr).deep_copy())
                    # Update solvent and scale
                    # XXX Will need to apply_back_trace on latest version
                    fmodel.set_scale_switch = 0
                    fmodel.update_all_scales()
                    # Reset occ for output
                    xrs.set_occupancies(nll_occ)
                    # k1 updated vs Fobs
                    if self.params.fobs_vs_fcalc_post_nll:
                        print_list.append([
                            cntr, i_neg_ll[0], i_neg_ll[1],
                            fmodel.r_work(), fmodel.r_free()])
                # Order models by nll and print summary
                print('\nModels ranked by nll <Fcalc> R-factors recalculated',
                      file=self.log)
                print('Percentile cutoff : {0:5.3f}'.format(percentile),
                      file=self.log)
                xrs_list_sorted_nll = []
                print(' | NLL <Rw> <Rf> Ens Model', file=self.log)
                for info in print_list:
                    print(' {0:4d} | {1:8.1f} {2:8.4f} {3:8.4f} {4:12d}'.format(
                        info[0], info[1], info[3], info[4], info[2] + 1),
                        file=self.log)
                    xrs_list_sorted_nll.append(xrs_list[info[2]])
                # Output nll ordered ensemble
                write_ensemble_pdb(
                    filename='nll_ordered_'
                    + pdb_file_names[0].split('/')[-1].replace('.pdb', '')
                    + '_pensemble.pdb',
                    xrs_list=xrs_list_sorted_nll,
                    ens_pdb_hierarchy=ens_pdb_hierarchy)
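# Illustrative stand-alone sketch of the model ranking used above: clamp
# per-atom probabilities to (0, 1], score each model by its summed negative
# log-likelihood, and sort ascending (lower NLL = better fit to <model>).
# Plain numpy, not the mmtbx driver API; names are assumptions.
import numpy as np

def rank_models_by_nll(prob_per_model):
    scores = []
    for i, prob in enumerate(prob_per_model):
        p = np.clip(prob, 0.001, 1.0)    # same clamping as above
        scores.append((np.sum(-np.log(p)), i))
    return sorted(scores)                # list of (nll, model_index)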
def __init__(self,
             miller_obs,
             miller_calc,
             r_free_flags,
             kernel_width_free_reflections=None,
             kernel_width_d_star_cubed=None,
             kernel_in_bin_centers=False,
             kernel_on_chebyshev_nodes=True,
             n_sampling_points=20,
             n_chebyshev_terms=10,
             use_sampling_sum_weights=False,
             make_checks_and_clean_up=True):
    assert [kernel_width_free_reflections,
            kernel_width_d_star_cubed].count(None) == 1
    self.miller_obs = miller_obs
    self.miller_calc = abs(miller_calc)
    self.r_free_flags = r_free_flags
    self.kernel_width_free_reflections = kernel_width_free_reflections
    self.kernel_width_d_star_cubed = kernel_width_d_star_cubed
    self.n_chebyshev_terms = n_chebyshev_terms
    if make_checks_and_clean_up:
        self.miller_obs = self.miller_obs.map_to_asu()
        self.miller_calc = self.miller_calc.map_to_asu()
        self.r_free_flags = self.r_free_flags.map_to_asu()
        assert self.r_free_flags.indices().all_eq(self.miller_obs.indices())
        self.miller_calc = self.miller_calc.common_set(self.miller_obs)
        assert self.r_free_flags.indices().all_eq(self.miller_calc.indices())
        assert self.miller_obs.is_real_array()
        if self.miller_obs.is_xray_intensity_array():
            self.miller_obs = self.miller_obs.f_sq_as_f()
        assert self.miller_obs.observation_type() is None or \
            self.miller_obs.is_xray_amplitude_array()
        if self.miller_calc.observation_type() is None:
            self.miller_calc = self.miller_calc.set_observation_type(
                self.miller_obs)
    # get normalized data please
    self.normalized_obs_f = absolute_scaling.kernel_normalisation(
        self.miller_obs, auto_kernel=True)
    self.normalized_obs = \
        self.normalized_obs_f.normalised_miller_dev_eps.f_sq_as_f()
    self.normalized_calc_f = absolute_scaling.kernel_normalisation(
        self.miller_calc, auto_kernel=True)
    self.normalized_calc = \
        self.normalized_calc_f.normalised_miller_dev_eps.f_sq_as_f()
    # get the 'free data'
    if self.r_free_flags.data().count(True) == 0:
        self.r_free_flags = self.r_free_flags.array(
            data=~self.r_free_flags.data())
    self.free_norm_obs = self.normalized_obs.select(self.r_free_flags.data())
    self.free_norm_calc = self.normalized_calc.select(self.r_free_flags.data())
    if self.free_norm_obs.data().size() <= 0:
        raise RuntimeError("No free reflections.")
    if self.kernel_width_d_star_cubed is None:
        self.kernel_width_d_star_cubed = \
            sigmaa_estimator_kernel_width_d_star_cubed(
                r_free_flags=self.r_free_flags,
                kernel_width_free_reflections=self.kernel_width_free_reflections)
    self.sigma_target_functor = ext.sigmaa_estimator(
        e_obs=self.free_norm_obs.data(),
        e_calc=self.free_norm_calc.data(),
        centric=self.free_norm_obs.centric_flags().data(),
        d_star_cubed=self.free_norm_obs.d_star_cubed().data(),
        width=self.kernel_width_d_star_cubed)
    d_star_cubed_overall = self.miller_obs.d_star_cubed().data()
    self.min_h = flex.min(d_star_cubed_overall)
    self.max_h = flex.max(d_star_cubed_overall)
    self.h_array = None
    if kernel_in_bin_centers:
        self.h_array = (flex.double(range(1, n_sampling_points * 2, 2))
                        * (self.max_h - self.min_h)
                        / (n_sampling_points * 2) + self.min_h)
    else:
        self.min_h *= 0.99
        self.max_h *= 1.01
        if kernel_on_chebyshev_nodes:
            self.h_array = chebyshev_lsq_fit.chebyshev_nodes(
                n=n_sampling_points,
                low=self.min_h,
                high=self.max_h,
                include_limits=True)
        else:
            self.h_array = (flex.double(range(n_sampling_points))
                            * (self.max_h - self.min_h)
                            / float(n_sampling_points - 1.0) + self.min_h)
    assert self.h_array.size() == n_sampling_points
    self.sigmaa_array = flex.double()
    self.sigmaa_array.reserve(self.h_array.size())
    self.sum_weights = flex.double()
    self.sum_weights.reserve(self.h_array.size())
    for h in self.h_array:
        estimator = sigmaa_point_estimator(self.sigma_target_functor, h)
        self.sigmaa_array.append(estimator.sigmaa)
        self.sum_weights.append(
            self.sigma_target_functor.sum_weights(d_star_cubed=h))
    # fit a smooth function
    reparam_sa = -flex.log(1.0 / self.sigmaa_array - 1.0)
    if use_sampling_sum_weights:
        w_obs = flex.sqrt(self.sum_weights)
    else:
        w_obs = None
    fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
        n_terms=self.n_chebyshev_terms,
        x_obs=self.h_array,
        y_obs=reparam_sa,
        w_obs=w_obs)
    cheb_pol = chebyshev_polynome(
        self.n_chebyshev_terms, self.min_h, self.max_h, fit_lsq.coefs)

    def reverse_reparam(values):
        return 1.0 / (1.0 + flex.exp(-values))

    self.sigmaa_fitted = reverse_reparam(cheb_pol.f(self.h_array))
    self.sigmaa_miller_array = reverse_reparam(cheb_pol.f(d_star_cubed_overall))
    assert flex.min(self.sigmaa_miller_array) >= 0
    assert flex.max(self.sigmaa_miller_array) <= 1
    self.sigmaa_miller_array = self.miller_obs.array(
        data=self.sigmaa_miller_array)
    self.alpha = None
    self.beta = None
    self.fom_array = None
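# Sketch of the reparameterisation trick used above, in numpy: sigmaa must
# lie in (0, 1), so the smooth fit is done on the logit
# y = -log(1/sigmaa - 1) and mapped back through the logistic function,
# which guarantees the fitted curve respects the bounds.  A plain
# polynomial fit stands in for the Chebyshev fit; names are illustrative.
import numpy as np

def fit_bounded_curve(h, sigmaa, degree=10):
    y = -np.log(1.0 / sigmaa - 1.0)       # logit transform
    coeffs = np.polyfit(h, y, degree)
    fitted = np.polyval(coeffs, h)
    return 1.0 / (1.0 + np.exp(-fitted))  # back to (0, 1)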
def __init__(self,
             miller_array,
             kernel_width=None,
             n_bins=23,
             n_term=13,
             d_star_sq_low=None,
             d_star_sq_high=None,
             auto_kernel=False,
             number_of_sorted_reflections_for_auto_kernel=50):
    ## auto_kernel is either False, True or a specific integer
    if kernel_width is None:
        assert auto_kernel is not False
    if auto_kernel is not False:
        assert kernel_width is None
    assert miller_array.size() > 0
    ## intensity arrays please
    work_array = None
    if not miller_array.is_real_array():
        raise RuntimeError("Please provide real arrays only")
    ## I might have to change this upper condition
    if miller_array.is_xray_amplitude_array():
        work_array = miller_array.f_as_f_sq()
    if miller_array.is_xray_intensity_array():
        work_array = miller_array.deep_copy()
        work_array = work_array.set_observation_type(miller_array)
    ## If the type is neither intensity nor amplitude,
    ## raise an exception please
    if not miller_array.is_xray_intensity_array():
        if not miller_array.is_xray_amplitude_array():
            raise RuntimeError("Observation type unknown")
    ## declare some shorthands
    I_obs = work_array.data()
    epsilons = work_array.epsilons().data().as_double()
    d_spacings = work_array.d_spacings().data()
    d_star_sq_hkl = 1.0 / (d_spacings * d_spacings)
    ## Set up some limits
    if d_star_sq_low is None:
        d_star_sq_low = flex.min(d_star_sq_hkl)
    if d_star_sq_high is None:
        d_star_sq_high = flex.max(d_star_sq_hkl)
    ## A feeble attempt to determine an appropriate kernel width
    ## that seems to work reasonably in practice
    self.kernel_width = kernel_width
    if auto_kernel is not False:
        ## get the d_star_sq array and sort it
        sort_permut = flex.sort_permutation(d_star_sq_hkl)
        if auto_kernel is True:
            number = number_of_sorted_reflections_for_auto_kernel
        else:
            number = int(auto_kernel)
        if number > d_star_sq_hkl.size():
            number = d_star_sq_hkl.size() - 1
        self.kernel_width = d_star_sq_hkl[sort_permut[number]] - d_star_sq_low
        assert self.kernel_width > 0
    ## Make the d_star_sq array; ensure that there is more than one bin
    ## for interpolation
    assert n_bins > 1
    self.d_star_sq_array = chebyshev_lsq_fit.chebyshev_nodes(
        n=n_bins,
        low=d_star_sq_low,
        high=d_star_sq_high,
        include_limits=True)
    ## Now get the average intensity please.
    ## This step can be reasonably time consuming.
    self.mean_I_array = scaling.kernel_normalisation(
        d_star_sq_hkl=d_star_sq_hkl,
        I_hkl=I_obs,
        epsilon=epsilons,
        d_star_sq_array=self.d_star_sq_array,
        kernel_width=self.kernel_width)
    self.var_I_array = scaling.kernel_normalisation(
        d_star_sq_hkl=d_star_sq_hkl,
        I_hkl=I_obs * I_obs,
        epsilon=epsilons * epsilons,
        d_star_sq_array=self.d_star_sq_array,
        kernel_width=self.kernel_width)
    self.var_I_array = self.var_I_array - self.mean_I_array * self.mean_I_array
    self.weight_sum = scaling.kernel_normalisation(
        d_star_sq_hkl=d_star_sq_hkl,
        I_hkl=I_obs * 0.0 + 1.0,
        epsilon=epsilons * 0.0 + 1.0,
        d_star_sq_array=self.d_star_sq_array,
        kernel_width=self.kernel_width)
    eps = 1e-16  # XXX Maybe this should be larger?
    self.bin_selection = (self.mean_I_array > eps)
    sel_pos = self.bin_selection.iselection()
    # FIXME rare bug: this crashes when the majority of the data are zero,
    # e.g. because the resolution limit was set too high and F/I filled in
    # with 0.  It would be good to catch such cases in advance by inspecting
    # the binned values, and raise a different error message.
    assert sel_pos.size() > 0
    if sel_pos.size() < self.mean_I_array.size() / 2:
        raise Sorry(
            "Analysis could not be continued because more than half "
            "of the data have values below 1e-16. This usually indicates "
            "either an inappropriately high resolution cutoff, or an error "
            "in the data file which artificially creates a higher "
            "resolution limit.")
    self.mean_I_array = self.mean_I_array.select(sel_pos)
    self.d_star_sq_array = self.d_star_sq_array.select(sel_pos)
    self.var_I_array = flex.log(self.var_I_array.select(sel_pos))
    self.weight_sum = self.weight_sum.select(sel_pos)
    self.mean_I_array = flex.log(self.mean_I_array)
    ## Fit a Chebyshev polynomial please
    normalizer_fit_lsq = chebyshev_lsq_fit.chebyshev_lsq_fit(
        n_term, self.d_star_sq_array, self.mean_I_array)
    self.normalizer = chebyshev_polynome(
        n_term, d_star_sq_low, d_star_sq_high, normalizer_fit_lsq.coefs)
    var_lsq_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
        n_term, self.d_star_sq_array, self.var_I_array)
    self.var_norm = chebyshev_polynome(
        n_term, d_star_sq_low, d_star_sq_high, var_lsq_fit.coefs)
    ws_fit = chebyshev_lsq_fit.chebyshev_lsq_fit(
        n_term, self.d_star_sq_array, self.weight_sum)
    self.weight_sum = chebyshev_polynome(
        n_term, d_star_sq_low, d_star_sq_high, ws_fit.coefs)
    ## The data will now be normalised using the
    ## Chebyshev polynomial we have just obtained
    self.mean_I_array = flex.exp(self.mean_I_array)
    self.normalizer_for_miller_array = flex.exp(
        self.normalizer.f(d_star_sq_hkl))
    self.var_I_array = flex.exp(self.var_I_array)
    self.var_norm = flex.exp(self.var_norm.f(d_star_sq_hkl))
    self.weight_sum = flex.exp(self.weight_sum.f(d_star_sq_hkl))
    self.normalised_miller = None
    self.normalised_miller_dev_eps = None
    if work_array.sigmas() is not None:
        self.normalised_miller = work_array.customized_copy(
            data=work_array.data() / self.normalizer_for_miller_array,
            sigmas=work_array.sigmas() / self.normalizer_for_miller_array
            ).set_observation_type(work_array)
        self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
            data=self.normalised_miller.data() / epsilons,
            sigmas=self.normalised_miller.sigmas() / epsilons
            ).set_observation_type(work_array)
    else:
        self.normalised_miller = work_array.customized_copy(
            data=work_array.data() / self.normalizer_for_miller_array
            ).set_observation_type(work_array)
        self.normalised_miller_dev_eps = self.normalised_miller.customized_copy(
            data=self.normalised_miller.data() / epsilons
            ).set_observation_type(work_array)
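# A compact numpy sketch (illustrative only, assuming a Gaussian kernel) of
# the kernel-weighted averaging that scaling.kernel_normalisation performs:
# at each sampling point in d*^2, average I/epsilon with weights that decay
# with distance in d*^2 over the given kernel width.
import numpy as np

def kernel_mean_intensity(d_star_sq_hkl, i_obs, epsilons, sample_points,
                          kernel_width):
    means = []
    for t in sample_points:
        w = np.exp(-((d_star_sq_hkl - t) / kernel_width) ** 2)
        means.append(np.sum(w * i_obs / epsilons) / np.sum(w))
    return np.array(means)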