def split_model_with_pae(
    model,
    m,
    pae_matrix,
    maximum_domains=None,
    pae_power=1.,
    pae_cutoff=5.,
    pae_graph_resolution=1.,
    minimum_domain_length=10,
    log=sys.stdout):
  """
  Function to identify groups of atoms in a model that form compact units
  using a predicted alignment error matrix (pae_matrix). Normally used after
  trimming low-confidence regions in predicted models to isolate domains
  that are likely to have indeterminate relationships.

  m: cctbx.model.model object containing information about the input model
     after trimming
  model: model before trimming
  pae_matrix: matrix of predicted aligned errors (e.g., from AlphaFold2),
     NxN matrix of RMSD values, N = number of residues in model.
  maximum_domains: If more than this many domains, merge closest ones until
     reaching this number.
     NOTE(review): this argument is not referenced anywhere in this
     function body - confirm whether the merging is expected to happen here.
  pae_power (default=1): each edge in the graph will be weighted
     proportional to (1/pae**pae_power)
  pae_cutoff (optional, default=5): graph edges will only be created for
     residue pairs with pae<pae_cutoff
  pae_graph_resolution (optional, default = 1): regulates how aggressively
     the clustering algorithm is. Smaller values lead to larger clusters.
     Value should be larger than zero, and values larger than 5 are
     unlikely to be useful
  minimum_domain_length: if a region is smaller than this, skip completely
  log: stream that receives the progress messages

  Output:
  group_args object (group_args_type='model_info') with members:
    model: new model assembled from the regions that were kept; the first
      chain of each kept region gets the chain id that
      get_region_name_dict assigned to that region. This member is None
      if no region was kept.
    chainid_list: list of all the chainid values

  Raises Sorry if the shape of pae_matrix is not (n, n), where n is the
  number of residues carrying a CA or P atom in `model`.
  """

  print("\nSelecting domains with predicted alignment error estimates",
    file=log)

  # Count residues through their CA / P atoms in the untrimmed model
  ca_or_p_selection = '(name ca or name p)'
  n = model.apply_selection_string(
    ca_or_p_selection).get_hierarchy().overall_counts().n_residues

  # Make sure matrix matches
  if tuple(pae_matrix.shape) != (n, n):
    raise Sorry("The pae matrix has a size of (%s,%s) " % (
      tuple(pae_matrix.shape)) +
      "but the number of residues in the model is %s" % (n))
  from mmtbx.secondary_structure.find_ss_from_ca import get_first_resno
  first_resno = get_first_resno(model.get_hierarchy())

  # Assign all CA in model to a region
  from mmtbx.domains_from_pae import get_domain_selections_from_pae_matrix
  selection_list = get_domain_selections_from_pae_matrix(
    pae_matrix=pae_matrix,
    pae_power=pae_power,
    pae_cutoff=pae_cutoff,
    graph_resolution=pae_graph_resolution,
    first_resno=first_resno,
  )

  # And apply to full model
  unique_regions = list(range(len(selection_list)))

  # The hierarchy does not change while regions are tested, so a single
  # selection cache serves every region.
  asc1 = m.get_hierarchy().atom_selection_cache()
  keep_list = []
  for selection_string in selection_list:
    # NOTE(review): this counts selected atoms of the full trimmed model,
    # although the message below reports the number as residues - confirm
    # whether a CA/P-only count was intended.
    n_selected = asc1.selection(selection_string).count(True)
    keep = n_selected >= minimum_domain_length
    keep_list.append(keep)
    if not keep:
      print("Skipping region '%s' with size of only %s residues" % (
        selection_string, n_selected), file=log)

  region_name_dict, chainid_list = get_region_name_dict(m, unique_regions,
    keep_list=keep_list)

  print("\nSelection list based on PAE values:", file=log)

  # Now create new model with chains based on region list
  full_new_model = None
  for keep, selection_string, region_number in zip(
      keep_list, selection_list, unique_regions):
    if not keep:
      continue
    new_m = m.apply_selection_string(selection_string)
    print("%s (%s residues) " % (selection_string,
      new_m.get_hierarchy().overall_counts().n_residues), file=log)
    # Now put all of new_m in a chain with chain.id = str(region_number).
    # The loop variable is deliberately not called `model`, so that the
    # `model` argument is not overwritten.
    for hierarchy_model in new_m.get_hierarchy().models()[:1]:  # only one model
      for chain in hierarchy_model.chains()[:1]:  # only allowing one chain
        chain.id = region_name_dict[region_number]
    if full_new_model:
      full_new_model = add_model(full_new_model, new_m)
    else:
      full_new_model = new_m

  # All done
  return group_args(
    group_args_type='model_info',
    model=full_new_model,
    chainid_list=chainid_list)
def run(
    params=None, # params for running from command line
    map_data=None, # map_data, as_double()
    pdb_inp=None,
    pdb_hierarchy=None,
    crystal_symmetry=None,
    resolution=None,
    scattering_table='n_gaussian',
    smoothing_window=5,
    crossover_atom='CA',
    minimum_matching_atoms=3,
    minimum_length=2,
    dist_max=1.0,
    minimum_improvement=0.01,
    max_regions_to_test=10,
    max_ends_per_region=5,
    maximum_fraction=0.5,
    max_keep=10,
    map_coeffs_file=None,map_coeffs_labels=None,
    pdb_in_file=None,
    pdb_out=None,
    verbose=None,
    out=sys.stdout):
  """
  Merge the models of a multi-model hierarchy into one composite model,
  working chain by chain and taking each residue from one of the models.

  The choice of source model per residue is made by model_object /
  optimize_with_others, using the per-residue map correlations from
  get_cc_dict (smoothed by smooth_cc_values) and the allowed crossover
  positions from get_crossover_dict.

  params: optional parameter object; if given, its values replace the
    corresponding keyword arguments (control, input_files, crystal_info,
    crossover and output_files scopes)
  map_data: map to score against; if not supplied it is computed from
    map_coeffs_file (sigma-scaled FFT map)
  pdb_inp, pdb_hierarchy, pdb_in_file: alternative sources of the input
    models. If pdb_hierarchy is given, the other two must be None and
    crystal_symmetry must be supplied.
  crystal_symmetry, resolution: taken from the map coefficients (and, for
    the symmetry, from pdb_inp) when not supplied
  scattering_table: passed to the xray structure and to get_cc_dict
  smoothing_window: passed to smooth_cc_values
  crossover_atom, dist_max, minimum_matching_atoms: passed to
    get_crossover_dict
  minimum_length, minimum_improvement, max_regions_to_test,
    max_ends_per_region, maximum_fraction: passed to model_object
  max_keep: maximum number of scored working models kept after sorting
  pdb_out: if set, the composite model is also written to this file
  verbose: print additional summaries
  out: stream for log output (None means sys.stdout)

  Returns the input hierarchy unchanged if it contains just one model,
  otherwise a new hierarchy built from the composite model.
  Raises Sorry if the map coefficients file or the PDB file cannot be
  found, or if no resolution is available.
  """
  if out is None: out=sys.stdout # explode and refine calls it this way

  # get info from params if present
  if params:
    verbose=params.control.verbose
    map_coeffs_file=params.input_files.map_coeffs_file
    map_coeffs_labels=params.input_files.map_coeffs_labels
    pdb_in_file=params.input_files.pdb_in_file
    resolution=params.crystal_info.resolution
    scattering_table=params.crystal_info.scattering_table
    smoothing_window=params.crossover.smoothing_window
    crossover_atom=params.crossover.crossover_atom
    minimum_matching_atoms=params.crossover.minimum_matching_atoms
    minimum_length=params.crossover.minimum_length
    dist_max=params.crossover.dist_max
    minimum_improvement=params.crossover.minimum_improvement
    max_regions_to_test=params.crossover.max_regions_to_test
    max_ends_per_region=params.crossover.max_ends_per_region
    maximum_fraction=params.crossover.maximum_fraction
    max_keep=params.crossover.max_keep
    pdb_out=params.output_files.pdb_out

  # Consistency checks
  if(pdb_hierarchy is not None):
    assert pdb_in_file is None
    assert pdb_inp is None
    assert crystal_symmetry is not None
    # XXX more checks here!

  # Get map_data if not present
  if not map_data:
    if not map_coeffs_file or not os.path.isfile(map_coeffs_file):
      raise Sorry("Cannot find the map_coeffs_file '%s'" %(
        str(map_coeffs_file)))
    from mmtbx.building.minimize_chain import get_map_coeffs
    map_coeffs=get_map_coeffs(map_coeffs_file,
      map_coeffs_labels=map_coeffs_labels)

    fft_map = map_coeffs.fft_map(resolution_factor = 0.25)
    fft_map.apply_sigma_scaling()
    map_data = fft_map.real_map_unpadded()
    map_data=map_data.as_double()
    # map_coeffs only exists on this branch, so the values derived from it
    # are filled in here.
    if map_coeffs and not crystal_symmetry:
      crystal_symmetry=map_coeffs.crystal_symmetry()
    if map_coeffs and not resolution:
      resolution=map_coeffs.d_min()

  # Get the starting model
  if(pdb_hierarchy is None):
    if pdb_inp is None:
      if not pdb_in_file or not os.path.isfile(pdb_in_file):
        raise Sorry("Cannot read input PDB file '%s'" %(
          str(pdb_in_file)))
      print("Taking models from %s" %(pdb_in_file), file=out)
      # Context manager so the input file is closed right after reading
      with open(pdb_in_file) as pdb_file:
        pdb_string=pdb_file.read()
      pdb_inp=iotbx.pdb.input(source_info=None, lines = pdb_string)
      if pdb_inp is None:
        raise Sorry("Need a model or models")
    if not crystal_symmetry:
      crystal_symmetry=pdb_inp.crystal_symmetry()
    assert crystal_symmetry is not None
    hierarchy = pdb_inp.construct_hierarchy()
  else:
    hierarchy = pdb_hierarchy # XXX FIXME

  n_models=sum(1 for _ in hierarchy.models())

  if n_models==1: # nothing to do
    return hierarchy

  #xrs = pdb_inp.xray_structure_simple(crystal_symmetry=crystal_symmetry)
  xrs = hierarchy.extract_xray_structure(crystal_symmetry=crystal_symmetry)
  xrs.scattering_type_registry(table=scattering_table)

  if not resolution:
    from cctbx import maptbx
    resolution=maptbx.resolution_from_map_and_model.run(
      map_data=map_data, xray_structure=xrs).d_min
  if(resolution is None):
    raise Sorry("Resolution is required")
  print("\nResolution limit: %7.2f" %(resolution), file=out)
  print("\nSummary of input models", file=out)
  xrs.show_summary(f=out, prefix=" ")

  print("\nReady with %d models and map" %(n_models), file=out)

  # Get CC by residue for each model and map

  # Set up the unique chain_id and resseq (range) entries from the pieces
  # that split_model returns
  chain_id_and_resseq_list=[]
  from mmtbx.secondary_structure.find_ss_from_ca import \
    split_model,get_first_resno, get_last_resno,get_chain_id
  model_list=split_model(hierarchy=hierarchy,only_first_model=True)
  for m in model_list:
    h=m.hierarchy
    chain_id_and_resseq=[
      get_chain_id(h),[get_first_resno(h),get_last_resno(h)]]
    if not chain_id_and_resseq in chain_id_and_resseq_list:
      chain_id_and_resseq_list.append(chain_id_and_resseq)

  # Run through chains separately
  # NOTE: All models of each chain must match exactly

  # Save composite model, chain by chain
  composite_model_stream=StringIO()

  # The full hierarchy is not modified below, so one cache serves all chains
  hierarchy_asc=hierarchy.atom_selection_cache()

  for chain_id_and_resseq in chain_id_and_resseq_list:
    chain_id,[start_resno,end_resno]=chain_id_and_resseq
    atom_selection=get_atom_selection(chain_id=chain_id,
      start_resno=start_resno,end_resno=end_resno)
    sel=hierarchy_asc.selection(string = atom_selection)
    sel_hierarchy=hierarchy.select(sel)
    pdb_inp=sel_hierarchy.as_pdb_input(crystal_symmetry=crystal_symmetry)
    ph=pdb_inp.construct_hierarchy()

    print("\nWorking on chain_id='%s' resseq %d:%d\n" %(
      chain_id_and_resseq[0],chain_id_and_resseq[1][0],chain_id_and_resseq[1][1]), file=out)

    # get CC values for all residues
    cc_dict=get_cc_dict(hierarchy=ph,map_data=map_data,d_min=resolution,
      crystal_symmetry=crystal_symmetry,
      table=scattering_table,out=out)

    # smooth CC values with window of smoothing_window
    smoothed_cc_dict=smooth_cc_values(cc_dict=cc_dict,
      smoothing_window=smoothing_window,
      verbose=verbose,out=out)

    # figure out all the places where crossover can occur.
    # NOTE(review): this takes whichever entry comes first, and key order
    # can differ between Python 2 and 3. Presumably harmless because all
    # models of a chain must match exactly (see note above) - confirm.
    n_residues=list(cc_dict.values())[0].size()

    crossover_dict=get_crossover_dict(
      n_residues=n_residues,
      hierarchy=ph,
      crossover_atom=crossover_atom,
      dist_max=dist_max,
      minimum_matching_atoms=minimum_matching_atoms,
      verbose=verbose,out=out)

    # Now we are ready to identify the best composite model...
    # A composite has residue 0 from model x, residue 1 from model y etc.
    # Each change from model a to model b between residues i and i+1 must have
    # a crossover between a and b at either residue i or i+1

    sorted_working_model_list=[]
    for key in sorted(cc_dict.keys()):
      working_model=model_object(source_id=key,
        cc_dict=cc_dict,
        smoothed_cc_dict=smoothed_cc_dict,
        crossover_dict=crossover_dict,
        minimum_length=minimum_length,
        minimum_improvement=minimum_improvement,
        max_regions_to_test=max_regions_to_test,
        max_ends_per_region=max_ends_per_region,
        maximum_fraction=maximum_fraction)
      if verbose:
        working_model.show_summary(out=out)
      sorted_working_model_list.append(
        [working_model.get_score(),working_model])
    # Order on the score alone: sorting the [score, model] pairs directly
    # would compare the model objects whenever two scores tie, and those
    # are not guaranteed to be orderable.
    sorted_working_model_list.sort(key=lambda pair: pair[0])
    sorted_working_model_list.reverse()
    sorted_working_model_list=sorted_working_model_list[:max_keep]
    working_model_list=[pair[1] for pair in sorted_working_model_list]

    # Go through all the working models and cross them with other models to
    # optimize...Then take all the best and cross...

    best_model=sorted_working_model_list[0][1]
    found=True
    cycle=0
    while found:
      cycle+=1
      print("\nCYCLE %d current best is %7.3f\n" %(
        cycle,best_model.get_score()), file=out)
      found=False
      sorted_working_model_list=[]
      for working_model in working_model_list:
        others=[
          other for other in working_model_list
          if not working_model==other]
        new_working_model=working_model.optimize_with_others(others=others)
        if not new_working_model:
          print(file=out)
          continue
        aa=[new_working_model.get_score(),new_working_model]
        if not aa in sorted_working_model_list:
          sorted_working_model_list.append(aa)
      if not sorted_working_model_list:
        break # nothing to do

      sorted_working_model_list.sort(key=lambda pair: pair[0])
      sorted_working_model_list.reverse()
      sorted_working_model_list=sorted_working_model_list[:max_keep]

      new_working_score,new_working_model=sorted_working_model_list[0]
      if new_working_score>best_model.get_score():
        best_model=new_working_model
        found=True
        if verbose:
          print("NEW BEST SCORE: %7.2f" %(best_model.get_score()), file=out)
          best_model.show_summary(out=out)

    print("\nDONE... best is %7.3f\n" %(
      best_model.get_score()), file=out)

    # Create composite of this chain

    # Note residue values. We are going to pick each residue from one of
    # the models
    residue_list=[]
    for model in ph.models():
      for chain in model.chains():
        if chain.id != chain_id: continue
        # Each matching chain replaces the list, so the last one wins
        residue_list=[rg.resseq for rg in chain.residue_groups()]
    residue_list.sort()
    assert len(best_model.source_list)==len(residue_list)

    # ph is not modified while residues are extracted; build its cache once
    ph_asc=ph.atom_selection_cache()
    for i, resseq in enumerate(residue_list):
      atom_selection=get_atom_selection(model_id=best_model.source_list[i],
        resseq_sel=resseq)
      sel=ph_asc.selection(string = atom_selection)
      sel_hierarchy=ph.select(sel)
      print(remove_ter(sel_hierarchy.as_pdb_string()),
        file=composite_model_stream)

  # All done, make a new pdb_hierarchy
  pdb_string=composite_model_stream.getvalue()
  pdb_inp=iotbx.pdb.input(source_info=None, lines = pdb_string)
  pdb_hierarchy=pdb_inp.construct_hierarchy()

  if pdb_out:
    with open(pdb_out,'w') as f:
      print(pdb_hierarchy.as_pdb_string(crystal_symmetry=crystal_symmetry),
        file=f)
      # Report on the log stream like every other message of this function
      print("Final model is in: %s\n" %(f.name), file=out)
  return pdb_hierarchy
def run(
    params=None, # params for running from command line
    map_data=None, # map_data, as_double()
    pdb_inp=None,
    pdb_hierarchy=None,
    crystal_symmetry=None,
    resolution=None,
    scattering_table='n_gaussian',
    smoothing_window=5,
    crossover_atom='CA',
    minimum_matching_atoms=3,
    minimum_length=2,
    dist_max=1.0,
    minimum_improvement=0.01,
    max_regions_to_test=10,
    max_ends_per_region=5,
    maximum_fraction=0.5,
    max_keep=10,
    map_coeffs_file=None,map_coeffs_labels=None,
    pdb_in_file=None,
    pdb_out=None,
    verbose=None,
    out=sys.stdout):
  """
  Merge the models of a multi-model hierarchy into one composite model,
  working chain by chain and taking each residue from one of the models.

  The choice of source model per residue is made by model_object /
  optimize_with_others, using the per-residue map correlations from
  get_cc_dict (smoothed by smooth_cc_values) and the allowed crossover
  positions from get_crossover_dict.

  params: optional parameter object; if given, its values replace the
    corresponding keyword arguments (control, input_files, crystal_info,
    crossover and output_files scopes)
  map_data: map to score against; if not supplied it is computed from
    map_coeffs_file (sigma-scaled FFT map)
  pdb_inp, pdb_hierarchy, pdb_in_file: alternative sources of the input
    models. If pdb_hierarchy is given, the other two must be None and
    crystal_symmetry must be supplied.
  crystal_symmetry, resolution: taken from the map coefficients (and, for
    the symmetry, from pdb_inp) when not supplied
  scattering_table: passed to the xray structure and to get_cc_dict
  smoothing_window: passed to smooth_cc_values
  crossover_atom, dist_max, minimum_matching_atoms: passed to
    get_crossover_dict
  minimum_length, minimum_improvement, max_regions_to_test,
    max_ends_per_region, maximum_fraction: passed to model_object
  max_keep: maximum number of scored working models kept after sorting
  pdb_out: if set, the composite model is also written to this file
  verbose: print additional summaries
  out: stream for log output (None means sys.stdout)

  Returns the input hierarchy unchanged if it contains just one model,
  otherwise a new hierarchy built from the composite model.
  Raises Sorry if the map coefficients file or the PDB file cannot be
  found, or if no resolution is available.
  """
  if out is None: out=sys.stdout # explode and refine calls it this way

  # get info from params if present
  if params:
    verbose=params.control.verbose
    map_coeffs_file=params.input_files.map_coeffs_file
    map_coeffs_labels=params.input_files.map_coeffs_labels
    pdb_in_file=params.input_files.pdb_in_file
    resolution=params.crystal_info.resolution
    scattering_table=params.crystal_info.scattering_table
    smoothing_window=params.crossover.smoothing_window
    crossover_atom=params.crossover.crossover_atom
    minimum_matching_atoms=params.crossover.minimum_matching_atoms
    minimum_length=params.crossover.minimum_length
    dist_max=params.crossover.dist_max
    minimum_improvement=params.crossover.minimum_improvement
    max_regions_to_test=params.crossover.max_regions_to_test
    max_ends_per_region=params.crossover.max_ends_per_region
    maximum_fraction=params.crossover.maximum_fraction
    max_keep=params.crossover.max_keep
    pdb_out=params.output_files.pdb_out

  # Consistency checks
  if(pdb_hierarchy is not None):
    assert pdb_in_file is None
    assert pdb_inp is None
    assert crystal_symmetry is not None
    # XXX more checks here!

  # Get map_data if not present
  if not map_data:
    if not map_coeffs_file or not os.path.isfile(map_coeffs_file):
      raise Sorry("Cannot find the map_coeffs_file '%s'" %(
        str(map_coeffs_file)))
    from mmtbx.building.minimize_chain import get_map_coeffs
    map_coeffs=get_map_coeffs(map_coeffs_file,
      map_coeffs_labels=map_coeffs_labels)

    fft_map = map_coeffs.fft_map(resolution_factor = 0.25)
    fft_map.apply_sigma_scaling()
    map_data = fft_map.real_map_unpadded()
    map_data=map_data.as_double()
    # map_coeffs only exists on this branch, so the values derived from it
    # are filled in here.
    if map_coeffs and not crystal_symmetry:
      crystal_symmetry=map_coeffs.crystal_symmetry()
    if map_coeffs and not resolution:
      resolution=map_coeffs.d_min()

  # Get the starting model
  if(pdb_hierarchy is None):
    if pdb_inp is None:
      if not pdb_in_file or not os.path.isfile(pdb_in_file):
        raise Sorry("Cannot read input PDB file '%s'" %(
          str(pdb_in_file)))
      print("Taking models from %s" %(pdb_in_file), file=out)
      # Context manager so the input file is closed right after reading
      with open(pdb_in_file) as pdb_file:
        pdb_string=pdb_file.read()
      pdb_inp=iotbx.pdb.input(source_info=None, lines = pdb_string)
      if pdb_inp is None:
        raise Sorry("Need a model or models")
    if not crystal_symmetry:
      crystal_symmetry=pdb_inp.crystal_symmetry()
    assert crystal_symmetry is not None
    hierarchy = pdb_inp.construct_hierarchy()
  else:
    hierarchy = pdb_hierarchy # XXX FIXME

  n_models=sum(1 for _ in hierarchy.models())

  if n_models==1: # nothing to do
    return hierarchy

  #xrs = pdb_inp.xray_structure_simple(crystal_symmetry=crystal_symmetry)
  xrs = hierarchy.extract_xray_structure(crystal_symmetry=crystal_symmetry)
  xrs.scattering_type_registry(table=scattering_table)

  if not resolution:
    from cctbx import maptbx
    # NOTE(review): another definition of run() in this file obtains the
    # value with maptbx.resolution_from_map_and_model.run(...).d_min;
    # confirm which call form matches the cctbx version in use.
    resolution=maptbx.resolution_from_map_and_model(
      map_data=map_data, xray_structure=xrs)
  # The format below needs a number; fail with a clear message otherwise
  if resolution is None:
    raise Sorry("Resolution is required")
  print("\nResolution limit: %7.2f" %(resolution), file=out)
  print("\nSummary of input models", file=out)
  xrs.show_summary(f=out, prefix=" ")

  print("\nReady with %d models and map" %(n_models), file=out)

  # Get CC by residue for each model and map

  # Set up the unique chain_id and resseq (range) entries from the pieces
  # that split_model returns
  chain_id_and_resseq_list=[]
  from mmtbx.secondary_structure.find_ss_from_ca import \
    split_model,get_first_resno, get_last_resno,get_chain_id
  model_list=split_model(hierarchy=hierarchy,only_first_model=True)
  for m in model_list:
    h=m.hierarchy
    chain_id_and_resseq=[
      get_chain_id(h),[get_first_resno(h),get_last_resno(h)]]
    if not chain_id_and_resseq in chain_id_and_resseq_list:
      chain_id_and_resseq_list.append(chain_id_and_resseq)

  # Run through chains separately
  # NOTE: All models of each chain must match exactly

  # Save composite model, chain by chain
  try:
    from cStringIO import StringIO # Python 2
  except ImportError:
    from io import StringIO # Python 3
  composite_model_stream=StringIO()

  # The full hierarchy is not modified below, so one cache serves all chains
  hierarchy_asc=hierarchy.atom_selection_cache()

  for chain_id_and_resseq in chain_id_and_resseq_list:
    chain_id,[start_resno,end_resno]=chain_id_and_resseq
    atom_selection=get_atom_selection(chain_id=chain_id,
      start_resno=start_resno,end_resno=end_resno)
    sel=hierarchy_asc.selection(string = atom_selection)
    sel_hierarchy=hierarchy.select(sel)
    pdb_inp=sel_hierarchy.as_pdb_input(crystal_symmetry=crystal_symmetry)
    ph=pdb_inp.construct_hierarchy()

    print("\nWorking on chain_id='%s' resseq %d:%d\n" %(
      chain_id_and_resseq[0],chain_id_and_resseq[1][0],chain_id_and_resseq[1][1]), file=out)

    # get CC values for all residues
    cc_dict=get_cc_dict(hierarchy=ph,map_data=map_data,d_min=resolution,
      crystal_symmetry=crystal_symmetry,
      table=scattering_table,out=out)

    # smooth CC values with window of smoothing_window
    smoothed_cc_dict=smooth_cc_values(cc_dict=cc_dict,
      smoothing_window=smoothing_window,
      verbose=verbose,out=out)

    # figure out all the places where crossover can occur.
    # dict.keys() cannot be indexed on Python 3, so take the first value
    # through a list instead.
    n_residues=list(cc_dict.values())[0].size()

    crossover_dict=get_crossover_dict(
      n_residues=n_residues,
      hierarchy=ph,
      crossover_atom=crossover_atom,
      dist_max=dist_max,
      minimum_matching_atoms=minimum_matching_atoms,
      verbose=verbose,out=out)

    # Now we are ready to identify the best composite model...
    # A composite has residue 0 from model x, residue 1 from model y etc.
    # Each change from model a to model b between residues i and i+1 must have
    # a crossover between a and b at either residue i or i+1

    sorted_working_model_list=[]
    for key in sorted(cc_dict.keys()):
      working_model=model_object(source_id=key,
        cc_dict=cc_dict,
        smoothed_cc_dict=smoothed_cc_dict,
        crossover_dict=crossover_dict,
        minimum_length=minimum_length,
        minimum_improvement=minimum_improvement,
        max_regions_to_test=max_regions_to_test,
        max_ends_per_region=max_ends_per_region,
        maximum_fraction=maximum_fraction)
      if verbose:
        working_model.show_summary(out=out)
      sorted_working_model_list.append(
        [working_model.get_score(),working_model])
    # Order on the score alone: sorting the [score, model] pairs directly
    # would compare the model objects whenever two scores tie, and those
    # are not guaranteed to be orderable.
    sorted_working_model_list.sort(key=lambda pair: pair[0])
    sorted_working_model_list.reverse()
    sorted_working_model_list=sorted_working_model_list[:max_keep]
    working_model_list=[pair[1] for pair in sorted_working_model_list]

    # Go through all the working models and cross them with other models to
    # optimize...Then take all the best and cross...

    best_model=sorted_working_model_list[0][1]
    found=True
    cycle=0
    while found:
      cycle+=1
      print("\nCYCLE %d current best is %7.3f\n" %(
        cycle,best_model.get_score()), file=out)
      found=False
      sorted_working_model_list=[]
      for working_model in working_model_list:
        others=[
          other for other in working_model_list
          if not working_model==other]
        new_working_model=working_model.optimize_with_others(others=others)
        if not new_working_model:
          print(file=out)
          continue
        aa=[new_working_model.get_score(),new_working_model]
        if not aa in sorted_working_model_list:
          sorted_working_model_list.append(aa)
      if not sorted_working_model_list:
        break # nothing to do

      sorted_working_model_list.sort(key=lambda pair: pair[0])
      sorted_working_model_list.reverse()
      sorted_working_model_list=sorted_working_model_list[:max_keep]

      new_working_score,new_working_model=sorted_working_model_list[0]
      if new_working_score>best_model.get_score():
        best_model=new_working_model
        found=True
        if verbose:
          print("NEW BEST SCORE: %7.2f" %(best_model.get_score()), file=out)
          best_model.show_summary(out=out)

    print("\nDONE... best is %7.3f\n" %(
      best_model.get_score()), file=out)

    # Create composite of this chain

    # Note residue values. We are going to pick each residue from one of
    # the models
    residue_list=[]
    for model in ph.models():
      for chain in model.chains():
        if chain.id != chain_id: continue
        # Each matching chain replaces the list, so the last one wins
        residue_list=[rg.resseq for rg in chain.residue_groups()]
    residue_list.sort()
    assert len(best_model.source_list)==len(residue_list)

    # ph is not modified while residues are extracted; build its cache once
    ph_asc=ph.atom_selection_cache()
    for i, resseq in enumerate(residue_list):
      atom_selection=get_atom_selection(model_id=best_model.source_list[i],
        resseq_sel=resseq)
      sel=ph_asc.selection(string = atom_selection)
      sel_hierarchy=ph.select(sel)
      print(remove_ter(sel_hierarchy.as_pdb_string()),
        file=composite_model_stream)

  # All done, make a new pdb_hierarchy
  pdb_string=composite_model_stream.getvalue()
  pdb_inp=iotbx.pdb.input(source_info=None, lines = pdb_string)
  pdb_hierarchy=pdb_inp.construct_hierarchy()

  if pdb_out:
    with open(pdb_out,'w') as f:
      print(pdb_hierarchy.as_pdb_string(crystal_symmetry=crystal_symmetry),
        file=f)
      # Report on the log stream like every other message of this function
      print("Final model is in: %s\n" %(f.name), file=out)
  return pdb_hierarchy