def __init__(self, model, params=None, log=sys.stdout):
  """Validate the secondary structure (SS) annotation attached to a model.

  The HELIX/SHEET annotation is obtained from ``model.get_ss_annotation()``,
  every helix and sheet is evaluated individually with ``gather_ss_stats``
  (H-bond and Ramachandran statistics), a report is printed to ``log`` and
  the accumulated numbers are stored in ``self.results``.

  Parameters:
    model: object providing get_ss_annotation(log=...), get_hierarchy() and
      get_number_of_models().
    params: validation parameters; the attributes read here are
      mediocre_hbond_cutoff, bad_hbond_cutoff, nproc and filter_annotation.
      When None, ``validate.get_default_params().ss_validation`` is used.
    log: file-like object that receives the report.

  Result:
    ``self.results`` is a ``group_args`` with the counters and the
    percentages n1..n5. It stays None when the analysis stops early
    (annotation syntax error, no annotation, no protein, or a model that is
    entirely or partially CA-only).

  Raises:
    Sorry: if the model does not contain exactly one model.
  """
  self.model = model
  self.params = params
  if self.params is None:
    self.params = validate.get_default_params().ss_validation
  self.log = log
  self.results = None

  # Whatever the annotation reader logs is captured in this buffer; records
  # with syntax errors are counted below by searching it for "Bad HELIX" /
  # "Bad SHEET".
  ss_log = cStringIO.StringIO()
  try:
    ss_annot = self.model.get_ss_annotation(log=ss_log)
  except Sorry as e:
    print >> self.log, " Syntax error in SS: %s" % e.message
    return
  ss_log_cont = ss_log.getvalue()
  n_bad_helices = ss_log_cont.count("Bad HELIX")
  n_bad_sheets = ss_log_cont.count("Bad SHEET")
  pdb_h = self.model.get_hierarchy()
  if ss_annot is None or ss_annot.is_empty():
    print >> self.log, "No SS annotation, nothing to analyze"
    return
  if n_bad_helices > 0:
    print >> self.log, "Number of helices with syntax error: %d" % n_bad_helices
  # The sheet message must depend on the sheet counter (it used to be guarded
  # by the helix counter, a copy-paste error).
  if n_bad_sheets > 0:
    print >> self.log, "Number of sheets with syntax error: %d" % n_bad_sheets
  if model.get_number_of_models() != 1:
    raise Sorry("Multiple models not supported.")
  if not pdb_h.contains_protein():
    print >> self.log, "Protein is not found in the model"
    return
  if pdb_h.is_ca_only():
    print >> self.log, "Error: CA-only model"
    return
  if is_ca_and_something(pdb_h):
    print >> self.log, "CA-only and something model"
    return
  if some_chains_are_ca(pdb_h):
    print >> self.log, "some chains are CA-only"
    return

  n_total_helix_sheet_records = ss_annot.get_n_helices() + ss_annot.get_n_sheets()
  n_bad_helix_sheet_records = 0
  # Empty stuff:
  empty_annots = ss_annot.remove_empty_annotations(pdb_h)
  number_of_empty_helices = empty_annots.get_n_helices()
  number_of_empty_sheets = empty_annots.get_n_sheets()
  # Records without atoms in the model count as bad records.
  n_bad_helix_sheet_records += (number_of_empty_helices + number_of_empty_sheets)
  if number_of_empty_helices > 0:
    print >> self.log, "Helices without corresponding atoms in the model (%d):" % number_of_empty_helices
    for h in empty_annots.helices:
      print >> self.log, " ", h.as_pdb_str()
  if number_of_empty_sheets > 0:
    print >> self.log, "Sheets without corresponding atoms in the model (%d):" % number_of_empty_sheets
    for sh in empty_annots.sheets:
      print >> self.log, " ", sh.as_pdb_str()

  print >> self.log, "Checking annotations thoroughly, use nproc=<number> if it is too slow..."

  # One work item per SS element: ([helix], []) or ([], [sheet]); helices
  # first, then sheets, so results can be zipped with helices + sheets below.
  hsh_tuples = []
  for h in ss_annot.helices:
    hsh_tuples.append(([h], []))
  for sh in ss_annot.sheets:
    hsh_tuples.append(([], [sh]))
  calc_ss_stats = gather_ss_stats(
      pdb_h,
      mediocre_hbond_cutoff=self.params.mediocre_hbond_cutoff,
      bad_hbond_cutoff=self.params.bad_hbond_cutoff)
  results = []
  if len(hsh_tuples) > 0:
    results = easy_mp.pool_map(
        processes=self.params.nproc,
        fixed_func=calc_ss_stats,
        args=hsh_tuples)

  cumm_n_hbonds = 0
  cumm_n_bad_hbonds = 0
  cumm_n_mediocre_hbonds = 0
  cumm_n_rama_out = 0
  cumm_n_wrong_reg = 0

  n_elem_with_wrong_rama = 0
  n_elem_with_rama_out = 0
  n_elem_with_bad_hbond = 0

  #
  # Hydrogen Bonds in Proteins: Role and Strength
  # Roderick E Hubbard, Muhammad Kamran Haider
  # ENCYCLOPEDIA OF LIFE SCIENCES & 2010, John Wiley & Sons, Ltd. www.els.net
  #
  # See also: http://proteopedia.org/wiki/index.php/Hydrogen_bonds
  #
  for ss_elem, r in zip(ss_annot.helices + ss_annot.sheets, results):
    if r is not None:
      # The fourth item of the result tuple is not used in this report.
      n_hbonds, n_bad_hbonds, n_mediocre_hbonds, _hb_lens, n_outliers, n_wrong_region = r
      cumm_n_hbonds += n_hbonds
      cumm_n_bad_hbonds += n_bad_hbonds
      cumm_n_mediocre_hbonds += n_mediocre_hbonds
      cumm_n_rama_out += n_outliers
      cumm_n_wrong_reg += n_wrong_region
      if n_wrong_region > 0:
        n_elem_with_wrong_rama += 1
      if n_outliers > 0:
        n_elem_with_rama_out += 1
      if n_bad_hbonds > 0:
        n_elem_with_bad_hbond += 1
      # Mediocre H-bonds alone do not make a record "bad" in the totals...
      if n_bad_hbonds + n_outliers + n_wrong_region > 0:
        n_bad_helix_sheet_records += 1
      # ...but they are enough to have the record listed in the log.
      if n_bad_hbonds + n_mediocre_hbonds + n_outliers + n_wrong_region > 0:
        # this is bad annotation, printing it to log with separate stats:
        print >> self.log, "Bad annotation found:"
        print >> self.log, "%s" % ss_elem.as_pdb_str()
        print >> self.log, " Total hb: %d, mediocre: %d, bad: %d, Rama outliers: %d, Rama wrong %d" % (
            n_hbonds, n_mediocre_hbonds, n_bad_hbonds, n_outliers, n_wrong_region)
        print >> self.log, "-" * 80

  # n1 = percentage of bad SS elements (per given model);
  # bad here means: n_bad_hbonds + n_outliers + n_wrong_region > 0
  n1 = safe_div(n_bad_helix_sheet_records, n_total_helix_sheet_records) * 100.
  # n2 = percentage of SS elements that have at least one residue belonging to a wrong region of Ramachandran plot (per given model);
  n2 = safe_div(n_elem_with_wrong_rama, n_total_helix_sheet_records) * 100.
  # n3 = percentage of SS elements that have at least one residue being a Ramachandran plot outlier (per given model);
  n3 = safe_div(n_elem_with_rama_out, n_total_helix_sheet_records) * 100.
  # n4 = percentage of bad H bonds (per given model).
  n4 = safe_div(cumm_n_bad_hbonds, cumm_n_hbonds) * 100.  # No per SS element separation
  # percentage of SS elements that have at least one bad H bond (per given model)
  n5 = safe_div(n_elem_with_bad_hbond, n_total_helix_sheet_records) * 100.
  print >> self.log, "Overall info:"
  print >> self.log, " Total HELIX+SHEET recods :", n_total_helix_sheet_records
  print >> self.log, " Total bad HELIX+SHEET recods :", n_bad_helix_sheet_records
  print >> self.log, " Total declared H-bonds :", cumm_n_hbonds
  print >> self.log, " Total mediocre H-bonds (%.1f-%.1fA):" % (
      self.params.mediocre_hbond_cutoff, self.params.bad_hbond_cutoff), \
      cumm_n_mediocre_hbonds
  print >> self.log, " Total bad H-bonds (>%.1fA) :" % self.params.bad_hbond_cutoff, \
      cumm_n_bad_hbonds
  print >> self.log, " Total Ramachandran outliers :", cumm_n_rama_out
  print >> self.log, " Total wrong Ramachandrans :", cumm_n_wrong_reg
  print >> self.log, "All done."
  help_string = """\
  Total bad HELIX+SHEET recods does not include records with syntax mistakes
    (they are outputted separately in the beginning of the log), but includes
    empty records (without corresponding atoms in the model) and records with
    any deviations in geometry (bad/mediocre bonds, Ramachandran angles are
    outliers or wrong).
  Ramachandran outliers - residues in disallowed region of Ramachandran plot.
  Wrong Ramachandrans - residues in favored and allowed regions of Ramachandran
    plot, but don't belong to region of annotated secondary structure element.
    For example, residue annotated as HELIX has phi-psi angles in beta-strand
    region and vice versa.
  """
  print >> self.log, help_string
  if self.params.filter_annotation:
    filtered_ann = ss_annot.filter_annotation(hierarchy=pdb_h)
    print >> self.log, "Filtered annotation:"
    print >> self.log, filtered_ann.as_pdb_str()
  self.results = group_args(
      n_total_helix_sheet_records=n_total_helix_sheet_records,
      n_bad_helix_sheet_records=n_bad_helix_sheet_records,
      n_hbonds=cumm_n_hbonds,
      n_mediocre_hbonds=cumm_n_mediocre_hbonds,
      n_bad_hbonds=cumm_n_bad_hbonds,
      n_rama_out=cumm_n_rama_out,
      n_wrong_reg=cumm_n_wrong_reg,
      n1=n1,
      n2=n2,
      n3=n3,
      n4=n4,
      n5=n5,
      # Number of helices with syntax error. Specifically, those producing
      # ValueError on converting the field to a number.
      n_bad_helices=n_bad_helices,
      n_bad_sheets=n_bad_sheets)
def run(args=None, pdb_inp=None, pdb_hierarchy=None, cs=None, params=None,
        out=sys.stdout, log=sys.stderr):
  """Validate HELIX/SHEET annotation of a model and print a report to ``out``.

  Two ways of calling are supported:
    * ``pdb_hierarchy`` is None: ``args`` are command-line arguments (or
      ``params`` comes from the GUI dialog); the files in
      ``work_params.file_name`` are combined and parsed here.
    * ``pdb_hierarchy`` is given: it is used as the model and ``pdb_inp``
      must be given too, because the annotation is extracted from it.

  Parameters:
    args: command-line arguments; required when pdb_hierarchy is None.
      An empty list without params, or a leading "-h"/"--help", shows usage.
    pdb_inp: input object providing extract_secondary_structure(log=...) and
      atoms(); when given it replaces the structure parsed from files.
    pdb_hierarchy: hierarchy to analyze.
    cs: crystal symmetry. If it is None or looks corrupted (no unit cell or
      space group, or unit cell volume < 10) a message is printed and the
      atoms of the input structure are placed in a P1 box.
    params: parameters (file_name, nproc, mediocre_hbond_cutoff,
      bad_hbond_cutoff, filter_annotation are read here).
    out: file-like object that receives the report.
    log: not used in this function.

  Returns:
    ``group_args`` with the counters and percentages n1..n5, or None when
    usage was shown or the analysis stopped early (annotation syntax error,
    no annotation, no protein, entirely or partially CA-only model).

  Raises:
    Sorry: for multiple models, or when pdb_hierarchy is given without
      pdb_inp.
  """
  if (pdb_hierarchy is None):
    assert args is not None
    # params keyword is for running program from GUI dialog
    if (((len(args) == 0) and (params is None)) or
        ((len(args) > 0) and ((args[0] == "-h") or (args[0] == "--help")))):
      show_usage()
      return
    # parse command-line arguments
    if (params is None):
      pcl = iotbx.phil.process_command_line_with_files(
          args=args,
          master_phil_string=master_phil_str,
          pdb_file_def="file_name")
      work_params = pcl.work.extract()
    # or use parameters defined by GUI
    else:
      work_params = params
    pdb_files = work_params.file_name

    pdb_combined = iotbx.pdb.combine_unique_pdb_files(file_names=pdb_files)
    pdb_structure = iotbx.pdb.input(
        source_info=None,
        lines=flex.std_string(pdb_combined.raw_records))
    pdb_h = pdb_structure.construct_hierarchy()
  else:
    work_params = params
    if work_params is None:
      work_params = master_phil.extract()
    pdb_h = pdb_hierarchy
  # NOTE(review): this value is not read before `atoms` is reassigned in the
  # P1-box branch below; the call is kept as it was.
  atoms = pdb_h.atoms()

  # An explicitly supplied input object wins over the one parsed from files.
  if (pdb_inp is not None):
    pdb_structure = pdb_inp
  elif pdb_hierarchy is not None:
    # Without pdb_inp there is nothing to extract the annotation from; fail
    # with a clear message instead of an UnboundLocalError further down.
    raise Sorry("pdb_inp is required when pdb_hierarchy is provided.")

  # Whatever the annotation reader logs is captured in this buffer; records
  # with syntax errors are counted below by searching it for "Bad HELIX" /
  # "Bad SHEET".
  ss_log = cStringIO.StringIO()
  try:
    ss_annot = pdb_structure.extract_secondary_structure(log=ss_log)
  except Sorry as e:
    print >> out, " Syntax error in SS: %s" % e.message
    return

  if work_params.nproc < 1:
    work_params.nproc = 1

  ss_log_cont = ss_log.getvalue()
  n_bad_helices = ss_log_cont.count("Bad HELIX")
  n_bad_sheets = ss_log_cont.count("Bad SHEET")
  if ss_annot is None or ss_annot.is_empty():
    print >> out, "No SS annotation, nothing to analyze"
    return
  if n_bad_helices > 0:
    print >> out, "Number of bad helices: %d" % n_bad_helices
  # The sheet message must depend on the sheet counter (it used to be guarded
  # by the helix counter, a copy-paste error).
  if n_bad_sheets > 0:
    print >> out, "Number of bad sheets: %d" % n_bad_sheets
  if len(pdb_h.models()) != 1:
    raise Sorry("Multiple models not supported.")
  if not pdb_h.contains_protein():
    print >> out, "Protein is not found in the model"
    return
  if pdb_h.is_ca_only():
    print >> out, "Error: CA-only model"
    return
  if is_ca_and_something(pdb_h):
    print >> out, "CA-only and something model"
    return
  if some_chains_are_ca(pdb_h):
    print >> out, "some chains are CA-only"
    return

  corrupted_cs = False
  if cs is not None:
    if [cs.unit_cell(), cs.space_group()].count(None) > 0:
      corrupted_cs = True
      cs = None
    elif cs.unit_cell().volume() < 10:
      corrupted_cs = True
      cs = None

  if cs is None:
    if corrupted_cs:
      print >> out, "Symmetry information is corrupted, "
    else:
      print >> out, "Symmetry information was not found, "
    print >> out, "putting molecule in P1 box."
    from cctbx import uctbx
    atoms = pdb_structure.atoms()
    box = uctbx.non_crystallographic_unit_cell_with_the_sites_in_its_center(
        sites_cart=atoms.extract_xyz(),
        buffer_layer=3)
    atoms.set_xyz(new_xyz=box.sites_cart)
    cs = box.crystal_symmetry()

  n_total_helix_sheet_records = ss_annot.get_n_helices() + ss_annot.get_n_sheets()
  n_bad_helix_sheet_records = 0
  # Empty stuff:
  empty_annots = ss_annot.remove_empty_annotations(pdb_h)
  number_of_empty_helices = empty_annots.get_n_helices()
  number_of_empty_sheets = empty_annots.get_n_sheets()
  # Records without atoms in the model count as bad records.
  n_bad_helix_sheet_records += (number_of_empty_helices + number_of_empty_sheets)
  if number_of_empty_helices > 0:
    print >> out, "Helices without corresponding atoms in the model (%d):" % number_of_empty_helices
    for h in empty_annots.helices:
      print >> out, " ", h.as_pdb_str()
  if number_of_empty_sheets > 0:
    print >> out, "Sheets without corresponding atoms in the model (%d):" % number_of_empty_sheets
    for sh in empty_annots.sheets:
      print >> out, " ", sh.as_pdb_str()

  print >> out, "Checking annotations thoroughly, use nproc=<number> if it is too slow..."

  # One work item per SS element: ([helix], []) or ([], [sheet]); helices
  # first, then sheets, so results can be zipped with helices + sheets below.
  hsh_tuples = []
  for h in ss_annot.helices:
    hsh_tuples.append(([h], []))
  for sh in ss_annot.sheets:
    hsh_tuples.append(([], [sh]))
  calc_ss_stats = gather_ss_stats(
      pdb_h,
      mediocre_hbond_cutoff=work_params.mediocre_hbond_cutoff,
      bad_hbond_cutoff=work_params.bad_hbond_cutoff)
  results = []
  if len(hsh_tuples) > 0:
    results = easy_mp.pool_map(
        processes=work_params.nproc,
        fixed_func=calc_ss_stats,
        args=hsh_tuples)

  cumm_n_hbonds = 0
  cumm_n_bad_hbonds = 0
  cumm_n_mediocre_hbonds = 0
  cumm_n_rama_out = 0
  cumm_n_wrong_reg = 0

  n_elem_with_wrong_rama = 0
  n_elem_with_rama_out = 0
  n_elem_with_bad_hbond = 0

  #
  # Hydrogen Bonds in Proteins: Role and Strength
  # Roderick E Hubbard, Muhammad Kamran Haider
  # ENCYCLOPEDIA OF LIFE SCIENCES & 2010, John Wiley & Sons, Ltd. www.els.net
  #
  # See also: http://proteopedia.org/wiki/index.php/Hydrogen_bonds
  #
  for ss_elem, r in zip(ss_annot.helices + ss_annot.sheets, results):
    if r is not None:
      # The fourth item of the result tuple is not used in this report.
      n_hbonds, n_bad_hbonds, n_mediocre_hbonds, _hb_lens, n_outliers, n_wrong_region = r
      cumm_n_hbonds += n_hbonds
      cumm_n_bad_hbonds += n_bad_hbonds
      cumm_n_mediocre_hbonds += n_mediocre_hbonds
      cumm_n_rama_out += n_outliers
      cumm_n_wrong_reg += n_wrong_region
      if n_wrong_region > 0:
        n_elem_with_wrong_rama += 1
      if n_outliers > 0:
        n_elem_with_rama_out += 1
      if n_bad_hbonds > 0:
        n_elem_with_bad_hbond += 1
      # Mediocre H-bonds alone do not make a record "bad" in the totals...
      if n_bad_hbonds + n_outliers + n_wrong_region > 0:
        n_bad_helix_sheet_records += 1
      # ...but they are enough to have the record listed in the output.
      if n_bad_hbonds + n_mediocre_hbonds + n_outliers + n_wrong_region > 0:
        # this is bad annotation, printing it to log with separate stats:
        print >> out, "Bad annotation found:"
        print >> out, "%s" % ss_elem.as_pdb_str()
        print >> out, " Total hb: %d, mediocre: %d, bad: %d, Rama outliers: %d, Rama wrong %d" % (
            n_hbonds, n_mediocre_hbonds, n_bad_hbonds, n_outliers, n_wrong_region)
        print >> out, "-" * 80

  # n1 = percentage of bad SS elements (per given model);
  # bad here means: n_bad_hbonds + n_outliers + n_wrong_region > 0
  n1 = safe_div(n_bad_helix_sheet_records, n_total_helix_sheet_records) * 100.
  # n2 = percentage of SS elements that have at least one residue belonging to a wrong region of Ramachandran plot (per given model);
  n2 = safe_div(n_elem_with_wrong_rama, n_total_helix_sheet_records) * 100.
  # n3 = percentage of SS elements that have at least one residue being a Ramachandran plot outlier (per given model);
  n3 = safe_div(n_elem_with_rama_out, n_total_helix_sheet_records) * 100.
  # n4 = percentage of bad H bonds (per given model).
  n4 = safe_div(cumm_n_bad_hbonds, cumm_n_hbonds) * 100.  # No per SS element separation
  # percentage of SS elements that have at least one bad H bond (per given model)
  n5 = safe_div(n_elem_with_bad_hbond, n_total_helix_sheet_records) * 100.
  print >> out, "Overall info:"
  print >> out, " Total HELIX+SHEET recods :", n_total_helix_sheet_records
  print >> out, " Total bad HELIX+SHEET recods :", n_bad_helix_sheet_records
  print >> out, " Total declared H-bonds :", cumm_n_hbonds
  print >> out, " Total mediocre H-bonds (%.1f-%.1fA):" % (
      work_params.mediocre_hbond_cutoff, work_params.bad_hbond_cutoff), \
      cumm_n_mediocre_hbonds
  print >> out, " Total bad H-bonds (>%.1fA) :" % work_params.bad_hbond_cutoff, \
      cumm_n_bad_hbonds
  print >> out, " Total Ramachandran outliers :", cumm_n_rama_out
  print >> out, " Total wrong Ramachandrans :", cumm_n_wrong_reg
  print >> out, "All done."

  if work_params.filter_annotation:
    filtered_ann = ss_annot.filter_annotation(hierarchy=pdb_h)
    print >> out, "Filtered annotation:"
    print >> out, filtered_ann.as_pdb_str()
  return group_args(
      n_total_helix_sheet_records=n_total_helix_sheet_records,
      n_bad_helix_sheet_records=n_bad_helix_sheet_records,
      n_hbonds=cumm_n_hbonds,
      n_mediocre_hbonds=cumm_n_mediocre_hbonds,
      n_bad_hbonds=cumm_n_bad_hbonds,
      n_rama_out=cumm_n_rama_out,
      n_wrong_reg=cumm_n_wrong_reg,
      n1=n1,
      n2=n2,
      n3=n3,
      n4=n4,
      n5=n5)