def transfer_energy(atoms, sas):
    """Calculate transfer energy for each osmolyte and amino acid type.

    .. todo:: implement max/min for Creamer model

    :param list(pqr.Atom) atoms: list of atoms
    :param SolventAccessibleSurface sas: solvent-accessible surface area
        object
    :returns: DataFrame with energy contribution (kcal/mol/M) by residue type
    :rtype: pd.DataFrame
    """
    ref_model = ReferenceModels()
    counts = count_residues(atoms)
    folded_areas = get_folded_areas(atoms, sas)
    unfolded_areas = ref_model.molecule_areas(atoms, model="auton", how="mean")
    tripeptide_areas = ref_model.molecule_areas(
        atoms, model="tripeptide", how="mean"
    )
    scaled_areas = (unfolded_areas - folded_areas) / tripeptide_areas
    # Glycine has no side chain, so zero out its side-chain contribution
    scaled_areas.loc["GLY", "sidechain"] = 0.0
    osmolytes = list(ENERGY_DICT["backbone"].keys())
    residue_energies = {}
    bb_energy = {osmolyte: 0.0 for osmolyte in osmolytes}
    for res, area in scaled_areas.T.items():
        sc_energy = {}
        for osmolyte in osmolytes:
            sc_energy[osmolyte] = (
                counts[res] * ENERGY_DICT[res][osmolyte] * area["sidechain"]
            )
            bb_energy[osmolyte] += (
                counts[res]
                * ENERGY_DICT["backbone"][osmolyte]
                * area["backbone"]
            )
        residue_energies[res] = sc_energy
    residue_energies["backbone"] = bb_energy
    # Scale to kcal/mol/M (the units documented in the docstring)
    return pd.DataFrame(residue_energies).T.sort_index() / 1000
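# Example usage of transfer_energy (a minimal sketch, not part of the package
# API; the file name is hypothetical, the surface parameters mirror those used
# in the tests, and it assumes the xyz_path argument of
# SolventAccessibleSurface is optional):
#
#     with open("protein.pqr", "rt") as pqr_file:
#         atoms = parse_pqr_file(pqr_file)
#     sas = SolventAccessibleSurface(atoms, probe_radius=1.4, num_points=200)
#     energy_df = transfer_energy(atoms, sas)
#     # Per-osmolyte m-values (kcal/mol/M) summed over residue types,
#     # as done in main() below
#     print(energy_df.sum(axis=0).sort_values())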
def main(args=None):
    """Main entry point.

    :param list(str) args: list of strings to parse
    :returns: dictionary of results
    :rtype: dict
    """
    parser = build_parser()
    args = parser.parse_args(args)
    logging.basicConfig(level=getattr(logging, args.log_level))
    _LOGGER.info(f"Osmolytes version {osmolytes.__version__}")
    _LOGGER.debug(f"Got command-line arguments: {args}")
    _LOGGER.info(f"Reading PQR input from {args.pqr_path}...")
    with open(args.pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    _LOGGER.info(
        f"Constructing protein solvent-accessible surface with "
        f"solvent radius {args.solvent_radius} and {args.surface_points} "
        f"points per atom."
    )
    sas = SolventAccessibleSurface(
        atoms,
        probe_radius=args.solvent_radius,
        num_points=args.surface_points,
        xyz_path=args.surface_output,
    )
    counts = count_residues(atoms)
    areas = get_folded_areas(atoms, sas).rename(
        {"sidechain": "sidechain areas", "backbone": "backbone areas"},
        axis="columns",
    )
    areas["residue counts"] = counts
    areas = areas[["residue counts", "sidechain areas", "backbone areas"]]
    _LOGGER.info(f"Protein composition:\n{areas}")
    _LOGGER.info("Calculating transfer energies.")
    energy_df = transfer_energy(atoms, sas)
    _LOGGER.info(f"Detailed energies (kcal/mol/M):\n{energy_df.to_string()}")
    energies = energy_df.sum(axis=0).sort_values()
    _LOGGER.info(f"Summary energies (kcal/mol/M):\n{energies}")
    output_dir = Path(args.output_dir)
    if args.output == "xlsx":
        output_path = output_dir / f"{Path(args.pqr_path).stem}-mvalues.xlsx"
        _LOGGER.info(f"Writing energies (kcal/mol/M) to {output_path}")
        energy_df.to_excel(output_path)
    elif args.output == "csv":
        output_path = output_dir / f"{Path(args.pqr_path).stem}-mvalues.csv"
        _LOGGER.info(f"Writing energies (kcal/mol/M) to {output_path}")
        energy_df.to_csv(output_path)
    return {
        "args": args,
        "atoms": atoms,
        "sas": sas,
        "energy_df": energy_df,
        "energies": energies,
    }
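# Example programmatic call of main() (a sketch; it assumes build_parser()
# accepts the PQR path as a positional argument and that the remaining
# options have defaults -- inferred from the attributes used above, not
# verified against the parser definition):
#
#     results = main(["protein.pqr"])
#     print(results["energies"])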
def test_residue_count(protein):
    """Test residue counts per residue type against the values reported in
    Auton and Bolen (doi:10.1073/pnas.0507053102, Supporting Table 2)."""
    pqr_path = PROTEIN_PATH / f"{protein}.pqr"
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    test_counts = count_residues(atoms).sort_index()
    ref_path = PROTEIN_PATH / "Auton-Bolen.yaml"
    with open(ref_path, "rt") as ref_file:
        ref_dict = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_dict = ref_dict[protein]
    ref_df = pd.DataFrame(ref_dict).T
    ref_df = ref_df[ref_df["number"] > 0]
    ref_counts = np.array(ref_df["number"].sort_index())
    results_df = pd.DataFrame({"Test": test_counts, "Ref": ref_counts})
    _LOGGER.info(f"Results for {protein}:\n{results_df}")
    np.testing.assert_equal(test_counts, ref_counts)
def test_auton_sasa(protein, tmp_path):
    """Test aggregate SASAs per residue type as reported in Auton and Bolen
    (doi:10.1073/pnas.0507053102, Supporting Table 2)."""
    _LOGGER.info("Temp path: %s", tmp_path)
    pqr_path = PROTEIN_PATH / f"{protein}.pqr"
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    xyz_path = Path(tmp_path) / "surface.xyz"
    sas = SolventAccessibleSurface(
        atoms, probe_radius=1.4, num_points=200, xyz_path=xyz_path
    )
    test_areas = [sas.atom_surface_area(iatom) for iatom in range(len(atoms))]
    test_areas = aggregate(
        atoms,
        test_areas,
        chain_id=False,
        res_name=True,
        res_num=False,
        sidechain=True,
    )
    test_areas = test_areas.set_index("res_name").sort_index()
    test_counts = np.array(count_residues(atoms))
    ref_path = PROTEIN_PATH / "Auton-Bolen.yaml"
    with open(ref_path, "rt") as ref_file:
        ref_dict = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_dict = ref_dict[protein]
    ref_df = pd.DataFrame(ref_dict).T
    ref_df = ref_df[ref_df["number"] > 0]
    ref_counts = np.array(ref_df["number"].sort_index())
    np.testing.assert_equal(test_counts, ref_counts)
    for group in ["backbone", "sidechain"]:
        ref = np.array(ref_df[group])
        test = np.array(test_areas[test_areas["sidechain"] == group]["value"])
        # Test correlation since we're not 100% sure what parameters were
        # used in the PNAS paper
        results = linregress(ref, test)
        info = (
            f"{group} areas are correlated with results from paper: "
            f"{results}"
        )
        _LOGGER.info(info)
        if not np.isclose(results.pvalue, 0):
            err = f"Poor fit to {group} areas: {results}"
            raise AssertionError(err)
def molecule_areas(self, atoms, model, how="mean"):
    """Return reference areas for the molecule.

    :param list(pqr.Atom) atoms: list of atoms in molecule
    :param str model: model to use

        - creamer: denatured areas from Creamer, Srinivasan, Rose.
          doi:10.1021/bi962819o
        - auton: denatured areas from Auton, Bolen.
          doi:10.1073/pnas.0507053102
        - tripeptide: Gly-X-Gly areas for residue X from Auton, Bolen.
          doi:10.1073/pnas.0507053102

    :param str how: calculation to perform on model values

        - max
        - min
        - mean

    :returns: DataFrame with a row for each residue type and columns for
        sidechain and backbone areas
    :rtype: pd.DataFrame
    """
    count = count_residues(atoms)
    rows = {}
    for res, num in count.items():
        res_areas = self.residue_area(res, model, how)
        row = {}
        for what in ["sidechain", "backbone"]:
            row[what] = num * res_areas[what]
        rows[res] = row
    return pd.DataFrame(rows).T.sort_index()
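# Example usage of molecule_areas (a sketch; `atoms` is assumed to come from
# parse_pqr_file, as elsewhere in this package):
#
#     ref_model = ReferenceModels()
#     unfolded_areas = ref_model.molecule_areas(atoms, model="auton", how="mean")
#     tripeptide_areas = ref_model.molecule_areas(
#         atoms, model="tripeptide", how="mean"
#     )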