def main(args=None):
    """Main entry point.

    :param list(str) args: list of strings to parse
    :returns: dictionary of results
    :rtype: dict
    """
    parser = build_parser()
    args = parser.parse_args(args)
    logging.basicConfig(level=getattr(logging, args.log_level))
    _LOGGER.info(f"Osmolytes version {osmolytes.__version__}")
    _LOGGER.debug(f"Got command-line arguments: {args}")
    _LOGGER.info(f"Reading PQR input from {args.pqr_path}...")
    with open(args.pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    _LOGGER.info(
        f"Constructing protein solvent-accessible surface with "
        f"solvent radius {args.solvent_radius} and {args.surface_points} "
        f"points per atom."
    )
    sas = SolventAccessibleSurface(
        atoms,
        probe_radius=args.solvent_radius,
        num_points=args.surface_points,
        xyz_path=args.surface_output,
    )
    counts = count_residues(atoms)
    areas = get_folded_areas(atoms, sas).rename(
        {"sidechain": "sidechain areas", "backbone": "backbone areas"},
        axis="columns",
    )
    areas["residue counts"] = counts
    areas = areas[["residue counts", "sidechain areas", "backbone areas"]]
    _LOGGER.info(f"Protein composition:\n{areas}")
    _LOGGER.info("Calculating transfer energies.")
    energy_df = transfer_energy(atoms, sas)
    _LOGGER.info(f"Detailed energies (kcal/mol/M):\n{energy_df.to_string()}")
    energies = energy_df.sum(axis=0).sort_values()
    _LOGGER.info(f"Summary energies (kcal/mol/M):\n{energies}")
    output_dir = Path(args.output_dir)
    if args.output == "xlsx":
        output_path = output_dir / f"{Path(args.pqr_path).stem}-mvalues.xlsx"
        _LOGGER.info(f"Writing energies (kcal/mol/M) to {output_path}")
        energy_df.to_excel(output_path)
    elif args.output == "csv":
        output_path = output_dir / f"{Path(args.pqr_path).stem}-mvalues.csv"
        _LOGGER.info(f"Writing energies (kcal/mol/M) to {output_path}")
        energy_df.to_csv(output_path)
    return {
        "args": args,
        "atoms": atoms,
        "sas": sas,
        "energy_df": energy_df,
        "energies": energies,
    }
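# A minimal usage sketch for calling main() programmatically. The flag names
# and PQR path below are assumptions about build_parser() (it is not defined
# in this module); check the parser for the real option names and defaults.
#
#     results = main(["--output", "csv", "some-protein.pqr"])
#     print(results["energies"])  # summed transfer energies (kcal/mol/M)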
def test_atom_sasa(molecule, tmp_path):
    """Test per-atom solvent-accessible surface areas for small molecules."""
    _LOGGER.info("Temp path: %s", tmp_path)
    atom_tolerance = 0.03
    total_tolerance = 0.03
    pqr_path = Path("tests/data/alkanes") / f"{molecule}.pdb"
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    xyz_path = Path(tmp_path) / f"{molecule}.xyz"
    sas = SolventAccessibleSurface(atoms, 0.65, 200, xyz_path=xyz_path)
    test_areas = np.array(
        [sas.atom_surface_area(iatom) for iatom in range(len(atoms))]
    )
    ref_areas = np.array(ATOM_AREAS[molecule])
    abs_diff = np.absolute(test_areas - ref_areas)
    # Note: per-atom differences are normalized by the *total* reference area
    rel_diff = abs_diff / np.sum(ref_areas)
    keys = [f"{atom.pqr_atom_num} {atom.atom_name}" for atom in atoms]
    df = pd.DataFrame(
        index=keys,
        data={
            "Ref values": ref_areas,
            "Test values": test_areas,
            "Abs diff": abs_diff,
            "Rel diff": rel_diff,
        },
    )
    _LOGGER.info("\n%s", df.sort_values("Abs diff", ascending=False))
    errors = []
    if np.any(rel_diff > atom_tolerance):
        _LOGGER.error("\n%s", df.sort_values("Rel diff", ascending=False))
        errors.append(
            "Per-atom relative differences exceed tolerance (%g)"
            % atom_tolerance
        )
    abs_error = np.absolute(np.sum(test_areas) - np.sum(ref_areas))
    rel_error = abs_error / np.sum(ref_areas)
    if rel_error > total_tolerance:
        _LOGGER.warning(
            f"Tolerance exceeded: {total_tolerance}\n"
            f"Reference area: {np.sum(ref_areas)}\n"
            f"Test area: {np.sum(test_areas)}\n"
            f"Absolute difference: {abs_error}\n"
            f"Relative difference: {rel_error}\n"
        )
        errors.append(
            "Total relative difference exceeds tolerance (%g)"
            % total_tolerance
        )
    if errors:
        raise AssertionError(";".join(errors))
def test_residue_count(protein):
    """Test residue counts per residue type against Auton and Bolen
    (doi:10.1073/pnas.0507053102, Supporting Table 2)."""
    pqr_path = PROTEIN_PATH / f"{protein}.pqr"
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    test_counts = count_residues(atoms).sort_index()
    ref_path = PROTEIN_PATH / "Auton-Bolen.yaml"
    with open(ref_path, "rt") as ref_file:
        ref_dict = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_dict = ref_dict[protein]
    ref_df = pd.DataFrame(ref_dict).T
    ref_df = ref_df[ref_df["number"] > 0]
    ref_counts = np.array(ref_df["number"].sort_index())
    results_df = pd.DataFrame({"Test": test_counts, "Ref": ref_counts})
    _LOGGER.info(f"Results for {protein}:\n{results_df}")
    np.testing.assert_equal(test_counts, ref_counts)
def test_auton_sasa(protein, tmp_path):
    """Test aggregate SASAs per residue type as reported in Auton and Bolen
    (doi:10.1073/pnas.0507053102, Supporting Table 2)."""
    _LOGGER.info("Temp path: %s", tmp_path)
    pqr_path = PROTEIN_PATH / f"{protein}.pqr"
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    xyz_path = Path(tmp_path) / "surface.xyz"
    sas = SolventAccessibleSurface(
        atoms, probe_radius=1.4, num_points=200, xyz_path=xyz_path
    )
    test_areas = [sas.atom_surface_area(iatom) for iatom in range(len(atoms))]
    test_areas = aggregate(
        atoms,
        test_areas,
        chain_id=False,
        res_name=True,
        res_num=False,
        sidechain=True,
    )
    test_areas = test_areas.set_index("res_name").sort_index()
    test_counts = np.array(count_residues(atoms))
    ref_path = PROTEIN_PATH / "Auton-Bolen.yaml"
    with open(ref_path, "rt") as ref_file:
        ref_dict = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_dict = ref_dict[protein]
    ref_df = pd.DataFrame(ref_dict).T
    ref_df = ref_df[ref_df["number"] > 0]
    ref_counts = np.array(ref_df["number"].sort_index())
    np.testing.assert_equal(test_counts, ref_counts)
    for group in ["backbone", "sidechain"]:
        ref = np.array(ref_df[group])
        test = np.array(test_areas[test_areas["sidechain"] == group]["value"])
        # Test correlation since we're not 100% sure what parameters were
        # used in the PNAS paper
        results = linregress(ref, test)
        info = (
            f"{group} areas are correlated with results from paper: "
            f"{results}"
        )
        _LOGGER.info(info)
        if not np.isclose(results.pvalue, 0):
            err = f"Poor fit to {group} areas: {results}"
            raise AssertionError(err)
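# Assumed layout of the Auton-Bolen.yaml reference file, inferred from the
# parsing above (protein -> residue type -> number/backbone/sidechain).
# Residue names and values below are placeholders, not data from the file:
#
#     <protein>:
#       ALA:
#         number: 9
#         backbone: 350.1
#         sidechain: 210.7
#       ARG:
#         number: 3
#         backbone: 120.0
#         sidechain: 310.2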
def main():
    """Main driver."""
    parser = build_parser()
    args = parser.parse_args()
    with open(args.pqr, "rt") as pqr_file:
        atoms = pqr.parse_pqr_file(pqr_file)
    with open(args.apbs_output, "rt") as apbs_file:
        sasa_values = parse_apbs_output(apbs_file)
    sasa_dict = {}
    for iatom, atom in enumerate(atoms):
        if atom.chain_id not in sasa_dict:
            sasa_dict[atom.chain_id] = {}
        chain_dict = sasa_dict[atom.chain_id]
        residue_key = f"{atom.res_num} {atom.res_name}"
        if residue_key not in chain_dict:
            chain_dict[residue_key] = 0.0
        chain_dict[residue_key] += sasa_values[iatom]
    with open(args.json_output, "wt") as json_file:
        json.dump(sasa_dict, json_file, indent=2)
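# The JSON written above maps chain ID -> "res_num res_name" -> summed SASA
# for that residue, e.g. (illustrative values only):
#
#     {
#       "A": {"1 MET": 123.4, "2 ALA": 56.7},
#       "B": {"1 GLY": 89.0}
#     }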
def test_parsing(pqr_path, chain_id):
    """Test PQR parsing with and without chains."""
    pqr_path = PROTEIN_PATH / pqr_path
    with open(pqr_path) as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    for iatom in range(40):
        test_atom = atoms[iatom]
        ref_atom = PQR_DATA[iatom]
        assert test_atom.entry_type == ref_atom[0]
        assert test_atom.pqr_atom_num == ref_atom[1]
        assert test_atom.atom_name == ref_atom[2]
        assert test_atom.res_name == ref_atom[3]
        assert test_atom.res_num == ref_atom[4]
        assert test_atom.chain_id == chain_id
        ref_pos = np.array([ref_atom[5], ref_atom[6], ref_atom[7]])
        np.testing.assert_almost_equal(test_atom.position, ref_pos, decimal=3)
        np.testing.assert_almost_equal(test_atom.charge, ref_atom[8], decimal=3)
        np.testing.assert_almost_equal(test_atom.radius, ref_atom[9], decimal=3)
def test_protein_details(pqr_path, ref_json, tmp_path):
    """Test SASA performance for proteins."""
    _LOGGER.info("Temp path: %s", tmp_path)
    abs_cutoff = 2.5
    num_abs_cutoff = 26
    rel_cutoff = 0.1
    num_rel_cutoff = 33
    tot_abs_cutoff = 24.0
    tot_rel_cutoff = 0.002
    pqr_path = PROTEIN_PATH / pqr_path
    ref_json = PROTEIN_PATH / ref_json
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    xyz_path = Path(tmp_path) / "surface.xyz"
    sas = SolventAccessibleSurface(
        atoms, probe_radius=1.4, num_points=2000, xyz_path=xyz_path
    )
    test_areas = [sas.atom_surface_area(iatom) for iatom in range(len(atoms))]
    df = aggregate(
        atoms,
        test_areas,
        chain_id=True,
        res_name=False,
        res_num=True,
        sidechain=False,
    )
    with open(ref_json, "rt") as ref_file:
        ref_dict = json.load(ref_file)
    keys = []
    ref_values = []
    test_values = []
    for chain in ref_dict.keys():
        chain_df = df[df["chain_id"] == chain]
        test_values = test_values + list(chain_df["value"].values)
        keys = keys + [f"{chain} {key}" for key in ref_dict[chain].keys()]
        ref_values = ref_values + list(ref_dict[chain].values())
    ref_values = np.array(ref_values)
    test_values = np.array(test_values)
    abs_diff = np.absolute(ref_values - test_values)
    rel_diff = np.divide(abs_diff, ref_values)
    df = pd.DataFrame(
        index=keys,
        data={
            "Ref values": ref_values,
            "Test values": test_values,
            "Abs diff": abs_diff,
            "Rel diff": rel_diff,
        },
    )
    _LOGGER.info(
        "Largest absolute differences:\n%s", df.nlargest(20, "Abs diff")
    )
    df = df.dropna()
    _LOGGER.info(
        "Largest relative differences:\n%s", df.nlargest(20, "Rel diff")
    )
    errors = []
    num_abs = len(abs_diff[abs_diff > abs_cutoff])
    if num_abs > num_abs_cutoff:
        err = (
            f"Number of absolute differences ({num_abs}) above {abs_cutoff} "
            f"exceeds limit ({num_abs_cutoff})"
        )
        errors.append(err)
    num_rel = len(rel_diff[rel_diff > rel_cutoff])
    if num_rel > num_rel_cutoff:
        err = (
            f"Number of relative differences ({num_rel}) above {rel_cutoff} "
            f"exceeds limit ({num_rel_cutoff})"
        )
        errors.append(err)
    ref_total = np.sum(ref_values)
    test_total = np.sum(test_values)
    abs_total = np.absolute(ref_total - test_total)
    if abs_total > tot_abs_cutoff:
        err = (
            f"Absolute difference in total area ({abs_total}) exceeds "
            f"limit ({tot_abs_cutoff})"
        )
        errors.append(err)
    rel_total = abs_total / ref_total
    if rel_total > tot_rel_cutoff:
        err = (
            f"Relative difference in total area ({rel_total}) exceeds "
            f"limit ({tot_rel_cutoff})"
        )
        errors.append(err)
    if errors:
        raise AssertionError(";".join(errors))
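# The reference JSON read above is assumed to follow the same
# chain ID -> "res_num res_name" -> area layout written by the APBS
# comparison driver earlier in this section; that assumption is inferred
# from the key construction in the test, not stated in the code.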
def test_insertion_codes(protein):
    """Test whether proteins with insertion codes break the software."""
    pqr_path = PROTEIN_PATH / f"{protein}.pqr"
    with open(pqr_path, "rt") as pqr_file:
        parse_pqr_file(pqr_file)