Example #1
0
def main(args=None):
    """Main entry point.

    Parses the command line, reads the PQR input, builds the
    solvent-accessible surface, logs the protein composition and the
    transfer energies, and optionally writes the detailed energy table to
    ``args.output_dir``.

    :param list(str) args:  list of strings to parse; handed to the parser's
        ``parse_args``
    :returns:  dictionary of results with the keys ``args``, ``atoms``,
        ``sas``, ``energy_df`` and ``energies``
    :rtype:  dict
    """
    parser = build_parser()
    args = parser.parse_args(args)
    logging.basicConfig(level=getattr(logging, args.log_level))
    _LOGGER.info(f"Osmolytes version {osmolytes.__version__}")
    _LOGGER.debug(f"Got command-line arguments: {args}")
    _LOGGER.info(f"Reading PQR input from {args.pqr_path}...")
    with open(args.pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    _LOGGER.info(
        f"Constructing protein solvent-accessible surface with "
        f"solvent radius {args.solvent_radius} and {args.surface_points} "
        f"points per atom."
    )
    sas = SolventAccessibleSurface(
        atoms,
        probe_radius=args.solvent_radius,
        num_points=args.surface_points,
        xyz_path=args.surface_output,
    )
    counts = count_residues(atoms)
    areas = get_folded_areas(atoms, sas).rename(
        {"sidechain": "sidechain areas", "backbone": "backbone areas"},
        axis="columns",
    )
    areas["residue counts"] = counts
    # Reorder the columns so the counts lead the logged composition table.
    areas = areas[["residue counts", "sidechain areas", "backbone areas"]]
    _LOGGER.info(f"Protein composition:\n{areas}")
    _LOGGER.info("Calculating transfer energies.")
    energy_df = transfer_energy(atoms, sas)
    _LOGGER.info(f"Detailed energies (kcal/mol/M):\n{energy_df.to_string()}")
    # Column-wise totals, smallest first.
    energies = energy_df.sum(axis=0).sort_values()
    _LOGGER.info(f"Summary energies (kcal/mol/M):\n{energies}")
    output_dir = Path(args.output_dir)
    # Supported output formats; any other value of args.output writes nothing.
    writers = {"xlsx": energy_df.to_excel, "csv": energy_df.to_csv}
    write_table = writers.get(args.output)
    if write_table is not None:
        stem = Path(args.pqr_path).stem
        output_path = output_dir / f"{stem}-mvalues.{args.output}"
        _LOGGER.info(f"Writing energies (kcal/mol/M) to {output_path}")
        write_table(output_path)
    return {
        "args": args,
        "atoms": atoms,
        "sas": sas,
        "energy_df": energy_df,
        "energies": energies,
    }
Example #2
0
def test_atom_sasa(molecule, tmp_path):
    """Compare per-atom solvent-accessible areas of a small molecule with
    reference values.

    Fails when any per-atom deviation (scaled by the total reference area)
    or the deviation of the summed area exceeds a tolerance of 0.03.
    """
    _LOGGER.info("Temp path:  %s", tmp_path)
    atom_tolerance = 0.03
    total_tolerance = 0.03
    pqr_path = Path("tests/data/alkanes") / ("%s.pdb" % molecule)
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    surface = SolventAccessibleSurface(
        atoms, 0.65, 200, xyz_path=Path(tmp_path) / f"{molecule}.xyz"
    )
    test_areas = np.array(
        [surface.atom_surface_area(index) for index in range(len(atoms))]
    )
    ref_areas = np.array(ATOM_AREAS[molecule])
    ref_total = np.sum(ref_areas)
    test_total = np.sum(test_areas)
    abs_diff = np.absolute(test_areas - ref_areas)
    # Per-atom differences are expressed as a fraction of the total area.
    rel_diff = abs_diff / ref_total
    labels = [f"{atom.pqr_atom_num} {atom.atom_name}" for atom in atoms]
    table = pd.DataFrame(
        index=labels,
        data={
            "Ref values": ref_areas,
            "Test values": test_areas,
            "Abs diff": abs_diff,
            "Rel diff": rel_diff,
        },
    )
    _LOGGER.info("\n%s", table.sort_values("Abs diff", ascending=False))
    failures = []
    if np.any(rel_diff > atom_tolerance):
        _LOGGER.error("\n%s", table.sort_values("Rel diff", ascending=False))
        failures.append(
            "Per-atom relative differences exceed tolerance (%g)"
            % atom_tolerance
        )
    abs_error = np.absolute(test_total - ref_total)
    rel_error = abs_error / ref_total
    if rel_error > total_tolerance:
        _LOGGER.warning(
            f"Tolerance exceeded: {total_tolerance}\n"
            f"Reference area: {ref_total}\n"
            f"Test area: {test_total}\n"
            f"Absolute difference: {abs_error}\n"
            f"Relative difference: {rel_error}\n"
        )
        failures.append(
            "Total relative difference exceeds tolerance (%g)"
            % total_tolerance
        )
    # Collect every failure first so a single run reports all of them.
    if failures:
        raise AssertionError(";".join(failures))
Example #3
0
def test_residue_count(protein):
    """Check residue-type counts against the numbers listed for the protein
    in the Auton and Bolen reference file (doi:10.1073/pnas.0507053102,
    Supporting Table 2)."""
    with open(PROTEIN_PATH / f"{protein}.pqr", "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    test_counts = count_residues(atoms).sort_index()
    with open(PROTEIN_PATH / "Auton-Bolen.yaml", "rt") as ref_file:
        reference = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_df = pd.DataFrame(reference[protein]).T
    # Only residue types that actually occur in the reference are compared.
    present = ref_df[ref_df["number"] > 0]
    ref_counts = np.array(present["number"].sort_index())
    results_df = pd.DataFrame({"Test": test_counts, "Ref": ref_counts})
    _LOGGER.info(f"Results for {protein}:\n{results_df}")
    np.testing.assert_equal(test_counts, ref_counts)
Example #4
0
def test_auton_sasa(protein, tmp_path):
    """Check that backbone and sidechain areas aggregated by residue name
    correlate with the values reported in Auton and Bolen
    (doi:10.1073/pnas.0507053102, Supporting Table 2)."""
    _LOGGER.info("Temp path:  %s", tmp_path)
    with open(PROTEIN_PATH / f"{protein}.pqr", "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    surface = SolventAccessibleSurface(
        atoms,
        probe_radius=1.4,
        num_points=200,
        xyz_path=Path(tmp_path) / "surface.xyz",
    )
    atom_areas = [
        surface.atom_surface_area(index) for index in range(len(atoms))
    ]
    aggregated = aggregate(
        atoms,
        atom_areas,
        chain_id=False,
        res_name=True,
        res_num=False,
        sidechain=True,
    )
    test_areas = aggregated.set_index("res_name").sort_index()
    test_counts = np.array(count_residues(atoms))
    with open(PROTEIN_PATH / "Auton-Bolen.yaml", "rt") as ref_file:
        reference = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_df = pd.DataFrame(reference[protein]).T
    ref_df = ref_df[ref_df["number"] > 0]
    ref_counts = np.array(ref_df["number"].sort_index())
    np.testing.assert_equal(test_counts, ref_counts)
    for group in ("backbone", "sidechain"):
        ref = np.array(ref_df[group])
        in_group = test_areas["sidechain"] == group
        test = np.array(test_areas[in_group]["value"])
        # Only correlation is checked because the exact parameters used in
        # the PNAS paper are not known with certainty.
        fit = linregress(ref, test)
        _LOGGER.info(
            f"{group} areas are correlated with results from paper:  "
            f"{fit}"
        )
        if np.isclose(fit.pvalue, 0):
            continue
        raise AssertionError(f"Poor fit to {group} areas:  {fit}")
Example #5
0
def main():
    """Sum per-atom SASA values from APBS output into per-residue totals.

    Reads the PQR file and the APBS output named on the command line, adds
    up the per-atom values under ``chain_id`` -> ``"<res_num> <res_name>"``,
    and writes the nested dictionary as indented JSON.
    """
    args = build_parser().parse_args()
    with open(args.pqr, "rt") as pqr_file:
        atoms = pqr.parse_pqr_file(pqr_file)
    with open(args.apbs_output, "rt") as apbs_file:
        sasa_values = parse_apbs_output(apbs_file)
    totals = {}
    for index, atom in enumerate(atoms):
        # One inner dictionary per chain, created on first sight.
        per_chain = totals.setdefault(atom.chain_id, {})
        label = f"{atom.res_num} {atom.res_name}"
        per_chain[label] = per_chain.get(label, 0.0) + sasa_values[index]
    with open(args.json_output, "wt") as json_file:
        json.dump(totals, json_file, indent=2)
Example #6
0
def test_parsing(pqr_path, chain_id):
    """Verify the first 40 parsed atoms against ``PQR_DATA``, for PQR files
    with and without chain identifiers."""
    with open(PROTEIN_PATH / pqr_path) as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    # Attributes stored in reference columns 0-4, in that order.
    exact_fields = (
        "entry_type",
        "pqr_atom_num",
        "atom_name",
        "res_name",
        "res_num",
    )
    for index in range(40):
        parsed = atoms[index]
        expected = PQR_DATA[index]
        for column, field in enumerate(exact_fields):
            assert getattr(parsed, field) == expected[column]
        assert parsed.chain_id == chain_id
        # Reference columns 5-7 hold the coordinates, 8 the charge and
        # 9 the radius; all are compared to three decimal places.
        expected_position = np.array([expected[5], expected[6], expected[7]])
        np.testing.assert_almost_equal(
            parsed.position, expected_position, decimal=3
        )
        np.testing.assert_almost_equal(parsed.charge, expected[8], decimal=3)
        np.testing.assert_almost_equal(parsed.radius, expected[9], decimal=3)
Example #7
0
def test_protein_details(pqr_path, ref_json, tmp_path):
    """Test SASA performance for proteins.

    Builds a surface with a 1.4 probe radius and 2000 points per atom,
    aggregates the per-atom areas with ``chain_id`` and ``res_num`` enabled,
    and compares the result with the reference values in ``ref_json``.
    The test fails if too many entries differ by more than ``abs_cutoff``
    or if the total area differs by more than the absolute or relative
    total cutoffs.  All failures are collected and reported together.

    :param pqr_path:  PQR file name, relative to ``PROTEIN_PATH``
    :param ref_json:  reference JSON file name, relative to ``PROTEIN_PATH``
    :param tmp_path:  directory that receives the surface XYZ file
    :raises AssertionError:  if any cutoff is exceeded
    """
    _LOGGER.info("Temp path:  %s", tmp_path)
    abs_cutoff = 2.5
    num_abs_cutoff = 26
    # NOTE(review): the two relative cutoffs below are defined but never
    # enforced by this test -- confirm whether a per-entry relative check
    # was intended.
    rel_cutoff = 0.1
    num_rel_cutoff = 33
    tot_abs_cutoff = 24.0
    tot_rel_cutoff = 0.002
    pqr_path = PROTEIN_PATH / pqr_path
    ref_json = PROTEIN_PATH / ref_json
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    xyz_path = Path(tmp_path) / "surface.xyz"
    sas = SolventAccessibleSurface(
        atoms, probe_radius=1.4, num_points=2000, xyz_path=xyz_path
    )
    test_areas = [sas.atom_surface_area(iatom) for iatom in range(len(atoms))]
    df = aggregate(
        atoms,
        test_areas,
        chain_id=True,
        res_name=False,
        res_num=True,
        sidechain=False,
    )
    with open(ref_json, "rt") as ref_file:
        ref_dict = json.load(ref_file)
    keys = []
    ref_values = []
    test_values = []
    # Walk the chains in reference order so test and reference values line
    # up position by position.
    for chain, ref_chain in ref_dict.items():
        chain_df = df[df["chain_id"] == chain]
        test_values.extend(chain_df["value"].values)
        keys.extend(f"{chain} {key}" for key in ref_chain)
        ref_values.extend(ref_chain.values())
    ref_values = np.array(ref_values)
    test_values = np.array(test_values)
    abs_diff = np.absolute(ref_values - test_values)
    rel_diff = np.divide(abs_diff, ref_values)
    df = pd.DataFrame(
        index=keys,
        data={
            "Ref values": ref_values,
            "Test values": test_values,
            "Abs diff": abs_diff,
            "Rel diff": rel_diff,
        },
    )
    _LOGGER.info(
        "Largest absolute differences:\n%s", df.nlargest(20, "Abs diff")
    )
    # Drop rows containing NaN before ranking the relative differences.
    df = df.dropna()
    _LOGGER.info(
        "Largest relative differences:\n%s", df.nlargest(20, "Rel diff")
    )
    errors = []
    num_abs = int(np.count_nonzero(abs_diff > abs_cutoff))
    if num_abs > num_abs_cutoff:
        errors.append(
            f"Number of absolute differences ({num_abs}) above {abs_cutoff} "
            f"exceeds limit ({num_abs_cutoff})"
        )
    ref_total = np.sum(ref_values)
    test_total = np.sum(test_values)
    abs_total = np.absolute(ref_total - test_total)
    if abs_total > tot_abs_cutoff:
        errors.append(
            f"Absolute difference in total area ({abs_total}) exceeds "
            f"limit ({tot_abs_cutoff})"
        )
    rel_total = abs_total / ref_total
    if rel_total > tot_rel_cutoff:
        errors.append(
            f"Relative difference in total area ({rel_total}) exceeds "
            f"limit ({tot_rel_cutoff})"
        )
    if errors:
        raise AssertionError(";".join(errors))
Example #8
0
def test_insertion_codes(protein):
    """Smoke test: parsing a PQR file that has insertion codes must not
    raise."""
    with open(PROTEIN_PATH / f"{protein}.pqr", "rt") as handle:
        parse_pqr_file(handle)