Esempio n. 1
0
def transfer_energy(atoms, sas):
    """Calculate transfer energy for each osmolyte and amino acid type.

    Scaled areas are computed per residue type as
    ``(unfolded - folded) / tripeptide``, where the unfolded and tripeptide
    areas come from :class:`ReferenceModels` (``"auton"`` and
    ``"tripeptide"`` models, ``how="mean"``) and the folded areas come from
    :func:`get_folded_areas`.  For every osmolyte listed under
    ``ENERGY_DICT["backbone"]`` the sidechain energy of each residue type is
    ``count * ENERGY_DICT[res][osmolyte] * scaled sidechain area``; backbone
    terms are accumulated over all residue types into a single ``"backbone"``
    row.

    .. todo:: implement max/min for Creamer model

    :param list(pqr.Atom) atoms:  list of atoms
    :param SolventAccessibleSurface sas: solvent-accessible surface area object
    :returns:  DataFrame with energy contribution (kcal/mol) by residue type;
        rows are residue types plus ``"backbone"`` (sorted by label), columns
        are osmolytes
    :rtype:  pd.DataFrame
    """
    ref_model = ReferenceModels()
    counts = count_residues(atoms)
    folded_areas = get_folded_areas(atoms, sas)
    unfolded_areas = ref_model.molecule_areas(atoms, model="auton", how="mean")
    tripeptide_areas = ref_model.molecule_areas(
        atoms, model="tripeptide", how="mean"
    )
    scaled_areas = (unfolded_areas - folded_areas) / tripeptide_areas
    # Force the glycine sidechain term to zero (presumably because the
    # tripeptide sidechain reference area is zero, leaving the ratio
    # undefined -- confirm).  A single ``.loc`` write is used instead of
    # chained indexing so the assignment reliably lands on the frame itself;
    # the membership guard avoids ``.loc`` appending a new row when the
    # molecule contains no glycine.
    if "GLY" in scaled_areas.index:
        scaled_areas.loc["GLY", "sidechain"] = 0.0
    osmolytes = list(ENERGY_DICT["backbone"].keys())
    residue_energies = {}
    bb_energy = {osmolyte: 0.0 for osmolyte in osmolytes}
    # Columns of the transpose are residue types; ``items()`` replaces the
    # ``iteritems()`` alias that was removed in pandas 2.0.
    for res, area in scaled_areas.T.items():
        sc_energy = {}
        for osmolyte in osmolytes:
            sc_energy[osmolyte] = (
                counts[res] * ENERGY_DICT[res][osmolyte] * area["sidechain"]
            )
            bb_energy[osmolyte] += (
                counts[res]
                * ENERGY_DICT["backbone"][osmolyte]
                * area["backbone"]
            )
        residue_energies[res] = sc_energy
    residue_energies["backbone"] = bb_energy
    # NOTE(review): the division by 1000 presumably converts cal to kcal;
    # confirm against the units of ENERGY_DICT.
    return pd.DataFrame(residue_energies).T.sort_index() / 1000
Esempio n. 2
0
def main(args=None):
    """Main entry point.

    Parses the command line, reads the PQR file, builds the
    solvent-accessible surface, logs the protein composition and the
    transfer energies, and optionally writes the detailed energies to an
    ``.xlsx`` or ``.csv`` file named ``<pqr stem>-mvalues.<ext>`` inside
    ``args.output_dir``.

    :param list(str) args:  list of strings to parse; forwarded to the
        parser's ``parse_args``
    :returns:  dictionary of results with keys ``"args"`` (parsed
        arguments), ``"atoms"``, ``"sas"``, ``"energy_df"`` (detailed
        energies) and ``"energies"`` (column sums of ``energy_df``, sorted
        by value)
    :rtype:  dict
    """
    parser = build_parser()
    args = parser.parse_args(args)
    logging.basicConfig(level=getattr(logging, args.log_level))
    _LOGGER.info(f"Osmolytes version {osmolytes.__version__}")
    _LOGGER.debug(f"Got command-line arguments: {args}")
    _LOGGER.info(f"Reading PQR input from {args.pqr_path}...")
    with open(args.pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    _LOGGER.info(
        f"Constructing protein solvent-accessible surface with "
        f"solvent radius {args.solvent_radius} and {args.surface_points} "
        f"points per atom."
    )
    sas = SolventAccessibleSurface(
        atoms,
        probe_radius=args.solvent_radius,
        num_points=args.surface_points,
        xyz_path=args.surface_output,
    )
    counts = count_residues(atoms)
    areas = get_folded_areas(atoms, sas).rename(
        {"sidechain": "sidechain areas", "backbone": "backbone areas"},
        axis="columns"
    )
    areas["residue counts"] = counts
    # Put the counts first so the logged table reads count -> areas.
    areas = areas[["residue counts", "sidechain areas", "backbone areas"]]
    _LOGGER.info(f"Protein composition:\n{areas}")
    _LOGGER.info("Calculating transfer energies.")
    energy_df = transfer_energy(atoms, sas)
    _LOGGER.info(f"Detailed energies (kcal/mol/M):\n{energy_df.to_string()}")
    energies = energy_df.sum(axis=0).sort_values()
    _LOGGER.info(f"Summary energies (kcal/mol/M):\n{energies}")
    output_dir = Path(args.output_dir)
    # One writer per supported format; any other value of ``args.output``
    # means nothing is written.
    writers = {"xlsx": energy_df.to_excel, "csv": energy_df.to_csv}
    writer = writers.get(args.output)
    if writer is not None:
        output_path = (
            output_dir / f"{Path(args.pqr_path).stem}-mvalues.{args.output}"
        )
        _LOGGER.info(f"Writing energies (kcal/mol/M) to {output_path}")
        writer(output_path)
    return {
        "args": args,
        "atoms": atoms,
        "sas": sas,
        "energy_df": energy_df,
        "energies": energies,
    }
Esempio n. 3
0
def test_residue_count(protein):
    """Test per-residue-type counts against the numbers reported in Auton and
    Bolen (doi:10.1073/pnas.0507053102, Supporting Table 2).

    Residue types with a reference count of zero are ignored; both sides are
    ordered by residue label before being compared element-wise.
    """
    # Counts computed from the PQR structure.
    with open(PROTEIN_PATH / f"{protein}.pqr", "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    observed = count_residues(atoms).sort_index()

    # Reference counts for this protein from the YAML table.
    with open(PROTEIN_PATH / "Auton-Bolen.yaml", "rt") as ref_file:
        all_refs = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_table = pd.DataFrame(all_refs[protein]).T
    nonzero = ref_table.loc[ref_table["number"] > 0, "number"]
    expected = np.array(nonzero.sort_index())

    comparison = pd.DataFrame({"Test": observed, "Ref": expected})
    _LOGGER.info(f"Results for {protein}:\n{comparison}")
    np.testing.assert_equal(observed, expected)
Esempio n. 4
0
def test_auton_sasa(protein, tmp_path):
    """Test aggregate SASAs per residue type as reported in Auton and Bolen
    (doi:10.1073/pnas.0507053102, Supporting Table 2).

    Per-atom surface areas are aggregated by residue name and
    sidechain/backbone group, then regressed against the reference areas.
    The test fails if the residue counts differ from the reference or if the
    regression p-value for either group is not close to zero.

    :param str protein:  name of the protein; selects ``<protein>.pqr`` and
        the matching entry in ``Auton-Bolen.yaml``
    :param tmp_path:  temporary directory that receives ``surface.xyz``
    """
    _LOGGER.info("Temp path:  %s", tmp_path)
    pqr_path = PROTEIN_PATH / f"{protein}.pqr"
    with open(pqr_path, "rt") as pqr_file:
        atoms = parse_pqr_file(pqr_file)
    xyz_path = Path(tmp_path) / "surface.xyz"
    sas = SolventAccessibleSurface(
        atoms, probe_radius=1.4, num_points=200, xyz_path=xyz_path
    )
    test_areas = [sas.atom_surface_area(iatom) for iatom in range(len(atoms))]
    test_areas = aggregate(
        atoms,
        test_areas,
        chain_id=False,
        res_name=True,
        res_num=False,
        sidechain=True,
    )
    test_areas = test_areas.set_index("res_name").sort_index()
    # Every comparison below is positional, so both the computed and the
    # reference values are explicitly ordered by residue name.
    test_counts = np.array(count_residues(atoms).sort_index())
    ref_path = PROTEIN_PATH / "Auton-Bolen.yaml"
    with open(ref_path, "rt") as ref_file:
        ref_dict = yaml.load(ref_file, Loader=yaml.FullLoader)
    ref_dict = ref_dict[protein]
    ref_df = pd.DataFrame(ref_dict).T
    # Drop residue types absent from the protein, then sort once so that the
    # counts and both area columns share the ordering of ``test_areas``.
    ref_df = ref_df[ref_df["number"] > 0].sort_index()
    ref_counts = np.array(ref_df["number"])
    np.testing.assert_equal(test_counts, ref_counts)
    for group in ["backbone", "sidechain"]:
        ref = np.array(ref_df[group])
        test = np.array(test_areas[test_areas["sidechain"] == group]["value"])
        # Test correlation since we're not 100% sure what parameters were used
        # in PNAS paper
        results = linregress(ref, test)
        info = (
            f"{group} areas are correlated with results from paper:  "
            f"{results}"
        )
        _LOGGER.info(info)
        if not np.isclose(results.pvalue, 0):
            err = f"Poor fit to {group} areas:  {results}"
            raise AssertionError(err)
Esempio n. 5
0
    def molecule_areas(self, atoms, model, how="mean"):
        """Return reference areas for the molecule.

        For every residue type present in ``atoms`` the per-residue reference
        area returned by :meth:`residue_area` is multiplied by the number of
        residues of that type.

        :param list(pqr.Atom) atoms:  list of atoms in molecule
        :param str model:  model to use

          - creamer: denatured areas from Creamer, Srinivasan, Rose.
            doi: 10.1021/bi962819o
          - auton: denatured areas from Auton, Bolen
            doi: 10.1073/pnas.0507053102
          - tripeptide:  Gly-X-Gly areas for residue X from Auton, Bolen
            doi: 10.1073/pnas.0507053102

        :param str how:  calculation to perform on model values

          - max
          - min
          - mean

        :returns:  DataFrame with a row for each residue type (sorted by
            residue label) and the columns ``sidechain`` and ``backbone``
        :rtype:  pd.DataFrame
        """
        count = count_residues(atoms)
        rows = {}
        # ``Series.items()`` replaces the ``iteritems()`` alias that was
        # removed in pandas 2.0.
        for res, num in count.items():
            res_areas = self.residue_area(res, model, how)
            rows[res] = {
                what: num * res_areas[what]
                for what in ("sidechain", "backbone")
            }
        return pd.DataFrame(rows).T.sort_index()