def important_pfam(seqs_from_pdb_hmm):
    """Store per-domain and "important_pfam" residue sets from an HMMER report.

    Each query id in the ``hmmer3-text`` file is expected to split on ``_``
    into ``pdb, chain, start, end``. For every hit with at least one HSP, a
    residue set named after the hit is built from the first HSP: one residue
    per column of the second aligned sequence, numbered consecutively from
    ``start``. Columns whose character equals its own upper-case form are also
    added to the shared "important_pfam" set. Structures that already carry an
    "important_pfam" set, or that are not found in the database, are skipped.

    :param seqs_from_pdb_hmm: path or handle of the HMMER3 text output.
    """
    for query in tqdm(bpsio.parse(seqs_from_pdb_hmm, 'hmmer3-text')):
        try:
            pdb, chain, start, end = query.id.split("_")  # @UnusedVariable
            already_annotated = ExperimentalStructure.objects(
                name=pdb, residue_sets__name="important_pfam").count()
            if already_annotated:
                continue

            strdoc = ExperimentalStructure.objects(name=pdb).get()
            if strdoc.residue_set("important_pfam"):
                continue

            important_rs = ResidueSet(name="important_pfam")
            for hit in query:
                if not len(hit):
                    continue
                first_hsp = hit[0]
                domain_rs = ResidueSet(name=hit.id)
                for offset, symbol in enumerate(str(first_hsp.aln[1].seq)):
                    residue = chain + "_" + str(offset + int(start))
                    if symbol == symbol.upper():
                        important_rs.residues.append(residue)
                    domain_rs.residues.append(residue)
                if domain_rs:
                    strdoc.residue_sets.append(domain_rs)
            strdoc.residue_sets.append(important_rs)
            strdoc.save()
        except DoesNotExist:
            # Best effort: queries whose structure is not stored are ignored.
            pass
def free_cys_tyr(pdb_utils):
    """Annotate stored structures with their free cysteine / tyrosine residues.

    A CYS (atom ``SG``) or TYR (atom ``OH``) residue is reported as free when
    no atom belonging to another residue of the first model lies within a
    distance of 3 of that atom. Results are saved as the "free_cys" and
    "free_tyr" residue sets; a structure is only saved when at least one of
    the two sets is non-empty.

    Structures that already have either set, whose PDB file cannot be
    obtained, or whose file cannot be parsed are skipped.

    :param pdb_utils: helper exposing ``pdb_path(name)`` and
        ``update_pdb(name)``.
    """
    parser = PDBParser(PERMISSIVE=1, QUIET=1)
    codes = {"CYS": "SG", "TYR": "OH"}
    _log.debug("procesing free cys/tyr")
    total = ExperimentalStructure.objects(residue_sets__name__ne = "free_tyr").count()
    for strdoc in tqdm(ExperimentalStructure.objects(residue_sets__name__ne = "free_tyr").no_cache().timeout(False), total=total):

        if strdoc.residue_set("free_cys") or strdoc.residue_set("free_tyr"):
            continue

        if not os.path.exists(pdb_utils.pdb_path(strdoc.name)):
            pdb_utils.update_pdb(strdoc.name)
        pdb_path = pdb_utils.pdb_path(strdoc.name)
        if not os.path.exists(pdb_path):
            continue

        try:
            bp_pdb = list(parser.get_structure(strdoc.name, pdb_path))[0]
        except (PDBConstructionException, TypeError, IndexError):
            # IndexError: the file parsed but contains no model; skip it like
            # any other unparsable file instead of aborting the whole batch.
            continue

        free = {"CYS": [], "TYR": []}
        # The full atom set does not change while scanning this model, so
        # build it once instead of once per CYS/TYR residue.
        all_atoms = set(bp_pdb.get_atoms())
        for residue in bp_pdb.get_residues():
            atom_name = codes.get(residue.resname)
            if atom_name is None or atom_name not in residue:
                continue
            target = residue[atom_name]
            neighbor_atoms = all_atoms - set(residue)
            # Subtracting two atoms yields the distance between them.
            if not any((target - atom) <= 3 for atom in neighbor_atoms):
                free[residue.resname].append(residue.parent.id + "_" + str(residue.id[1]))

        if free["CYS"]:
            strdoc.residue_sets.append(ResidueSet(name="free_cys", residues=free["CYS"]))
        if free["TYR"]:
            strdoc.residue_sets.append(ResidueSet(name="free_tyr", residues=free["TYR"]))
        if free["CYS"] or free["TYR"]:
            strdoc.save()
Beispiel #3
0
    def free_cys_tyr(self, strdoc):
        """Recompute the cysteine / tyrosine residue sets of ``strdoc``.

        Any existing "free_cys", "free_tyr", "cys" and "tyr" sets are dropped
        and rebuilt from the first model of the structure file located with
        ``self.struct_path(self.work_dir, strdoc)``. "cys" / "tyr" list every
        CYS / TYR residue; "free_cys" / "free_tyr" list those whose ``SG`` /
        ``OH`` atom has no atom of another residue within a distance of 3.
        Empty sets are not added. The document is modified in place and is
        not saved here.

        :param strdoc: structure document whose ``residue_sets`` are updated.
        """
        parser = PDBParser(PERMISSIVE=1, QUIET=1)

        strdoc.residue_sets = [x for x in strdoc.residue_sets if x.name not in ["free_cys", "free_tyr", "cys", "tyr"]]
        struct_path = self.struct_path(self.work_dir, strdoc)
        bp_pdb = list(parser.get_structure(strdoc.name, struct_path))[0]
        free = {"CYS": [], "TYR": []}
        _all = {"CYS": [], "TYR": []}
        codes = {"CYS": "SG", "TYR": "OH"}
        # The full atom set does not change while scanning this model, so
        # build it once instead of once per CYS/TYR residue.
        all_atoms = set(bp_pdb.get_atoms())
        for residue in bp_pdb.get_residues():
            if residue.resname not in codes:
                continue
            aa_code = residue.parent.id + "_" + str(residue.id[1])
            _all[residue.resname].append(aa_code)
            atom_name = codes[residue.resname]
            if atom_name not in residue:
                continue
            target = residue[atom_name]
            neighbor_atoms = all_atoms - set(residue)
            # Subtracting two atoms yields the distance between them.
            if not any((target - atom) <= 3 for atom in neighbor_atoms):
                free[residue.resname].append(aa_code)

        # Same append order as before: free_cys, free_tyr, cys, tyr.
        for rs_name, residues in (("free_cys", free["CYS"]), ("free_tyr", free["TYR"]),
                                  ("cys", _all["CYS"]), ("tyr", _all["TYR"])):
            if residues:
                strdoc.residue_sets.append(ResidueSet(name=rs_name, residues=residues))
Beispiel #4
0
    def cluster_aligned_annotation(self, model, template_aln, segment, cluster_aligned_segment, cluster_res_start):
        """Project annotations of the segment's PDB entry onto ``model``.

        Nothing is done when the id built from ``segment``
        (``pdb_chain_start_end``) equals ``template_aln.aln_hit.name``.
        Otherwise the ``ExperimentalStructure`` named ``segment.pdb`` is
        loaded, its "csa" and ``<compound>_binding`` residue sets are
        restricted to the aligned range, mapped to query positions and
        appended to ``model.residue_sets`` as ``*_projected`` residue sets.

        :param model: object whose ``residue_sets`` list receives the results.
        :param template_aln: alignment against the template; read through
            ``aln_hit``, ``aln_query``, ``hit_res_end``, ``is_gap`` and
            ``map_pos_hit_query``.
        :param segment: cluster segment; read through ``pdb``, ``chain``,
            ``start``, ``end``, ``clust_start`` and ``clust_end``.
        :param cluster_aligned_segment: residue set whose first and last
            ``residue_numbers()`` are compared with the segment's cluster range.
        :param cluster_res_start: offset subtracted from template residue
            numbers when mapping the end of the hit.
        """
        # Bare attribute access; the value is discarded.
        # NOTE(review): looks like a leftover statement - confirm nothing relies on it.
        segment.clust_start
        segment_start = int(segment.start)
        segment_end = int(segment.end)
        # Note: the last component is derived from str(segment.end), not from segment_end.
        segment_str = "_".join(
            [segment.pdb, segment.chain, str(segment_start).split(".")[0], str(segment.end).split(".")[0]])
        if segment_str != template_aln.aln_hit.name:
            # presumably raises DoesNotExist when the entry is missing - it is not handled here
            template_eq = ExperimentalStructure.objects(name=segment.pdb).get()
            #   prot             IVAGRVSQKMAPVLRQIYDQMAEPKWVLAMGVCAS
            #   template       **IVAGAAS--MAPVLQQIYDQM--PKWVLAMGVC--**
            #   template_eq      -----AS--MAPVVQQILDQ---------------
            #   template_eq2   **IVAAAS--MAPVVQQILDQ---------------
            #   template_eq3   **IVAAAS--MAPVVQQILDQQM--PKWVLAMGVC--**

            # Only used below to compute eq_end_residue from template_aln.hit_res_end.
            template_res_to_cluster_res_map = lambda template_res: (
                                                                           template_res - cluster_res_start) - segment.clust_start + segment_start

            # Start of the range: the segment start, shifted by the hit start when the
            # aligned residues begin before segment.clust_start.
            alignment_start_after_cluster_start = cluster_aligned_segment.residue_numbers()[0] >= segment.clust_start
            eq_start_residue = segment_start if alignment_start_after_cluster_start else segment_start + template_aln.aln_hit.start

            # End of the range: the segment end, or the mapped end of the hit when the
            # aligned residues extend past segment.clust_end.
            alignment_end_before_cluster_end = cluster_aligned_segment.residue_numbers()[-1] <= segment.clust_end
            eq_end_residue = segment_end if alignment_end_before_cluster_end else template_res_to_cluster_res_map(
                template_aln.hit_res_end)

            # One residue id per alignment column i for which is_gap(i) is false,
            # numbered as i + eq_start_residue on the segment chain.
            eq_aligned_segment = ResidueSet(name="aln", residues=[segment.chain + "_" + str(i + eq_start_residue)
                                                                  for i in range(len(template_aln.aln_query.seq)) if
                                                                  not template_aln.is_gap(i)])

            def map_eq_resid_to_query_resid(eq_res_id, template_aln, segment, segment_start):
                # Residue number -> offset from segment_start -> shifted by the cluster
                # start -> query position via the alignment.
                pos_in_eq = eq_res_id - segment_start
                pos_in_template = pos_in_eq + segment.clust_start
                pos_in_query = template_aln.map_pos_hit_query(pos_in_template)  # - template_aln.aln_hit.start
                return pos_in_query

            # CSA residues inside the range that are also part of the aligned segment.
            eq_aligned_csas = (template_eq.residue_set("csa").in_range(eq_start_residue,
                                                                       eq_end_residue) & eq_aligned_segment).residue_numbers()
            if eq_aligned_csas:
                aligned_csas_projected = [map_eq_resid_to_query_resid(eq_res_id, template_aln, segment, segment_start)
                                          for eq_res_id in eq_aligned_csas]
                # Projected ids carry no chain: they are written as "_<position>".
                residue_set_csa = ResidueSet(name="csa_" + segment.pdb, type="catalitic_projected",
                                             residues=["_" + str(resid) for resid in aligned_csas_projected])
                model.residue_sets.append(residue_set_csa)

            # Same projection for every compound type's "<type>_binding" set.
            for comp_type in main_compound_types:
                binding_name = comp_type.lower() + "_binding"
                eq_aligned_binding = (template_eq.residue_set(binding_name).in_range(eq_start_residue, eq_end_residue)
                                      & eq_aligned_segment).residue_numbers()
                if eq_aligned_binding:
                    # Unlike the CSA branch, falsy residue numbers are filtered out here.
                    aligned_binding_projected = [
                        map_eq_resid_to_query_resid(eq_res_id, template_aln, segment, segment_start) for eq_res_id in
                        eq_aligned_binding if eq_res_id]
                    residue_set_binding = ResidueSet(name=binding_name + "_" + segment.pdb,
                                                     type=binding_name + "_projected",
                                                     residues=["_" + str(resid) for resid in aligned_binding_projected])
                    model.residue_sets.append(residue_set_binding)
Beispiel #5
0
 def pocket_residue_set(pockets_json,structure_atoms):
     """Build one "pocket" residue set per pocket described in a JSON file.

     Each pocket entry provides ``number``, ``properties`` and ``atoms``. The
     properties are copied onto the residue set under the keys given by
     ``eq2``; the residues are the distinct ``<chain id>_<residue number>``
     ids of the atoms in ``structure_atoms`` whose serial number (as a string)
     is found in the pocket's ``atoms``. Pockets without residues are dropped.

     :param pockets_json: path of the JSON file describing the pockets.
     :param structure_atoms: atoms of the structure the pockets refer to.
     :return: list of residue sets, one per pocket with at least one residue.
     """
     with open(pockets_json) as handle:
         pockets = json.load(handle)

     pocket_sets = []
     for pocket in pockets:
         pocket_rs = ResidueSet(name="Pocket_" + str(pocket["number"]), type="pocket", residues=[])
         for prop_name, prop_value in pocket["properties"].items():
             pocket_rs[eq2[prop_name]] = prop_value

         # A set keeps each residue once even when several of its atoms match.
         residue_ids = set()
         for atom in structure_atoms:
             if str(atom.serial_number) in pocket["atoms"]:
                 residue_ids.add(atom.parent.parent.id + "_" + str(atom.parent.id[1]))
         pocket_rs.residues = list(residue_ids)

         if pocket_rs.residues:
             pocket_sets.append(pocket_rs)
     return pocket_sets
Beispiel #6
0
    def aligned_annoations(self, model):
        """Project template CSA and binding residue sets onto the model.

        Residue sets of ``model`` whose name starts with "csa" or with any
        ``<compound>_binding`` prefix are discarded first. Then, for every
        template alignment whose hit start and end are not both -1, the
        template's "csa" and ``<compound>_binding`` sets are intersected with
        the aligned (non-gap) template residues and translated to query
        residue ids (written as " _<q_resid>"). Non-empty results are appended
        to ``model.residue_sets``. Templates missing from the database are
        reported with ``print`` and skipped.

        :param model: object providing ``templates`` and ``residue_sets``.
        """
        projected_prefixes = tuple(["csa"] + [y.lower() + "_binding" for y in main_compound_types])
        model.residue_sets = [rs for rs in model.residue_sets
                              if not rs.name.startswith(projected_prefixes)]

        for template_aln in model.templates:
            pdb, chain = template_aln.aln_hit.name.split("_")
            segment_start = template_aln.aln_hit.start
            segment_end = template_aln.aln_hit.end

            try:
                template = ExperimentalStructure.objects(name=pdb).get()
            except Structure.DoesNotExist as ex:
                print([ex,pdb])
                continue

            # Both bounds are always converted before they are combined.
            start_unset = int(segment_start) == -1
            end_unset = int(segment_end) == -1
            if start_unset and end_unset:
                continue

            start_residue = template_aln[0].h_resid
            end_residue = template_aln[-1].h_resid

            aligned_ids = []
            for i in range(len(template_aln.aln_query.seq)):
                if not template_aln.is_gap(i):
                    aligned_ids.append(chain + "_" + str(template_aln[i].h_resid))
            aligned_segment = ResidueSet(name="aln", residues=aligned_ids)

            def to_query_residue(pdb_resid):
                # Template residue number -> query residue id without chain.
                return " _" + str(template_aln.aln_pos_from_h_resid(pdb_resid).q_resid)

            aligned_csas = template.residue_set("csa").in_range(start_residue, end_residue) & aligned_segment
            if len(aligned_csas):
                csa_query = [to_query_residue(int(res.split("_")[1])) for res in aligned_csas.residues]
                model.residue_sets.append(
                    ResidueSet(name="csa_" + pdb, residues=csa_query, type="catalitic_projected"))

            for comp_type in main_compound_types:
                binding_name = comp_type.lower() + "_binding"
                in_range_binding = template.residue_set(binding_name).in_range(start_residue, end_residue)
                aligned_binding = in_range_binding & aligned_segment

                binding_query = [to_query_residue(pdb_resid) for pdb_resid in
                                 aligned_binding.residue_numbers()]
                if binding_query:
                    model.residue_sets.append(
                        ResidueSet(name=binding_name + "_" + pdb, residues=binding_query,
                                   type=binding_name + "_projected"))
def update_binding_residues(distances_tbl):
    """Store ``<compound type>_binding`` residue sets from a distance table.

    The tab-separated, header-less table lists protein residue / compound
    atom pairs with their distance. For every PDB entry and compound type,
    the residues (``<chain>_<prot_res>``) with a distance of 3 or less to a
    compound of that type are saved as a residue set of type "binding",
    unless the structure already has a set with that name. PDB entries that
    are not stored are logged and skipped.

    :param distances_tbl: path of the tab-separated distances table.
    :raises KeyError: if a compound residue name is missing from
        ``compound_type``.
    """
    df_binding_dist = pd.read_table(distances_tbl, sep="\t",
                                    names=[
                                        "pdb", "chain", "hmm_name", "prot_res", "resname",
                                        "res_atom_id",
                                        "comp_res_id", "comp_resname", "comp_atom_id", "distance"
                                    ])
    # Build a real list: on Python 3 map() returns a lazy iterator, which is
    # not a usable column value. Indexing keeps the KeyError on unknown names.
    df_binding_dist["comptype"] = [compound_type[x] for x in df_binding_dist.comp_resname]

    groups = df_binding_dist.groupby("pdb")
    total = len(groups)
    _log.debug("procesing binding")
    for pdb, df_binding_dist_pdb in tqdm(groups, total=total):

        try:
            for r_comp_type in ['LIPID', 'METAL', 'NUCLEOTIDE', 'SUGAR', "DRUG", "COFACTOR"]:
                # NOTE(review): "COFACTOR" is rewritten to "DRUG" before filtering, so this
                # iteration repeats the DRUG query and rows typed COFACTOR are never selected.
                # Confirm whether cofactors were meant to be merged into "drug_binding".
                comp_type = "DRUG" if r_comp_type == "COFACTOR" else r_comp_type

                df_comp_dist_pdb_near = df_binding_dist_pdb[
                    (df_binding_dist_pdb.distance <= 3) & (comp_type == df_binding_dist_pdb.comptype)]
                if not len(df_comp_dist_pdb_near):
                    continue

                strdoc = ExperimentalStructure.objects(name=pdb).no_cache().get()
                rs_name = comp_type.lower() + "_binding"
                if strdoc.has_residue_set(rs_name):
                    continue

                # A set removes residues that are close to several compound atoms.
                residue_list = list({row.chain + "_" + str(row.prot_res) for _, row in
                                     df_comp_dist_pdb_near.iterrows()})
                strdoc.residue_sets.append(ResidueSet(name=rs_name, residues=residue_list, type="binding"))
                strdoc.save()

        except DoesNotExist:
            # Logger.warn is a deprecated alias that Python 3.13 removed.
            _log.warning("%s does not exists" % pdb)
Beispiel #8
0
    def important_pfam(self, domains, model):
        """Add domain and important-position residue sets to ``model``.

        For each domain a residue set named ``<identifier>_<start>_<end>`` is
        built from the query positions of the domain that fall inside the
        first template's aligned query range. It replaces any set with the
        same name, but only when more than 80% of the domain is covered. For
        covered domains with a Pfam profile, the alignment columns that map
        to the profile's important positions are accumulated, across all
        domains, in a single set named ``StructureAnotator.important_pfam_rs``.

        :param domains: domain features exposing ``identifier``, ``location``
            and ``aln``.
        :param model: object providing ``templates`` and ``residue_sets``.
        """
        template = model.templates[0]

        important_rs = ResidueSet(name=StructureAnotator.important_pfam_rs)

        for domain in domains:

            domain_id = "_".join([domain.identifier, str(domain.location.start), str(domain.location.end)])
            domain_rs = ResidueSet(name=domain_id, type="domain",
                                   residues=[" _" + str(template.aln_pos_from_query_pos(i).q_resid)
                                             for i in range(domain.location.start, domain.location.end + 1)
                                             if i >= template.aln_query.start
                                             and i <= template.aln_query.end
                                             and i < len(template.aln_query.original_seq())
                                             ])

            model.residue_sets = [x for x in model.residue_sets if x.name != domain_id]

            # Make sure that more than 80% of the domain is modelled in the structure.
            if (1.0 * len(domain_rs) / len(domain)) <= 0.8:
                continue

            model.residue_sets.append(domain_rs)

            model.residue_sets = [x for x in model.residue_sets if x.name != StructureAnotator.important_pfam_rs]
            try:
                profile = PFamProfile.create_profile(domain.identifier)
            except ProfileNotFoundError as ex:
                # Logger.warn is a deprecated alias that Python 3.13 removed.
                _log.warning(ex)
            else:
                important = profile.important_positions()
                if important:
                    profile_seq_map, _, aln_profile_map = profile.map_alignment_simple(domain.aln.aln_hit.seq,
                                                                                       domain.aln.aln_hit.start)
                    for i in range(len(domain.aln.aln_hit.seq)):
                        if (i in aln_profile_map) and (not domain.aln.is_gap(i)):
                            profile_pos = aln_profile_map[i]
                            if profile_pos in important:
                                seq_pos = profile_seq_map[profile_pos]
                                important_rs.residues.append(" _" + str(seq_pos))

            # The important set was just removed from the model, so always put it
            # back when it holds residues. Previously a domain without a profile
            # (or without important positions) dropped the residues collected
            # from earlier domains.
            if important_rs:
                model.residue_sets.append(important_rs)
def update_csa(csa_txt):
    df_csa = pd.read_csv(csa_txt)

    for pdb in tqdm(set(df_csa["PDB ID"])):
        try:
            strdoc = ExperimentalStructure.objects(name=pdb).no_cache().get()
        except ExperimentalStructure.DoesNotExist:
            _log.warn(pdb + " csa pdb does not exists...")
            continue

        pdb = strdoc.name
        pdb_csa = df_csa[df_csa["PDB ID"] == pdb]
        if len(pdb_csa) > 0:
            if not strdoc.has_residue_set("csa"):
                csas = [str(row["CHAIN ID"]) + "_" + str(row["RESIDUE NUMBER"]) for i, row in
                        pdb_csa.iterrows()]  # @UnusedVariable
                csa_res_set = ResidueSet(name="csa", type="catalitic", residues=csas)
                strdoc.residue_sets.append(csa_res_set)