def free_cys_tyr(pdb_utils):
    """Annotate structures with their free cysteine / tyrosine residues.

    Walks every ``ExperimentalStructure`` selected by the query
    ``residue_sets__name__ne="free_tyr"`` that has neither a ``free_cys`` nor
    a ``free_tyr`` residue set yet.  For each one the PDB file is parsed
    (it is first requested through ``pdb_utils.update_pdb`` when missing) and
    the first model is scanned: a CYS (atom ``SG``) or TYR (atom ``OH``) is
    reported as free when no atom belonging to another residue lies at a
    distance ``<= 3`` from that atom.  Matching residues are stored as
    ``"<chain id>_<residue number>"`` in ``free_cys`` / ``free_tyr`` residue
    sets and the document is saved when at least one was found.

    Structures whose file is unavailable, cannot be parsed, or contains no
    model are skipped.

    :param pdb_utils: object providing ``pdb_path(name)`` and
        ``update_pdb(name)``.
    """
    parser = PDBParser(PERMISSIVE=1, QUIET=1)
    _log.debug("procesing free cys/tyr")
    # Atom whose surroundings decide whether the residue counts as free.
    codes = {"CYS": "SG", "TYR": "OH"}
    pending = {"residue_sets__name__ne": "free_tyr"}
    total = ExperimentalStructure.objects(**pending).count()
    for strdoc in tqdm(ExperimentalStructure.objects(**pending).no_cache().timeout(False), total=total):
        if strdoc.residue_set("free_cys") or strdoc.residue_set("free_tyr"):
            continue

        pdb_path = pdb_utils.pdb_path(strdoc.name)
        if not os.path.exists(pdb_path):
            pdb_utils.update_pdb(strdoc.name)
            pdb_path = pdb_utils.pdb_path(strdoc.name)
            if not os.path.exists(pdb_path):
                continue

        try:
            bp_pdb = list(parser.get_structure(strdoc.name, pdb_path))[0]
        except (PDBConstructionException, TypeError, IndexError):
            # IndexError: the file parsed but holds no model; skip it like any
            # other unusable structure instead of aborting the whole run.
            continue

        # Built once per structure instead of once per CYS/TYR residue.
        all_atoms = set(bp_pdb.get_atoms())
        free = {"CYS": [], "TYR": []}
        for residue in bp_pdb.get_residues():
            atom_name = codes.get(residue.resname)
            if atom_name is None or atom_name not in residue:
                continue
            target = residue[atom_name]
            # Atoms of the residue itself are excluded from the neighbourhood.
            neighbor_atoms = all_atoms - set(residue)
            if not any((target - atom) <= 3 for atom in neighbor_atoms):
                free[residue.resname].append(residue.parent.id + "_" + str(residue.id[1]))

        if free["CYS"]:
            strdoc.residue_sets.append(ResidueSet(name="free_cys", residues=free["CYS"]))
        if free["TYR"]:
            strdoc.residue_sets.append(ResidueSet(name="free_tyr", residues=free["TYR"]))
        if free["CYS"] or free["TYR"]:
            strdoc.save()
def important_pfam(seqs_from_pdb_hmm):
    """Store Pfam-derived residue sets parsed from an HMMER3 text report.

    Each query id is expected to look like ``pdb_chain_start_end``.  For a
    structure that has no ``important_pfam`` residue set yet, the first HSP
    of every non-empty hit is walked column by column over the second row of
    its alignment (``hsp.aln[1]``): column ``i`` is labelled
    ``"<chain>_<start + i>"``, every label goes into a residue set named
    after the hit, and labels whose alignment character equals its own
    upper-case form are also collected into ``important_pfam``.  The document
    is then saved.  Queries whose structure does not exist are ignored.

    :param seqs_from_pdb_hmm: path/handle of the ``hmmer3-text`` report.
    """
    for query in tqdm(bpsio.parse(seqs_from_pdb_hmm, 'hmmer3-text')):
        try:
            pdb, chain, start, end = query.id.split("_")  # @UnusedVariable
            already_annotated = ExperimentalStructure.objects(
                name=pdb, residue_sets__name="important_pfam").count()
            if already_annotated:
                continue

            strdoc = ExperimentalStructure.objects(name=pdb).get()
            if strdoc.residue_set("important_pfam"):
                continue

            important_rs = ResidueSet(name="important_pfam")
            for hit in query:
                if not len(hit):
                    continue
                first_hsp = hit[0]
                domain_rs = ResidueSet(name=hit.id)
                for offset, symbol in enumerate(str(first_hsp.aln[1].seq)):
                    label = chain + "_" + str(offset + int(start))
                    if symbol == symbol.upper():
                        important_rs.residues.append(label)
                    domain_rs.residues.append(label)
                if domain_rs:
                    strdoc.residue_sets.append(domain_rs)
            strdoc.residue_sets.append(important_rs)
            strdoc.save()
        except DoesNotExist:
            # No structure stored under this PDB code: nothing to annotate.
            pass
def _remark_350_value(entry_lines, label):
    """Return the text after ':' on the first line containing *label*, or ''."""
    for line in entry_lines:
        if label in line:
            return line.split(":")[1].strip()
    return ""


def _parse_remark_350(remark_lines):
    """Split REMARK 350 lines into ``(biomolecule, author, software)`` tuples.

    One tuple is produced per ``BIOMOLECULE:`` line; each entry spans from
    that line up to the next ``BIOMOLECULE:`` line (or the end of the data).
    Missing author / software assignments are returned as ``""``.
    """
    starts = [i for i, line in enumerate(remark_lines) if "BIOMOLECULE:" in line]
    biomolecules = []
    # Pair every start with the following start so no entry is skipped.
    for begin, end in zip(starts, starts[1:] + [None]):
        entry = remark_lines[begin:end]
        biomolecule = entry[0].split(":")[1].strip()
        author = _remark_350_value(entry, "AUTHOR DETERMINED BIOLOGICAL UNIT")
        software = _remark_350_value(entry, " SOFTWARE DETERMINED QUATERNARY STRUCTURE")
        biomolecules.append((biomolecule, author, software))
    return biomolecules


def _format_quaternary(biomolecules):
    """Render parsed biomolecules as the text stored in ``quaternary``.

    The author assignment is preferred and the software one is the fallback.
    A single biomolecule yields just the assignment; several yield one
    ``"- Biomolecule <id>: <assignment>"`` line each, joined by newlines.
    """
    described = [(bm_id, author or software) for bm_id, author, software in biomolecules]
    if len(described) == 1:
        return described[0][1]
    return "\n".join(f"- Biomolecule {bm_id}: {value}" for bm_id, value in described)


def update_quaternary(pdbUtils):
    '''
    Fill ``quaternary`` for every ExperimentalStructure that lacks it, using
    the REMARK 350 records of its PDB file (path from ``pdbUtils.pdb_path``).
    Structures whose file is missing or whose records cannot be parsed are
    logged at debug level and left unchanged.

    Example – Author and computed assembly predictions agree
    REMARK 350 BIOMOLECULE: 1
    REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: DODECAMERIC
    REMARK 350 SOFTWARE DETERMINED QUATERNARY STRUCTURE: DODECAMERIC

    Example – Author and computed assembly predictions differ
    REMARK 350 BIOMOLECULE: 1
    REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: HEXAMERIC
    REMARK 350 APPLY THE FOLLOWING TO CHAINS: A, B, C, D, E, F
    REMARK 350 BIOMOLECULE: 2
    REMARK 350 AUTHOR DETERMINED BIOLOGICAL UNIT: HEXAMERIC
    REMARK 350 APPLY THE FOLLOWING TO CHAINS: G, H, I, J, K, L
    REMARK 350 BIOMOLECULE: 3
    REMARK 350 SOFTWARE DETERMINED QUATERNARY STRUCTURE: DODECAMERIC
    REMARK 350 SOFTWARE USED: PISA
    REMARK 350 TOTAL BURIED SURFACE AREA: 2990 ANGSTROM**2
    REMARK 350 SURFACE AREA OF THE COMPLEX: 9330 ANGSTROM**2
    REMARK 350 CHANGE IN SOLVENT FREE ENERGY: -40.0 KCAL/MOL
    REMARK 350 APPLY THE FOLLOWING TO CHAINS: A, B, C, D, E, F, G, H, I,
    REMARK 350 AND CHAINS: J, K, L
    '''
    total = ExperimentalStructure.objects().count()

    for strdoc in tqdm(ExperimentalStructure.objects().no_cache(), total=total):
        if strdoc.quaternary:
            continue
        try:
            with open(pdbUtils.pdb_path(strdoc.name)) as h:
                remark_lines = [line for line in h if line.startswith("REMARK 350")]
            strdoc.quaternary = _format_quaternary(_parse_remark_350(remark_lines))
            strdoc.save()
        except IndexError:
            # Raised when a matched REMARK 350 line carries no ':' separator.
            _log.debug("no se puede parsear %s" % strdoc.name)
        except FileNotFoundError:
            _log.debug(f"{strdoc.name} could not be found")
def update_binding_residues(distances_tbl):
    """Create ``<type>_binding`` residue sets from a residue/compound distance table.

    The tab-separated table (no header) is read with the columns ``pdb``,
    ``chain``, ``hmm_name``, ``prot_res``, ``resname``, ``res_atom_id``,
    ``comp_res_id``, ``comp_resname``, ``comp_atom_id`` and ``distance``.
    Each compound name is translated through ``compound_type``.  For every
    PDB and compound type, the residues with ``distance <= 3`` are stored as
    ``"<chain>_<prot_res>"`` in a residue set named ``<type>_binding``
    (``type="binding"``) unless the structure already has that set.
    Structures missing from the database are reported with a warning.

    :param distances_tbl: path or buffer of the distance table.
    """
    df_binding_dist = pd.read_table(distances_tbl, sep="\t",
                                    names=[
                                        "pdb", "chain", "hmm_name", "prot_res", "resname",
                                        "res_atom_id",
                                        "comp_res_id", "comp_resname", "comp_atom_id", "distance"
                                    ])
    # A list, not ``map(...)``: on Python 3 ``map`` returns a lazy iterator
    # without a length, which cannot be assigned as a DataFrame column.
    df_binding_dist["comptype"] = [compound_type[x] for x in df_binding_dist.comp_resname]

    groups = df_binding_dist.groupby("pdb")
    total = len(groups)
    _log.debug("procesing binding")
    for pdb, df_binding_dist_pdb in tqdm(groups, total=total):
        try:
            for r_comp_type in ['LIPID', 'METAL', 'NUCLEOTIDE', 'SUGAR', "DRUG", "COFACTOR"]:
                # NOTE(review): COFACTOR is aliased to DRUG *before* filtering,
                # so rows typed COFACTOR are never selected and this pass just
                # repeats the DRUG one.  Behaviour kept as found - confirm
                # whether cofactor rows were meant to feed "drug_binding".
                comp_type = "DRUG" if r_comp_type == "COFACTOR" else r_comp_type

                near = df_binding_dist_pdb[
                    (df_binding_dist_pdb.distance <= 3) & (df_binding_dist_pdb.comptype == comp_type)]
                if near.empty:
                    continue

                strdoc = ExperimentalStructure.objects(name=pdb).no_cache().get()
                rs_name = comp_type.lower() + "_binding"
                if strdoc.has_residue_set(rs_name):
                    continue

                # ``str(chain)`` guards against chain ids parsed as numbers.
                residue_list = sorted({str(chain) + "_" + str(prot_res)
                                       for chain, prot_res in zip(near.chain, near.prot_res)})
                rs = ResidueSet(name=rs_name, residues=residue_list, type="binding")
                strdoc.residue_sets.append(rs)
                strdoc.save()

        except DoesNotExist:
            _log.warning("%s does not exists" % pdb)
Exemple #5
0
def update_clusters(clusters_fasta="/data/databases/pdb/processed/seqs_from_pdb95.fasta",
                    cluster_type="PDB_Segments_95"):
    """Attach CD-HIT cluster membership to the structures it mentions.

    ``CDHit().clustered_seq_iterator(clusters_fasta)`` is expected to yield
    ``(cluster_name, seqs)`` pairs where every item of ``seqs`` is a
    ``(seq_id, seq_start, seq_end, clust_start, clust_end)`` tuple and
    ``seq_id`` looks like ``pdb_chain_start_end``.  One ``Cluster`` with a
    ``BioProperties`` part per sequence is built for each cluster and
    appended to every distinct PDB it references.

    :param clusters_fasta: clustered FASTA handed to CD-HIT's iterator;
        defaults to the path that used to be hard-coded.
    :param cluster_type: value stored as the ``type`` of the clusters that
        are created and replaced.
    """
    for cluster_name, seqs in CDHit().clustered_seq_iterator(clusters_fasta):
        _log.debug(cluster_name)

        cristals = set()
        cluster = Cluster(name=cluster_name, type=cluster_type)
        for seq_id, seq_start, seq_end, clust_start, clust_end in seqs:
            pdb, chain, start, end = seq_id.split("_")
            cristals.add(pdb)
            cluster.parts.append(
                BioProperties(pdb=pdb,
                              chain=chain,
                              start=start,
                              end=end,
                              seq_start=seq_start,
                              seq_end=seq_end,
                              clust_start=clust_start,
                              clust_end=clust_end))

        for pdb in cristals:
            try:
                cristal_doc = ExperimentalStructure.objects(name=pdb).get()
                # NOTE(review): every cluster of this type is dropped here on
                # each pass, so a structure that belongs to several clusters
                # ends up holding only the last one processed.  Kept as found;
                # confirm whether the purge should happen once per run.
                cristal_doc.clusters = [
                    x for x in cristal_doc.clusters
                    if x.type != cluster_type
                ]
                if not cristal_doc.cluster(cluster_name):
                    cristal_doc.clusters.append(cluster)
                    cristal_doc.save()
            except DoesNotExist as ex:
                # Reported through the module logger like the rest of the function.
                _log.warning(str(ex))
Exemple #6
0
def _record_load_error(message):
    """Append *message* verbatim to the PDB loading error file."""
    with open("/tmp/pdb_load_errors.txt", "a") as handle:
        handle.write(message)


def procesar_pdb(pdb, pdbUtils):
    """Parse one PDB file and store it as an ``ExperimentalStructure``.

    The first model (``structure[0]``) is walked chain by chain: every chain
    becomes a ``Chain`` document holding its residue numbers, molecules typed
    ``'RESIDUE'`` by ``get_compound_type`` are added to the chain, and every
    other molecule except ``'SOLVENT'`` is added to the structure's ligands.
    Pockets and PDB attributes are then completed and the document is saved.

    Problems are appended to ``/tmp/pdb_load_errors.txt`` instead of being
    raised: a missing file, a structure without models, a pocket-completion
    failure (which does not prevent saving), or any other exception.

    :param pdb: PDB code of the entry.
    :param pdbUtils: object providing ``pdb_path(pdb)``.
    """
    pdb_file = pdbUtils.pdb_path(pdb)
    if not os.path.exists(pdb_file):
        _record_load_error(pdb + "|NOT FOUND: " + pdb_file + " \n")
        # Nothing to parse; avoids logging a second, redundant error.
        return
    try:
        structure = parser.get_structure(pdb, pdb_file)
        models = list(structure)
        if len(models) == 0:
            _record_load_error("Has no models: " + pdb_file + " \n")
            return

        strdoc = ExperimentalStructure(name=pdb, seq_collection_name="pdb")

        model = structure[0]
        for chain in model:
            chaindoc = Chain(name=chain.id,
                             segments=[[residue.id[1] for residue in chain]])
            strdoc.chains.append(chaindoc)

            for residue in chain:
                # Classified once and reused for both the document and the routing.
                compound_type = get_compound_type(residue)
                molecule = Molecule(
                    resid=residue.id[1],
                    chain=chain.id,
                    compound=residue.get_resname(),
                    compound_type=compound_type)
                if compound_type == 'RESIDUE':
                    chaindoc.residues.append(molecule)
                elif compound_type != 'SOLVENT':
                    strdoc.ligands.append(molecule)

        try:
            complete_pockets(pdb, strdoc, structure, pdbUtils)
        except Exception as ex:
            # Pockets stay best-effort, but the failure is no longer silent.
            _record_load_error(pdb + "|POCKETS: " + str(ex) + "\n")

        complete_pdb_attrs(pdb, strdoc, pdbUtils)
        strdoc.save()
    except Exception as ex:
        _record_load_error(pdb + "|" + str(ex) + "\n")
Exemple #7
0
    def cluster_aligned_annotation(self, model, template_aln, segment, cluster_aligned_segment, cluster_res_start):
        """Project csa / binding annotations of a cluster segment onto *model*.

        Nothing happens when the segment identifier
        (``pdb_chain_start_end``) equals ``template_aln.aln_hit.name``.
        Otherwise the structure named ``segment.pdb`` is loaded, the residue
        range of the segment that corresponds to the alignment is computed,
        and its ``csa`` and ``<type>_binding`` residues falling in that range
        and on aligned (non-gap) positions are mapped through
        ``template_aln.map_pos_hit_query``.  Each non-empty result is
        appended to ``model.residue_sets`` as a ``*_projected`` residue set
        whose residues are written ``"_<position>"``.
        """
        # Bare attribute read kept as in the original: it raises right away
        # when the segment carries no ``clust_start``.
        segment.clust_start
        first_res = int(segment.start)
        last_res = int(segment.end)
        segment_label = "_".join(
            [segment.pdb, segment.chain, str(first_res).split(".")[0], str(segment.end).split(".")[0]])
        if segment_label == template_aln.aln_hit.name:
            return

        equivalent = ExperimentalStructure.objects(name=segment.pdb).get()
        #   prot             IVAGRVSQKMAPVLRQIYDQMAEPKWVLAMGVCAS
        #   template       **IVAGAAS--MAPVLQQIYDQM--PKWVLAMGVC--**
        #   template_eq      -----AS--MAPVVQQILDQ---------------
        #   template_eq2   **IVAAAS--MAPVVQQILDQ---------------
        #   template_eq3   **IVAAAS--MAPVVQQILDQQM--PKWVLAMGVC--**

        def template_res_to_segment_res(template_res):
            return (template_res - cluster_res_start) - segment.clust_start + first_res

        starts_inside_cluster = cluster_aligned_segment.residue_numbers()[0] >= segment.clust_start
        if starts_inside_cluster:
            eq_first = first_res
        else:
            eq_first = first_res + template_aln.aln_hit.start

        ends_inside_cluster = cluster_aligned_segment.residue_numbers()[-1] <= segment.clust_end
        if ends_inside_cluster:
            eq_last = last_res
        else:
            eq_last = template_res_to_segment_res(template_aln.hit_res_end)

        # Residues of the equivalent segment that sit on non-gap alignment columns.
        aligned_labels = []
        for column in range(len(template_aln.aln_query.seq)):
            if not template_aln.is_gap(column):
                aligned_labels.append(segment.chain + "_" + str(column + eq_first))
        eq_aligned = ResidueSet(name="aln", residues=aligned_labels)

        def to_query_position(eq_res_id):
            # segment residue -> position in the template -> position in the query
            return template_aln.map_pos_hit_query(eq_res_id - first_res + segment.clust_start)

        csa_numbers = (equivalent.residue_set("csa").in_range(eq_first, eq_last) & eq_aligned).residue_numbers()
        if csa_numbers:
            csa_projected = [to_query_position(number) for number in csa_numbers]
            model.residue_sets.append(
                ResidueSet(name="csa_" + segment.pdb, type="catalitic_projected",
                           residues=["_" + str(position) for position in csa_projected]))

        for comp_type in main_compound_types:
            binding_name = comp_type.lower() + "_binding"
            binding_numbers = (equivalent.residue_set(binding_name).in_range(eq_first, eq_last)
                               & eq_aligned).residue_numbers()
            if not binding_numbers:
                continue
            # Falsy residue numbers are dropped here (unlike the csa branch).
            binding_projected = [to_query_position(number) for number in binding_numbers if number]
            model.residue_sets.append(
                ResidueSet(name=binding_name + "_" + segment.pdb,
                           type=binding_name + "_projected",
                           residues=["_" + str(position) for position in binding_projected]))
Exemple #8
0
    def aligned_annoations(self, model):
        """Rebuild the projected csa / binding residue sets of *model*.

        Residue sets whose name starts with ``csa`` or with any
        ``<type>_binding`` prefix (one per entry of ``main_compound_types``)
        are removed first.  Then, for every template alignment whose hit name
        is ``pdb_chain`` and whose hit start/end are not both ``-1``, the
        template structure's ``csa`` and ``<type>_binding`` residues lying on
        aligned (non-gap) positions are translated to query residue ids and
        appended as ``*_projected`` residue sets with residues written
        ``" _<query residue>"``.  Templates missing from the database are
        printed and skipped.
        """
        projected_prefixes = tuple(
            ["csa"] + [compound.lower() + "_binding" for compound in main_compound_types])
        model.residue_sets = [rs for rs in model.residue_sets
                              if not rs.name.startswith(projected_prefixes)]

        for template_aln in model.templates:
            pdb, chain = template_aln.aln_hit.name.split("_")
            hit_start = template_aln.aln_hit.start
            hit_end = template_aln.aln_hit.end

            try:
                template = ExperimentalStructure.objects(name=pdb).get()
            except Structure.DoesNotExist as ex:
                print([ex,pdb])
                continue

            # Both bounds are converted before being combined, as originally.
            start_unset = int(hit_start) == -1
            end_unset = int(hit_end) == -1
            if start_unset and end_unset:
                continue

            first_resid = template_aln[0].h_resid
            last_resid = template_aln[-1].h_resid

            aligned_labels = []
            for column in range(len(template_aln.aln_query.seq)):
                if not template_aln.is_gap(column):
                    aligned_labels.append(chain + "_" + str(template_aln[column].h_resid))
            aligned_segment = ResidueSet(name="aln", residues=aligned_labels)

            def query_residue(pdb_resid):
                return " _" + str(template_aln.aln_pos_from_h_resid(pdb_resid).q_resid)

            aligned_csas = template.residue_set("csa").in_range(first_resid, last_resid) & aligned_segment
            if len(aligned_csas):
                csa_query = [query_residue(int(label.split("_")[1])) for label in aligned_csas.residues]
                model.residue_sets.append(
                    ResidueSet(name="csa_" + pdb, residues=csa_query, type="catalitic_projected"))

            for comp_type in main_compound_types:
                binding_name = comp_type.lower() + "_binding"
                aligned_binding = template.residue_set(binding_name).in_range(first_resid,
                                                                              last_resid) & aligned_segment
                binding_query = [query_residue(resid) for resid in aligned_binding.residue_numbers()]
                if binding_query:
                    model.residue_sets.append(
                        ResidueSet(name=binding_name + "_" + pdb, residues=binding_query,
                                   type=binding_name + "_projected"))
def update_csa(csa_txt):
    """Store catalytic-site residues from a CSV table as ``csa`` residue sets.

    The CSV must provide the columns ``PDB ID``, ``CHAIN ID`` and
    ``RESIDUE NUMBER``.  For every PDB id whose structure exists and has no
    ``csa`` residue set yet, a set named ``csa`` (``type="catalitic"``) with
    residues written ``"<chain>_<residue number>"`` is appended and the
    document is saved.  PDB ids without a stored structure are reported with
    a warning.

    :param csa_txt: path or buffer of the CSV file.
    """
    df_csa = pd.read_csv(csa_txt)

    # One pass over the table instead of re-filtering it for every PDB id.
    for pdb, pdb_csa in tqdm(df_csa.groupby("PDB ID")):
        try:
            strdoc = ExperimentalStructure.objects(name=pdb).no_cache().get()
        except ExperimentalStructure.DoesNotExist:
            _log.warning(str(pdb) + " csa pdb does not exists...")
            continue

        if strdoc.has_residue_set("csa"):
            continue

        csas = [str(chain) + "_" + str(resid)
                for chain, resid in zip(pdb_csa["CHAIN ID"], pdb_csa["RESIDUE NUMBER"])]
        strdoc.residue_sets.append(ResidueSet(name="csa", type="catalitic", residues=csas))
        # Persist the new set; without this the append is lost with the document.
        strdoc.save()
Exemple #10
0
                "pockets.0": {
                    "$exists": 0
                }
            }, {"name": 1})
    }
    procesados = {
        x["name"]: 1
        for x in db.structures.find({"seq_collection_name": "pdb"},
                                    {"name": 1})
    }
    pdbs = list(pdbUtils)
    for (pdb, pdb_file) in tqdm(pdbs):

        if pdb in procesados:
            if pdb in procesados_sin_pocket:
                q = ExperimentalStructure.objects(seq_collection_name="pdb",
                                                  name=pdb).no_cache()
                if q:
                    strdoc = q.get()
                    try:
                        structure = parser.get_structure(
                            pdb, pdbUtils.pdb_path(pdb))
                        try:
                            complete_pockets(pdb, strdoc, structure, pdbUtils)
                            strdoc.save()
                        except:
                            pass

                        complete_pdb_attrs(pdb, strdoc, pdbUtils)
                        strdoc.save()

                    except Exception as ex: