#Module-level imports for these examples. Helper names such as
#get_interfaces_path, parse_ibis, get_binding_site_from_resi,
#get_partner_binding_site, _calculate_atom_distance, Structure, InvalidPDB,
#and SwarmJob are assumed to be defined or imported elsewhere in this module.
import os
import glob
import shutil
import traceback
from collections import defaultdict
from itertools import groupby

import pandas as pd
from Bio import PDB

def find_homolog(dataset_name, query_cdd, query_resi, target_pdb, target_chain):
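    """Find the IBIS binding-site row most homologous to a query site.

    Scans <query_cdd>.extended_sdi for observed rows on target_pdb/target_chain,
    globally aligns each row's residue string against query_resi with BLOSUM62,
    and returns the best-scoring row (a list of fields), or None if none match.
    """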
    from Bio import pairwise2
    from Bio.SubsMat.MatrixInfo import blosum62
    interfaces = get_interfaces_path(dataset_name)

    print query_cdd, query_resi, target_pdb, target_chain

    target = None
    target_score = 0.

    ipath = os.path.join(interfaces, "{}.extended_sdi".format(query_cdd))
    with open(ipath) as f:
        for line in f:
            fields = line.rstrip().split("\t")
            if fields[-1] == "0": continue
            if fields[0] == target_pdb: print fields
            if fields[0] == target_pdb and fields[1] == target_chain:
                #print "posible match", fields
                _target_resi = fields[5]
                try:
                    _target_score = pairwise2.align.globaldx(query_resi, _target_resi, blosum62, score_only=1)
                except StopIteration:
                    continue

                if target is None or _target_score > target_score:
                    target_score = _target_score
                    target = fields

    return target
def find_partner(dataset_name, pdb_file, pdb, chain1, sdi1, cdd1, resi1, chain2, cdd2):
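    """Find the IBIS row describing the partner side of a binding site.

    Searches <cdd2>.extended_sdi for rows on the partner chain whose CDD pair
    and PDB evidence match this interface, and returns the row whose binding
    site lies closest (minimum atom-atom distance) to the chain1 site. If no
    row matches, falls back to inferring the partner site by distance and
    returns a synthesized row in the same field order; returns None on failure.
    """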
    interfaces = get_interfaces_path(dataset_name)
    bs1 = get_binding_site_from_resi(pdb_file, chain1, resi1.split(","))
    #print dataset_name, pdb_file, pdb, chain1, sdi1, cdd1, resi1, chain2, cdd2

    possible_matches = {}
    ipath = os.path.join(interfaces, "{}.extended_sdi".format(cdd2))
    with open(ipath) as f:
        for line in f:
            fields = line.rstrip().split("\t")
            #print fields
            if (fields[0] == pdb and fields[1] == chain2
                    and fields[7] == cdd2 and fields[8] == cdd1
                    and fields[9] in ("{0}{1}_{0}{2}".format(pdb.upper(), chain1, chain2),
                                      "{0}{2}_{0}{1}".format(pdb.upper(), chain1, chain2))):
                resi2 = fields[4].split(",")
                bs2 = get_binding_site_from_resi(pdb_file, chain2, resi2)
                for i in bs1:
                    for j in bs2:
                        dist = _calculate_atom_distance(i, j)
                        #Keep the minimum inter-atom distance seen for this row
                        key = tuple(fields)
                        if key not in possible_matches or dist < possible_matches[key]:
                            possible_matches[key] = dist
    if len(possible_matches) > 0:
        return min(possible_matches.iteritems(), key=lambda x: x[1])[0]
    else:
        #print "==> Binding site not in IBIS, calculating based on distance",
        resi2 = get_partner_binding_site(pdb_file, bs1, chain2)
        if resi2 is not None:
            print
            return pdb, chain2, None, None, resi2, None, cdd1==cdd2, cdd2, cdd1, "{0}{1}_{0}{2}".format(pdb, chain2, chain1), "1"
        else:
            #print "but failed"
            return None
def get_ibis(dataset_name, pdb_ibis_file, use_cdd_domain=None, multimers=False, check_binding_sites=False):
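    """Extract protein-protein interface records from a PDB-IBIS dump.

    Groups parsed IBIS entries by query PDB chain, keeps PPI entries
    (optionally restricted to multimers or to a single CDD), optionally checks
    that the listed residues match the structure's sequence, and appends one
    tab-separated row per entry to <out_dir>/<cdd>/<pdb>.tsv.
    """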
    out_dir = get_interfaces_path(dataset_name)
    seen_cdd = set()
    with open(pdb_ibis_file) as pdb_ibis:
        for pdb_chain, entries in groupby(parse_ibis(pdb_ibis), key=lambda l: l["Query"]):
            pdb, chain = pdb_chain[:-1], pdb_chain[-1]

            if check_binding_sites:
                try:
                    structure = Structure.from_pdb(pdb, chain)
                except (KeyboardInterrupt, SystemExit) as e:
                    raise
                except InvalidPDB:
                    continue
                except Exception:
                    trace = traceback.format_exc()
                    print "Error:", trace
                    continue

            for entry in entries:
                if entry["Interaction_type"] not in ["PPI"]: continue #, "LIG"

                cdd = entry["Query_Domain"]
                partner = entry["Interaction_Partner"]
                residues = entry["PDB_Residue_No"].lstrip().replace(" ", ",")
                residue_str = entry["Binding_Site_Residues"]
                observed = entry["Is_Observed"] == "1"
                pdb_evidence = entry["PDB_Evidence"]
                is_multimer = "1" if cdd == partner else "0"


                if multimers and is_multimer == "0":
                    #Keep only multimers: query domain must equal partner domain
                    continue

                if use_cdd_domain is not None and cdd != use_cdd_domain:
                    continue

                if check_binding_sites:
                    #Check that the reported positions match the structure
                    pdb_seq = ""
                    removed = 0
                    for i, r in enumerate(residues.split(",")):
                        if r == "X": continue

                        res = structure.get_residue_from_resseq(r)

                        if res is None:
                            #Drop the unmatched position, offsetting by prior removals
                            j = i - removed
                            residue_str = residue_str[:j] + residue_str[j+1:]
                            removed += 1
                        else:
                            pdb_seq += PDB.Polypeptide.three_to_one(res.get_resname())

                    if pdb_seq != residue_str:
                        print "{} {} does not match {} =/= {}".format(entry["Query"], residues, pdb_seq, residue_str)
                        continue

                seen_cdd.add(cdd)

                with open(os.path.join(out_dir, cdd.replace("/", ""), "{}.tsv".format(pdb)), "a+") as f:
                    print >> f, "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pdb, chain, residues, residue_str, is_multimer, cdd, partner, pdb_evidence, int(observed))
def merge_ibis(dataset_name, cdd):
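    """Concatenate a CDD's per-PDB .tsv files into a single <cdd>.raw table
    with a header row, then remove the per-PDB directory."""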
    out_dir = get_interfaces_path(dataset_name)
    with open(os.path.join(out_dir, "{}.raw".format(cdd.replace("/", ""))), "w") as raw_cdd:
        print >> raw_cdd, "pdb\tchain\tresi\tresn\tis_multimer\tcdd\tpartner\tpdb_evidence\tis_observed"
        for pdb in glob.glob(os.path.join(out_dir, cdd.replace("/", ""), "*.tsv")):
            with open(pdb) as f:
                for line in f:
                    raw_cdd.write(line)
    shutil.rmtree(os.path.join(out_dir, cdd.replace("/", "")))
def submit_ibis_cdd(dataset_name, job_name="calc_bsa", dependency=None):
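    """Queue one SwarmJob command per CDD label, each re-invoking this
    script's 'run' subcommand on that CDD's .extended_sdi file, and print
    the resulting job id."""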
    ibis_data = get_interfaces_path(dataset_name)
    CDD = pd.read_csv(os.path.join(os.path.dirname(os.path.dirname(__file__)),
                                   "data", "MMDB", "StructDomSfam.csv"),
                      usecols=["label"]).drop_duplicates().dropna()
    #Escape single quotes so each label survives the shell command below
    CDD = sorted(CDD["label"].apply(lambda cdd: cdd.replace("/", "").replace("'", "\\'")).tolist())

    job = SwarmJob(job_name+"_full", walltime="18:00:00")
    for cdd in CDD:
        cdd_f = os.path.join(ibis_data, "{}.extended_sdi".format(cdd.replace("/", "")))
        job += "/data/draizene/3dcnn-torch-py2 python {} run {} \"{}\" {}\n".format(__file__, dataset_name, cdd.replace("/", ""), cdd_f)

    jid = job.run(dependency=dependency)
    print jid
def add_sdi(dataset_name, cdd_ibis, cdd_sdi, cleanup=False):
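    """Annotate a CDD's IBIS interface rows with SDI domain definitions,
    writing .sdi/.observed_sdi/.inferred_sdi/.extended_sdi files under the
    dataset's interfaces directory (empty files if the inputs are missing)."""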
    interfaces_path = get_interfaces_path(dataset_name)

    if not os.path.exists(interfaces_path):
        os.makedirs(interfaces_path)

    name_prefix = os.path.splitext(os.path.basename(cdd_ibis))[0]
    inpath = cdd_ibis
    outpath = os.path.join(interfaces_path, "{}.sdi".format(name_prefix))
    observed_outpath = os.path.join(interfaces_path,
                                    "{}.observed_sdi".format(name_prefix))
    inferred_outpath = os.path.join(interfaces_path,
                                    "{}.inferred_sdi".format(name_prefix))
    extended_outpath = os.path.join(interfaces_path,
                                    "{}.extended_sdi".format(name_prefix))

    if not os.path.isfile(cdd_ibis) and not os.path.isfile(cdd_sdi):
        print "Files not found"
        #Create all four outputs as empty files so downstream steps still run
        with open(outpath, "w") as f1, open(observed_outpath, "w") as f2, \
             open(inferred_outpath, "w") as f3, open(extended_outpath, "w") as f4:
            pass
        return

    #The SDI table is grouped below on pdbId/chnLett; keep those as strings
    cdd_sdi = pd.read_csv(cdd_sdi, dtype={'pdbId': str, 'chnLett': str})
    cdd_ibis = pd.read_table(cdd_ibis, dtype={'pdb': str, 'chain': str})

    all_sdi_domains = cdd_sdi.groupby(["pdbId", "chnLett"])

    domains = defaultdict(set)
    observedDomains = defaultdict(set)
    inferredDomains = defaultdict(set)

    extended_sdi_file = open(extended_outpath, "w")
    print >> extended_sdi_file, "pdb\tchain\tsdi\tdomNum\tresi\tresn\tis_multimer\tcdd\tpartner\tpdb_evidence\tobserved"

    for i, row in cdd_ibis.iterrows():
        try:
            sdi_domains = all_sdi_domains.get_group((row["pdb"], row["chain"]))
        except (KeyError, IndexError):
            #Fall back to SDI entries recorded without a chain letter
            try:
                sdi_domains = all_sdi_domains.get_group((row["pdb"], ""))
            except (KeyError, IndexError):
                print "No SDIs for", row["pdb"], row["chain"]
                continue