import os
import glob
import shutil
import traceback
from collections import defaultdict
from itertools import groupby

import pandas as pd
from Bio import PDB

#Helpers referenced below (get_interfaces_path, get_binding_site_from_resi,
#get_partner_binding_site, _calculate_atom_distance, parse_ibis, Structure,
#InvalidPDB, SwarmJob) are defined elsewhere in this module/package.

def find_homolog(dataset_name, query_cdd, query_resi, target_pdb, target_chain):
    """Find the best-scoring interface on target_pdb/target_chain by globally
    aligning the query binding-site sequence against every observed interface
    of the same CDD family."""
    from Bio import pairwise2
    from Bio.SubsMat.MatrixInfo import blosum62

    interfaces = get_interfaces_path(dataset_name)
    print query_cdd, query_resi, target_pdb, target_chain

    target = None
    target_score = 0.

    ipath = os.path.join(interfaces, "{}.extended_sdi".format(query_cdd))
    with open(ipath) as f:
        for line in f:
            fields = line.rstrip().split("\t")

            #Skip inferred (unobserved) interfaces
            if fields[-1] == "0":
                continue

            if fields[0] == target_pdb:
                print fields

            if fields[0] == target_pdb and fields[1] == target_chain:
                _target_resi = fields[5]
                try:
                    _target_score = pairwise2.align.globaldx(
                        query_resi, _target_resi, blosum62, score_only=1)
                except StopIteration:
                    #pairwise2 can fail on degenerate inputs; skip this entry
                    continue

                #Keep the highest-scoring match
                if target is None or _target_score > target_score:
                    target_score = _target_score
                    target = fields

    return target
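#Note on the Biopython call above: in pairwise2's naming scheme, "globaldx"
#means a global alignment whose match scores come from a substitution
#dictionary (here BLOSUM62) with no gap penalties, and score_only=1 makes it
#return only the best alignment score as a float, e.g.:
#
#   from Bio import pairwise2
#   from Bio.SubsMat.MatrixInfo import blosum62
#   score = pairwise2.align.globaldx("KEVLA", "KEVLA", blosum62, score_only=1)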
def find_partner(dataset_name, pdb_file, pdb, chain1, sdi1, cdd1, resi1, chain2, cdd2):
    """Find the IBIS interface on chain2 that is closest in space to the
    binding site (resi1) on chain1; fall back to a pure distance search if
    IBIS has no matching entry."""
    interfaces = get_interfaces_path(dataset_name)
    bs1 = get_binding_site_from_resi(pdb_file, chain1, resi1.split(","))

    possible_matches = {}
    ipath = os.path.join(interfaces, "{}.extended_sdi".format(cdd2))
    with open(ipath) as f:
        for line in f:
            fields = line.rstrip().split("\t")
            if fields[0] == pdb and fields[1] == chain2 and fields[7] == cdd2 \
                    and fields[8] == cdd1 and fields[9] in (
                        "{0}{1}_{0}{2}".format(pdb.upper(), chain1, chain2),
                        "{0}{2}_{0}{1}".format(pdb.upper(), chain1, chain2)):
                resi2 = fields[4].split(",")
                bs2 = get_binding_site_from_resi(pdb_file, chain2, resi2)

                #Record the minimum atom-atom distance between the two sites
                for i in bs1:
                    for j in bs2:
                        dist = _calculate_atom_distance(i, j)
                        try:
                            if dist < possible_matches[tuple(fields)]:
                                possible_matches[tuple(fields)] = dist
                        except KeyError:
                            possible_matches[tuple(fields)] = dist

    if len(possible_matches) > 0:
        #Return the interface whose binding site is closest to bs1
        return min(possible_matches.iteritems(), key=lambda x: x[1])[0]
    else:
        #Binding site not in IBIS; calculate the partner site by distance
        resi2 = get_partner_binding_site(pdb_file, bs1, chain2)
        if resi2 is not None:
            return pdb, chain2, None, None, resi2, None, cdd1 == cdd2, cdd2, \
                cdd1, "{0}{1}_{0}{2}".format(pdb, chain2, chain1), "1"
        else:
            return None
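#A minimal sketch of the _calculate_atom_distance helper assumed above (the
#real implementation lives elsewhere in this package). Bio.PDB Atom objects
#implement subtraction as the Euclidean distance between their coordinates,
#so a plausible one-liner is:
#
#   def _calculate_atom_distance(atom1, atom2):
#       return atom1 - atom2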
def get_ibis(dataset_name, pdb_ibis_file, use_cdd_domain=None, multimers=False, check_binding_sites=False):
    """Parse an IBIS dump and write one TSV of PPI binding sites per PDB
    entry, grouped by CDD domain."""
    out_dir = get_interfaces_path(dataset_name)

    seen_cdd = set()

    with open(pdb_ibis_file) as pdb_ibis:
        for pdb_chain, entries in groupby(parse_ibis(pdb_ibis), key=lambda l: l["Query"]):
            pdb, chain = pdb_chain[:-1], pdb_chain[-1]

            if check_binding_sites:
                try:
                    structure = Structure.from_pdb(pdb, chain)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except InvalidPDB:
                    continue
                except:
                    trace = traceback.format_exc()
                    print "Error:", trace
                    continue

            for entry in entries:
                if entry["Interaction_type"] not in ["PPI"]:
                    continue

                cdd = entry["Query_Domain"]
                partner = entry["Interaction_Partner"]
                residues = entry["PDB_Residue_No"].lstrip().replace(" ", ",")
                residue_str = entry["Binding_Site_Residues"]
                observed = entry["Is_Observed"] == "1"
                pdb_evidence = entry["PDB_Evidence"]
                is_multimer = cdd == partner

                if multimers and not is_multimer:
                    #Only keep interactions where the query domain binds
                    #another copy of itself
                    continue

                if use_cdd_domain is not None and cdd != use_cdd_domain:
                    continue

                if check_binding_sites:
                    #Check that the reported binding-site sequence matches the
                    #structure; drop positions that are missing from the model
                    pdb_seq = ""
                    missing = set()
                    for i, r in enumerate(residues.split(",")):
                        if r == "X":
                            continue
                        res = structure.get_residue_from_resseq(r)
                        if res is None:
                            missing.add(i)
                        else:
                            pdb_seq += PDB.Polypeptide.three_to_one(res.get_resname())
                    residue_str = "".join(c for i, c in enumerate(residue_str) if i not in missing)

                    if pdb_seq != residue_str:
                        print "{} {} does not match {} =/= {}".format(entry["Query"], residues, pdb_seq, residue_str)
                        continue

                seen_cdd.add(cdd)

                with open(os.path.join(out_dir, cdd.replace("/", ""), "{}.tsv".format(pdb)), "a+") as f:
                    print >> f, "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                        pdb, chain, residues, residue_str, int(is_multimer),
                        cdd, partner, pdb_evidence, int(observed))
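#A minimal sketch of the parse_ibis helper assumed above, under the
#assumption that the IBIS dump is a delimited text file whose header names
#the columns used in get_ibis ("Query", "Interaction_type", "Query_Domain",
#"Interaction_Partner", "PDB_Residue_No", "Binding_Site_Residues",
#"Is_Observed", "PDB_Evidence"); adjust the delimiter if the dump differs:
#
#   import csv
#   def parse_ibis(fh):
#       for row in csv.DictReader(fh, delimiter="\t"):
#           yield row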
def merge_ibis(dataset_name, cdd):
    """Concatenate the per-PDB TSVs for one CDD family into a single .raw
    file with a header, then remove the per-PDB directory."""
    out_dir = get_interfaces_path(dataset_name)

    with open(os.path.join(out_dir, "{}.raw".format(cdd.replace("/", ""))), "w") as raw_cdd:
        print >> raw_cdd, "pdb\tchain\tresi\tresn\tis_multimer\tcdd\tpartner\tpdb_evidence\tis_observed"
        for pdb in glob.glob(os.path.join(out_dir, cdd.replace("/", ""), "*.tsv")):
            with open(pdb) as f:
                for line in f:
                    raw_cdd.write(line)

    shutil.rmtree(os.path.join(out_dir, cdd.replace("/", "")))
def submit_ibis_cdd(dataset_name, job_name="calc_bsa", dependency=None):
    """Submit one swarm job per CDD family, running this script's `run`
    entry point inside the python2 container."""
    ibis_data = get_interfaces_path(dataset_name)

    CDD = pd.read_csv(
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "MMDB", "StructDomSfam.csv"),
        usecols=["label"]).drop_duplicates().dropna()
    #Escape single quotes so family names survive the shell command below
    CDD = sorted(CDD["label"].apply(lambda cdd: cdd.replace("/", "").replace("'", "\\'")).tolist())

    job = SwarmJob(job_name + "_full", walltime="18:00:00")
    for cdd in CDD:
        cdd_f = os.path.join(ibis_data, "{}.extended_sdi".format(cdd.replace("/", "")))
        job += "/data/draizene/3dcnn-torch-py2 python {} run {} \"{}\" {}\n".format(
            __file__, dataset_name, cdd.replace("/", ""), cdd_f)

    jid = job.run(dependency=dependency)
    print jid
def add_sdi(dataset_name, cdd_ibis, cdd_sdi, cleanup=False):
    """Annotate the raw IBIS binding sites for one CDD family with their
    Structural Domain Identifiers (SDIs)."""
    interfaces_path = get_interfaces_path(dataset_name)
    if not os.path.exists(interfaces_path):
        os.makedirs(interfaces_path)

    name_prefix = os.path.splitext(os.path.basename(cdd_ibis))[0]
    outpath = os.path.join(interfaces_path, "{}.sdi".format(name_prefix))
    observed_outpath = os.path.join(interfaces_path, "{}.observed_sdi".format(name_prefix))
    inferred_outpath = os.path.join(interfaces_path, "{}.inferred_sdi".format(name_prefix))
    extended_outpath = os.path.join(interfaces_path, "{}.extended_sdi".format(name_prefix))

    if not os.path.isfile(cdd_ibis) and not os.path.isfile(cdd_sdi):
        #Create empty outputs so downstream steps do not fail
        print "Files not found"
        with open(outpath, "w") as f, open(observed_outpath, "w") as f2, \
                open(inferred_outpath, "w") as f3, open(extended_outpath, "w") as f4:
            pass
        return

    cdd_sdi = pd.read_csv(cdd_sdi, dtype={'pdb': str, 'chain': str})
    cdd_ibis = pd.read_table(cdd_ibis, dtype={'pdb': str, 'chain': str})

    all_sdi_domains = cdd_sdi.groupby(["pdbId", "chnLett"])

    domains = defaultdict(set)
    observedDomains = defaultdict(set)
    inferredDomains = defaultdict(set)

    extended_sdi_file = open(extended_outpath, "w")
    print >> extended_sdi_file, "pdb\tchain\tsdi\tdomNum\tresi\tresn\tis_multimer\tcdd\tpartner\tpdb_evidence\tobserved"

    for i, row in cdd_ibis.iterrows():
        try:
            sdi_domains = all_sdi_domains.get_group((row["pdb"], row["chain"]))
        except (KeyError, IndexError):
            try:
                #Some entries have no chain letter in the SDI table
                sdi_domains = all_sdi_domains.get_group((row["pdb"], ""))
            except (KeyError, IndexError):
                print "No SDIs for", row["pdb"], row["chain"]
                continue