def convert_pdb_to_mmtf(job, sfam_id, jobStoreIDs=None, clustered=True, preemptable=True):
    #Not yet implemented; the code below is kept as a sketch
    raise NotImplementedError()

    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    clustered = "clustered" if clustered else "full"

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    #Local directory for the converted MMTF files
    mmtf_path = os.path.join(work_dir, "mmtf")
    if not os.path.isdir(mmtf_path):
        os.makedirs(mmtf_path)

    #Download all with same sfam
    if jobStoreIDs is None:
        in_store = IOStore.get("{}:molmimic-{}-structures".format(prefix, clustered))
        for f in in_store.list_input_directory(sfam_id):
            if f.endswith(".pdb"):
                in_store.read_input_file(f, os.path.join(work_dir, f))
    else:
        for jobStoreID in jobStoreIDs:
            job.fileStore.readGlobalFile(jobStoreID, userPath=pdb_path)

    PdbToMmtfFull(pdb_path, mmtf_path, work_dir=work_dir, job=job)

    out_store = IOStore.get("{}:molmimic-{}-mmtf".format(prefix, clustered))
    out_store.write_output_directory(mmtf_path, sfam_id)
def start_toil(job):
    print "Starting job"
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    int_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    #Download PDB info
    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    #Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_file)

    #Download PDB Taxonomy information
    tax_file = os.path.join(work_dir, "pdb_chain_taxonomy.h5")
    in_store.read_input_file("pdb_chain_taxonomy.h5", tax_file)

    #Add tax info into local job store
    taxFileStoreID = job.fileStore.writeGlobalFile(tax_file)

    tables = set(range(1, 87)) - set([51])

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
        ["sfam_id"]).drop_duplicates().dropna()["sfam_id"].sort_values()
    #RealtimeLogger.info("SFAMS: {}".format(sfams.shape[0]))

    sfamFileStoreIDs = {}
    for s in sfams:
        k = "{0}/{0}.observed_interactome".format(int(s))
        if int_store.exists(k):
            RealtimeLogger.info("Loading {}".format(s))
            f = job.fileStore.getLocalTempFileName()
            int_store.read_input_file(k, f)
            sfamFileStoreIDs[int(s)] = job.fileStore.writeGlobalFile(f)
            os.remove(f)
        else:
            RealtimeLogger.info("FAILED Loading {}".format(s))

    assert len(sfamFileStoreIDs) > 0

    os.remove(tax_file)
    os.remove(pdb_file)

    job.log("Running tables: {}".format(tables))

    #Chain one follow-on per table, then merge everything at the end
    j = job
    for table in tables:
        j = j.addFollowOnJobFn(get_inferred_structural_interactome_by_table, table,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    # map_job(job, get_inferred_structural_interactome_by_table, tables,
    #     pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    j.addFollowOnJobFn(merge_inferred_interactome, pdbFileStoreID)
def calculate_features_for_sfam(job, sfam_id, further_parallelize=False):
    work_dir = job.fileStore.getLocalTempDir()
    pdb_store = IOStore.get("aws:us-east-1:molmimic-full-structures")
    out_store = IOStore.get("aws:us-east-1:molmimic-structure-features")

    extensions = set(["atom.npy", "residue.npy", "edges.gz"])
    done_files = lambda k: set([f.rsplit("_", 1)[1] for f in \
        out_store.list_input_directory(k)])
    pdb_keys = [k for k in pdb_store.list_input_directory(str(int(sfam_id))) if \
        k.endswith(".pdb") and extensions != done_files(os.path.splitext(k)[0])]

    if further_parallelize:
        map_job(job, calculate_features, pdb_keys)
    else:
        for pdb_key in pdb_keys: #pdb_store.list_input_directory(int(sfam_id)):
            calculate_features(job, pdb_key, work_dir=work_dir)
def get_sfam_ddi_sizes(job, sfam_id, observed=True):
    int_type = "observed" if observed else "inferred"
    work_dir = job.fileStore.getLocalTempDir()
    interface_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    interfaces_key = "{s}/{s}.{o}_interactome".format(
        s=sfam_id, o="observed" if observed else "inferred")
    interfaces_file = os.path.basename(interfaces_key)
    interface_store.read_input_file(interfaces_key, interfaces_file)

    interfaces = pd.read_hdf(interfaces_file, "table")
    RealtimeLogger.info("COLS: {}".format(interfaces.columns))

    counts = interfaces.fillna(-1.).groupby(
        ["mol_superfam_id", "int_superfam_id"]).size().reset_index(name="count")
    RealtimeLogger.info("SIZES: {}".format(counts))

    try:
        os.remove(interfaces_file)
    except OSError:
        pass

    return counts
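#For reference, a minimal standalone sketch (hypothetical values, not part of the
#original workflow) of the counting step above, showing the shape of the table
#that get_sfam_ddi_sizes returns.
def _example_ddi_counts():
    interfaces = pd.DataFrame({
        "mol_superfam_id": [1.0, 1.0, 2.0, None],
        "int_superfam_id": [2.0, 2.0, 1.0, 3.0],
    })
    #Same aggregation as above: NaN superfamilies become -1, then each
    #(mol, int) pair is counted
    counts = interfaces.fillna(-1.).groupby(
        ["mol_superfam_id", "int_superfam_id"]).size().reset_index(name="count")
    #   mol_superfam_id  int_superfam_id  count
    #0             -1.0              3.0      1
    #1              1.0              2.0      2
    #2              2.0              1.0      1
    return counts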
def best_sfams(job, all_counts, max_sfams=300):
    import json
    work_dir = job.fileStore.getLocalTempDir()
    out_store = IOStore.get("aws:us-east-1:molmimic-ddi")

    #Merge counts from all superfamilies into one mapping;
    #mol->int should be same as int->mol: remove dupes
    ddi_counts = {}
    for counts in all_counts:
        for row in counts.itertuples():
            ddi = tuple(
                map(int, sorted((row.mol_superfam_id, row.int_superfam_id))))
            if ddi in ddi_counts:
                RealtimeLogger.info("{} {}, are counts symmetrical? {}".format(
                    ddi[0], ddi[1], "Yes" if ddi_counts[ddi] == row.count else "No"))
                continue
            ddi_counts[ddi] = row.count

    sfams = sorted(ddi_counts.iteritems(), key=lambda x: x[1], reverse=True)
    RealtimeLogger.info("sfams is {}".format(sfams))

    sfam_file = os.path.join(work_dir, "sorted_sfams.json")
    with open(sfam_file, "w") as f:
        json.dump(sfams, f)

    out_store.write_output_file(sfam_file, "sorted_sfams.json")

    return sfams[:max_sfams]
def process_sfams(job, max_sfams=300, memory="1G"):
    import json
    from collections import defaultdict

    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-ddi")

    json_file = os.path.join(work_dir, "sorted_sfams.json")
    store.read_input_file("sorted_sfams.json", json_file)

    with open(json_file) as f:
        counts = json.load(f)

    sfams = defaultdict(set)
    num_ddi = 0
    for i, ((mol_sfam, int_sfam), count) in enumerate(counts):
        RealtimeLogger.info("{}: {}-{}".format(i, mol_sfam, int_sfam))
        if -1 in map(int, (mol_sfam, int_sfam)):
            RealtimeLogger.info("SKIPPED")
            continue
        if count < 90:
            RealtimeLogger.info("SKIPPED count is less than 90")
            continue
        print(mol_sfam, int_sfam)
        #if i<30: continue
        # and (max_sfams is None or num_ddi<max_sfams) and \
        #mol_sfam not in sfams[int_sfam]:
        if mol_sfam != int_sfam:
            if (mol_sfam not in sfams and int_sfam not in sfams) or \
               (mol_sfam in sfams and int_sfam in sfams):
                sfams[mol_sfam].add(int_sfam)
                RealtimeLogger.info("Added {}: {}-{}".format(
                    i, mol_sfam, int_sfam))
            elif mol_sfam in sfams and int_sfam not in sfams:
                sfams[mol_sfam].add(int_sfam)
                RealtimeLogger.info("Added {}: {}-{}".format(
                    i, mol_sfam, int_sfam))
            elif mol_sfam not in sfams and int_sfam in sfams:
                sfams[int_sfam].add(mol_sfam)
                RealtimeLogger.info("Added {}: {}-{}".format(
                    i, int_sfam, mol_sfam))
            else:
                RealtimeLogger.info(
                    "Could not add {}: {}-{}; mol_sfam {} sfams; int_sfam {} sfams"
                    .format(i, mol_sfam, int_sfam,
                        "in" if mol_sfam in sfams else "not in",
                        "in" if int_sfam in sfams else "not in"))
            num_ddi += 1
        #break

    RealtimeLogger.info("{} starting domains".format(len(sfams)))
    #mol_sfam = list(sfams.keys())[pd.np.random.randint(len(sfams))]
    #int_sfams = sfams[mol_sfam]
    #job.addChildJobFn(process_sfam, mol_sfam, int_sfams)
    for mol_sfam, int_sfams in sfams.iteritems():
        job.addChildJobFn(process_sfam, mol_sfam, int_sfams)
def process_sfam(job, sfam_id, pdbFileStoreID, cores=1):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-full-structures".format(prefix))

    sdoms_file = copy_pdb_h5(job, pdbFileStoreID)

    sdoms = pd.read_hdf(unicode(sdoms_file), "merged") #, where="sfam_id == {}".format(sfam_id))

    # skip_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "keep.csv")
    # if os.path.isfile(skip_file):
    #     skip = pd.read_csv(skip_file)
    #     sdoms = sdoms[sdoms["sdi"].isin(skip["sdi"])]

    sdoms = sdoms[sdoms["sfam_id"] == float(
        sfam_id)]["sdi"].drop_duplicates().dropna()
    #sdoms = sdoms[:1]

    if cores > 2:
        #Only makes sense for Slurm or other bare-metal clusters
        setup_dask(cores)
        d_sdoms = dd.from_pandas(sdoms, npartitions=cores)
        RealtimeLogger.info("Running sfam dask {}".format(sdoms))
        processed_domains = d_sdoms.apply(
            lambda row: process_domain(job, row.sdi, sdoms_file),
            axis=1).compute()
    else:
        processed_domains = job.addChildJobFn(map_job_rv, process_domain, sdoms,
            pdbFileStoreID, preemptable=True).rv()

    return processed_domains
def start_toil(job, memory="1G"):
    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-ddi")

    if not store.exists("sorted_sfams.json"):
        interface_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
        observed = set(key.split("/")[0] for key in interface_store.list_input_directory() \
            if key.endswith("observed_interactome"))
        #inferred = set(key.split("/")[0] for key in keys if key.endswith("inferred_interactome"))

        interface_counts = [
            job.addChildJobFn(get_sfam_ddi_sizes, o).rv() for o in observed
        ]
        merge_job = job.addFollowOnJobFn(best_sfams, interface_counts)
    else:
        merge_job = job

    merge_job.addFollowOnJobFn(process_sfams)
def get_missing(job, observed=True):
    import datetime
    work_dir = job.fileStore.getLocalTempDir()

    file = "missing_{}_{}.h5".format(
        "observed" if observed else "inferred",
        str(datetime.datetime.now()).replace(" ", "-"))
    outfile = os.path.join(work_dir, file)

    store = IOStore.get("aws:us-east-1:molmimic-missing-structures")

    to_remove = []
    for k in store.list_input_directory(
            "observed" if observed else "inferred"):
        path = os.path.join(work_dir, k)
        store.read_input_file(k, path)

        df = pd.read_hdf(path, "table", mode="r")
        df.to_hdf(outfile, "table", table=True, format="table", append=True,
            mode="a", complib="bzip2", complevel=9)
        to_remove.append(k)

        try:
            os.remove(path)
        except OSError:
            pass

    if not os.path.isfile(outfile):
        df = pd.DataFrame()
        df.to_hdf(outfile, "table", table=True, format="table", append=True,
            mode="a", complib="bzip2", complevel=9)

    store.write_output_file(outfile, file)

    try:
        os.remove(outfile)
    except OSError:
        pass

    for f in to_remove:
        try:
            store.remove(f)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            pass
def start_toil(job, pdbFileStoreID=None):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-ibis".format(prefix))
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    if pdbFileStoreID is None:
        #Download PDB info
        pdb_path = os.path.join(work_dir, "PDB.h5")
        in_store.read_input_file("PDB.h5", pdb_path)

        #Add pdb info into local job store
        pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_path)
    else:
        pdb_path = job.fileStore.readGlobalFile(pdbFileStoreID)

    ibis_obs_prefix = "IBIS_observed.h5"
    ibis_obs_path = os.path.join(work_dir, ibis_obs_prefix)
    in_store.read_input_file(ibis_obs_prefix, ibis_obs_path)

    #Add ibis info into local job store
    ibisObsFileStoreID = job.fileStore.writeGlobalFile(ibis_obs_path)

    #Choose which superfamilies to run, skip those already present
    skip_sfam = set([float(f.split("/", 1)[0]) for f in out_store.list_input_directory() \
        if f.endswith(".observed_interactome")])
    pdb = filter_hdf_chunks(unicode(ibis_obs_path), "ObsInt",
        columns=["mol_superfam_id"]).drop_duplicates()
    sfams = pdb[~pdb["mol_superfam_id"].isin(skip_sfam)][
        "mol_superfam_id"].drop_duplicates().dropna().astype(int)

    print "Will run a total of {} SFAMS".format(len(sfams))

    #Run all superfamilies
    map_job(job, get_observed_structural_interactome, sfams, pdbFileStoreID,
        ibisObsFileStoreID)

    #Cleanup
    job.addFollowOnJobFn(cleanup)
    os.remove(ibis_obs_path)
    os.remove(pdb_path)
def cleanup(job, sfam_ids):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    for sfam_id in sfam_ids:
        infrows = store.list_input_directory("{}/_infrows".format(int(sfam_id)))
        inftables = store.list_input_directory("{}/_inftables".format(int(sfam_id)))
        finished_tables = [k.split("/")[-1].split("_")[0] for k in inftables]
        for k in infrows:
            if k.split("/")[2] in finished_tables:
                store.remove_file(k)
def compare_sfams(job, useExisting=False, observed=True):
    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-missing-structures")
    all_missing = "missing_{}.h5".format(
        "observed" if observed else "inferred")
    all_missing_f = os.path.join(work_dir, all_missing)

    if not useExisting or not store.exists(all_missing):
        inf_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
        ending = ".{}_interactome".format(
            "observed" if observed else "inferred")
        sfams = [k.split("/", 1)[0] for k in inf_store.list_input_directory() \
            if k.endswith(ending)]
    else:
        store.read_input_file(all_missing, all_missing_f)
        sfams = pd.read_hdf(all_missing_f, "table",
            columns=["sfam"])["sfam"].drop_duplicates()

    map_job(job, compare_sfam, sfams)
    job.addFollowOnJobFn(get_missing, observed=observed)
def cleanup(job):
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    keys = list(in_store.list_input_directory())
    finished = set(
        key.split("/")[0] for key in keys
        if key.endswith("observed_interactome"))
    failed = set(
        key.split("/")[0] for key in keys
        if "_obsrows" in key and key.endswith("failed"))

    for key in failed - finished:
        in_store.remove_file(key)
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID,
                                                 taxFileStoreID, sfamFileStoreIDs):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    #Read in H5 for entire table
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    sfams = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table),
        columns=["nbr_superfam_id"]).drop_duplicates().dropna()
    skip_sfam = set([s for s in sfams["nbr_superfam_id"] if \
        out_store.exists("{0}/{0}.inferred_interactome".format(int(s))) or \
        not out_store.exists("{0}/{0}.observed_interactome".format(int(s)))])

    # skip_sfam = set([int(f.split("/", 1)[0]) for f in out_store.list_input_directory() \
    #     if f.endswith(".inferred_interactome")])

    sfams = sfams[~sfams["nbr_superfam_id"].isin(skip_sfam)]
    sfams = sfams["nbr_superfam_id"].drop_duplicates().dropna().astype(int).tolist()

    # partial_sfams = set(int(k.split("/")[0]) for sfam in sfams for k in \
    #     out_store.list_input_directory(
    #         "{sfam}/_inftables/Intrac{table}_{sfam}.inferred_interactome".format( \
    #         sfam=sfam, table=table)) if not k.endswith("failed"))

    #sfams = list(set(sfams)-partial_sfams)

    if len(sfams) > 0:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    try:
        os.remove(tableInfPath)
    except OSError:
        pass
def start_toil(job):
    import pandas as pd
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")

    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=[
        "sfam_id"]).drop_duplicates().dropna()["sfam_id"].sort_values()
    #sfams = [299845.0]

    map_job(job, calculate_features_for_sfam, sfams)

    os.remove(pdb_file)
def start_toil(job, name="prep_protein"):
    """Start the workflow to process PDB files"""
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-ibis".format(prefix))

    #Download PDB info
    sdoms_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", sdoms_file)

    #Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(sdoms_file)

    #Get all unique superfamilies
    sdoms = pd.read_hdf(unicode(sdoms_file), "merged")

    # skip_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "keep.csv")
    # if os.path.isfile(skip_file):
    #     skip = pd.read_csv(skip_file)
    #     sdoms = sdoms[sdoms["sdi"].isin(skip["sdi"])]
    #     RealtimeLogger.info("SKIPPING {} sdis; RUNNING {} sdis".format(skip.shape[0], sdoms.shape[0]))

    sfams = sdoms["sfam_id"].drop_duplicates().dropna()
    #sfams = sfams[:1]
    #sfams = ["653504"]

    # max_cores = job.fileStore.jobStore.config.maxCores if \
    #     job.fileStore.jobStore.config.maxCores > 2 else \
    #     job.fileStore.jobStore.config.defaultCores
    max_cores = job.fileStore.jobStore.config.defaultCores

    #Add jobs for each sdi
    job.addChildJobFn(map_job, process_sfam, sfams, pdbFileStoreID,
        cores=max_cores)

    #Add jobs to post process each sfam
    #job.addFollowOnJobFn(map_job, post_process_sfam, sfams, pdbFileStoreID,
    #    cores=max_cores)

    del sdoms
    os.remove(sdoms_file)
def get_sifts(pdb, job=None):
    if job is not None:
        work_dir = job.fileStore.getLocalTempDir()
        prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
        in_store = IOStore.get("{}:molmimic-sifts".format(prefix))

        sifts_prefix = "{}/{}.xml.gz".format(pdb[1:3].lower(), pdb.lower())
        sifts_path = os.path.join(work_dir, os.path.basename(sifts_prefix))

        job.log("Saving {}:molmimic-sifts :: {} to {}".format(
            prefix, sifts_prefix, sifts_path))

        try:
            in_store.read_input_file(sifts_prefix, sifts_path)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            raise InvalidSIFTS("Cannot open {}".format(pdb))

        with open(sifts_path) as f:
            yield f

        os.remove(sifts_path)
    else:
        path = os.path.join(
            os.environ.get("PDB_SNAPSHOT", os.path.join(data_path_prefix, "pdb")),
            "sifts", pdb[1:3].lower(), "{}.xml.gz".format(pdb.lower()))
        try:
            with open(path) as f:
                yield f
        except IOError as e:
            url = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/{}.xml.gz".format(
                pdb.lower())
            sifts_file = None
            try:
                sifts = requests.get(url)
                sifts_file = StringIO(sifts.content)
                yield sifts_file
            except requests.exceptions.RequestException:
                raise InvalidSIFTS("Not found: " + url + " orig error: " + str(e))
            finally:
                if sifts_file is not None:
                    sifts_file.close()
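#Hedged usage sketch (hypothetical PDB id, not part of the original workflow):
#get_sifts is a generator that yields an open handle to the gzipped SIFTS XML
#and cleans up its temporary copy once the generator resumes.
def _example_get_sifts(job):
    for sifts_handle in get_sifts("1abc", job=job):
        raw = sifts_handle.read()  #gzip-compressed XML bytes
    return raw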
def create_data_loader(job, sfam_id, preemptable=True):
    """Create H5 for Molmimic3dCNN to read

    Note: move this somewhere else
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    id_format = re.compile(
        "^([A-Z0-9]{4})_([A-Za-z0-9]+)_sdi([0-9]+)_d([0-9]+)$")

    #Get all keys with the same sfam, but do not download
    in_store = IOStore.get("{}:molmimic-clustered-structures".format(prefix))
    keys = [id_format.match(f).groups() for f in in_store.list_input_directory(sfam_id) \
        if f.endswith(".pdb") and id_format.match(f)]

    pdb_path = os.path.join(PDB_PATH, dataset_name, "by_superfamily", str(int(sfam_id)))
    clusters_file = os.path.join(pdb_path, "{}_nr.fasta".format(int(sfam_id)))

    try:
        pdb, chain, sdi, domain = zip(*[id_format.match(s.id[:-2]).groups() \
            for s in SeqIO.parse(clusters_file, "fasta")])
    except ValueError:
        RealtimeLogger.info(
            "Unable to create data loading file for {}.".format(sfam_id))
        return

    domains = pd.DataFrame({
        "pdb": pdb,
        "chain": chain,
        "domNo": domain,
        "sdi": sdi
    })

    data_loader = os.path.join(pdb_path, "{}.h5".format(int(sfam_id)))
    domains.to_hdf(unicode(data_loader), "table", complevel=9, complib="bzip2")
def download_pdb(job, sfam_id, pdb, chain, sdi, domNo, extracted=False, work_dir=None):
    if work_dir is None:
        work_dir = os.getcwd()

    in_store = IOStore.get("aws:us-east-1:molmimic-full-structures")

    prefix = "{}/{}/{}_{}_sdi{}_d{}.pdb".format(int(sfam_id), pdb[1:3].lower(),
        pdb.upper(), chain, sdi, domNo)
    file = os.path.join(work_dir, os.path.basename(prefix))

    if extracted:
        prefix += ".extracted"

    assert in_store.exists(prefix)
    in_store.read_input_file(prefix, file)

    return file
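#Hedged usage sketch (hypothetical identifiers, not part of the original workflow):
#the key layout download_pdb expects in the molmimic-full-structures store is
#<sfam_id>/<pdb[1:3]>/<PDB>_<chain>_sdi<sdi>_d<domNo>.pdb
def _example_download_pdb(job):
    #Downloads 299845/ab/1ABC_A_sdi123456_d1.pdb into /tmp and returns the local path
    return download_pdb(job, sfam_id=299845, pdb="1abc", chain="A",
                        sdi=123456, domNo=1, work_dir="/tmp")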
def download_consurf(pdb=None, chain=None, consurf_path=None):
    if consurf_path is None:
        consurf_path = os.getcwd()

    if (pdb, chain).count(None) == 0:
        pdb_id = pdb.upper()
        if chain != " ":
            pdb_id += "_" + chain
        consurf_db_file = os.path.join(consurf_path, pdb[1:3].upper(), pdb_id)
        download_all = False
    elif (pdb, chain).count(None) == 2:
        download_all = True
        pdb_id = None

    store = IOStore.get("aws:us-east-1:molmimic-consurf")

    pdb_list = os.path.join(consurf_path, "pdbaa_list.nr")
    done_consurf = [os.path.splitext(os.path.basename(k))[0] for k in \
        store.list_input_directory() if k != "pdbaa_list.nr"]

    if not store.exists("pdbaa_list.nr"):
        r = requests.get(
            "http://bental.tau.ac.il/new_ConSurfDB/ConSurfDB_list_feature.zip")
        zip_path = os.path.join(consurf_path, "ConSurfDB_list_feature.zip")
        with open(zip_path, "w") as zip:
            zip.write(r.content)
        with zipfile.ZipFile(zip_path) as z:
            with z.open("pdbaa_list.nr") as zf, open(pdb_list, 'wb') as f:
                copyfileobj(zf, f)
        store.write_output_file(pdb_list, "pdbaa_list.nr")
        os.remove(zip_path)
    else:
        store.read_input_file("pdbaa_list.nr", pdb_list)

    with open(pdb_list) as f:
        Parallel(n_jobs=-1)(delayed(parse_consurf_line)(line, pdb_id=pdb_id, \
            consurf_path=consurf_path, download_all=download_all, \
            done_consurf=done_consurf) for line in f)

    os.remove(pdb_list)
def merge_inferred_interactome(job, pdbFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    pdb_file = get_file(job, "PDB.h5", pdbFileStoreID)
    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
        ["sfam_id"]).drop_duplicates()["sfam_id"]
    os.remove(pdb_file)

    skip_sfam = [s for s in sfams if out_store.exists(
        "{0}/{0}.inferred_interactome".format(s))]
    sfam_to_run = [s for s in sfams if out_store.exists(
        "{0}/{0}.observed_interactome".format(s)) and s not in skip_sfam]

    # all_sfam = [os.path.basename(f).split(".") for f in out_store.list_input_directory() if not f.endswith("failed")]
    # skip_sfam = [f[0] for f in all_sfam if f[1] == "inferred_interactome"]
    # sfam_to_run = [f[0] for f in all_sfam if f[1] == "observed_interactome" \
    #     and f[0] not in skip_sfam]

    map_job(job, merge_inferred_interactome_sfam, sfam_to_run)
    job.addFollowOnJobFn(cleanup, sfam_to_run)
def get_observed_structural_interactome(job, sfam_id, pdbFileStoreID, ibisObsFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    ibis_obs_path = get_file(job, "IBIS_obs.h5", ibisObsFileStoreID)
    try:
        df = filter_hdf(ibis_obs_path, "ObsInt", "mol_superfam_id",
            float(sfam_id), columns=["obs_int_id"])
        int_ids = df["obs_int_id"].drop_duplicates().dropna().astype(int)
        if len(int_ids) == 0:
            job.log("EMPTY OBS SFAM {}".format(sfam_id))
            print "EMPTY OBS SFAM {}".format(sfam_id)
            return
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        job.log("FAILED OBS SFAM {} {}".format(sfam_id, e))
        print "FAILED OBS SFAM {} {}".format(sfam_id, e)
        return

    current_rows = set(
        int(os.path.basename(key)[:-3]) for key in
        out_store.list_input_directory("{}/_obsrows".format(int(sfam_id)))
        if not key.endswith("failed"))
    int_ids = list(set(int_ids) - current_rows)

    print "Will run {} ids: {}".format(len(int_ids), int_ids)

    if len(int_ids) > 0:
        #Add jobs for each interaction
        map_job(job, process_observed_interaction, int_ids, sfam_id,
            ibisObsFileStoreID, pdbFileStoreID)

    #Merge converted residues
    job.addFollowOnJobFn(merge_interactome_rows, sfam_id)
def compare_sfam(job, sfam, useExisting=False, observed=True):
    import datetime
    work_dir = job.fileStore.getLocalTempDir()
    out_store = IOStore.get("aws:us-east-1:molmimic-missing-structures")
    inf_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
    struc_store = IOStore.get("aws:us-east-1:molmimic-full-structures")

    all_missing = "missing_{}.h5".format(
        "observed" if observed else "inferred")
    all_missing_f = os.path.join(work_dir, all_missing)

    get_key = lambda f, p, c, s, d: "{}/{}/{}_{}_sdi{}_d{}.pdb".format(
        int(f), p[1:3].lower(), p.upper(), c, s, d)

    if not useExisting or not out_store.exists(all_missing):
        obs_key = "{sfam}/{sfam}.{type}_interactome".format(
            sfam=sfam, type="observed" if observed else "inferred")
        obs_f = os.path.join(work_dir, os.path.basename(obs_key))

        try:
            inf_store.read_input_file(obs_key, obs_f)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            job.log("Unable to open {} ({})".format(obs_key, e))
            return []

        st = pd.HDFStore(obs_f)
        df = st.get("/table")
        st.close()

        mol_ints = df[[
            "mol_pdb", "mol_chain", "mol_sdi_id", "mol_domNo", "mol_superfam_id"
        ]]
        mol_ints = mol_ints.rename(
            columns={
                "mol_pdb": "pdb",
                "mol_chain": "chain",
                "mol_sdi_id": "sdi",
                "mol_domNo": "domNo",
                "mol_superfam_id": "sfam_id"
            })
        int_ints = df[[
            "int_pdb", "int_chain", "int_sdi_id", "int_domNo", "int_superfam_id"
        ]]
        int_ints = int_ints.rename(
            columns={
                "int_pdb": "pdb",
                "int_chain": "chain",
                "int_sdi_id": "sdi",
                "int_domNo": "domNo",
                "int_superfam_id": "sfam_id"
            })
        pdbs = pd.concat((mol_ints, int_ints)).drop_duplicates()
    else:
        out_store.read_input_file(all_missing, all_missing_f)
        sfams = pd.read_hdf(all_missing_f, "table")
        ibis_store = IOStore.get("aws:us-east-1:molmimic-ibis")
        pdbf = ibis_store.read_input_file("PDB.h5", os.path.join(work_dir, "PDB.h5"))
        pdb = pd.read_hdf(os.path.join(work_dir, "PDB.h5"), "merged",
            columns=["sdi", "sfam_id"]).drop_duplicates()
        pdbs = pd.merge(sfams, pdb, on="sdi")

    missing = []
    for i, row in pdbs.iterrows():
        try:
            if not struc_store.exists(
                    get_key(row.sfam_id, row.pdb, row.chain, row.sdi, row.domNo)):
                missing.append(row)
                continue
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            missing.append(row)
            continue
        RealtimeLogger.info(" Found - {} {} {} {} {}".format(
            row.sfam_id, row.pdb, row.chain, row.sdi, row.domNo))

    if len(missing) > 0 and not useExisting:
        RealtimeLogger.info("{} Missing {} entries".format(sfam, len(missing)))
        missing = pd.DataFrame(missing)
        key = "{}_{}.h5".format("observed" if observed else "inferred", int(sfam))
        path = os.path.join(work_dir, key)
        missing.to_hdf(path, "table")
        out_store.write_output_file(path, key)
    elif len(missing) > 0 and useExisting:
        missing = pd.DataFrame(missing)
        file = "missing_{}_{}.h5".format(
            "observed" if observed else "inferred",
            str(datetime.datetime.now()).replace(" ", "-"))
        outfile = os.path.join(work_dir, file)
        missing.to_hdf(outfile, "table")
        out_store.write_output_file(outfile, file)
def merge_inferred_interactome_sfam(job, sfam_id):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    iostore = IOStore.get("aws:us-east-1:molmimic-interfaces")

    sfam_prefix = "{}/_infrows".format(int(sfam_id))
    sfam_file = "{s}/{s}.inferred_interactome".format(s=int(sfam_id))
    merged_file = job.fileStore.getLocalTempFileName()

    to_delete = []
    done_tables = []
    for table_prefix in iostore.list_input_directory(sfam_prefix):
        if table_prefix.endswith("failed"):
            continue
        table = int(os.path.basename(os.path.dirname(table_prefix)).replace("Intrac", ""))
        if table in done_tables:
            continue
        RealtimeLogger.info("Running table sfam {}".format(table_prefix))

        try:
            RealtimeLogger.info("Merge {} {}".format(sfam_id, table))
            table_sfam_prefix = merge_table_sfam(job, sfam_id, table)
            if table_sfam_prefix is None:
                RealtimeLogger.info("Merging failed for {} {}".format(sfam_id, table))
                continue
            table_file = os.path.join(work_dir, os.path.basename(table_sfam_prefix))
            iostore.read_input_file(table_sfam_prefix, table_file)
            for df in pd.read_hdf(unicode(table_file), "table", chunksize=1000):
                df.to_hdf(unicode(merged_file), "table", mode="a", append=True,
                    format="table", table=True, complevel=9, complib="bzip2",
                    min_itemsize=1024)
            to_delete.append(table_sfam_prefix)
        except (IOError, ValueError) as e:
            job.log("Failed to read {} bc {}".format(table_prefix, e))

        try:
            os.remove(table_file)
        except OSError:
            pass

        done_tables.append(table)

    if os.path.isfile(merged_file):
        #Write output file
        iostore.write_output_file(merged_file, sfam_file)

        #Cleanup
        try:
            os.remove(merged_file)
        except OSError:
            pass
        for key in to_delete:
            iostore.remove_file(key)
        rows_prefix = "{}/_infrows".format(int(sfam_id))
        if iostore.exists(rows_prefix):
            for f in iostore.list_input_directory(rows_prefix):
                iostore.remove(f)
    else:
        failed_file = os.path.join(work_dir, "failed_file")
        with open(failed_file, "w") as f:
            print >>f, "No merged_file present"
        iostore.write_output_file(failed_file, sfam_file+".failed")
        try:
            os.remove(failed_file)
        except OSError:
            pass
def merge_interactome_rows(job, sfam_id):
    print "Start merge", sfam_id
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    status = "observed"
    new_cols = ["mol_res", "int_res"]
    resi_prefix = "{}.observed_interactome".format(int(sfam_id))
    data_cols = ["obs_int_id", "mol_sdi", "int_sdi"]

    resi_path = os.path.join(work_dir, resi_prefix)

    #Combine residues into dataframe
    possible_errors = []
    nrows = None
    for nrows, row_prefix in enumerate(
            out_store.list_input_directory("{}/_obsrows/".format(int(sfam_id)))):
        if row_prefix.endswith("failed"):
            continue
        job.log("Running {} {}".format(sfam_id, row_prefix))

        row_file = os.path.join(work_dir, os.path.basename(row_prefix))
        out_store.read_input_file(row_prefix, row_file)

        df = pd.read_hdf(row_file, "table")

        try:
            df.to_hdf(unicode(resi_path), "table", table=True, format="table",
                append=True, mode="a", data_columns=data_cols,
                complib="bzip2", complevel=9, min_itemsize=1024)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            import traceback
            tb = traceback.format_exc()
            job.log("Failed writing {}: {} {}".format(sfam_id, resi_path, tb))
            possible_errors.append(tb)
            continue

        try:
            os.remove(row_file)
        except OSError:
            pass

        out_store.remove_file(row_prefix)

    if os.path.isfile(resi_path):
        #Upload to S3
        out_store.write_output_file(
            resi_path, os.path.join(str(int(sfam_id)), resi_prefix))

        #Cleanup
        os.remove(resi_path)
        print "End merge", sfam_id
    elif nrows is not None:
        job.log("Failed merging: {}".format(resi_path))
        print "Failed merging: {}".format(resi_path)
        fail_file = os.path.join(work_dir, "fail_file")
        with open(fail_file, "w") as f:
            f.write("No rows?")
            for e in possible_errors:
                f.write(e)
                f.write("\n")
        out_store.write_output_file(
            fail_file, "{}/{}.failed".format(int(sfam_id), resi_prefix))
        try:
            os.remove(fail_file)
        except OSError:
            pass
def get_table_sfams(job, mol_sfam_id, table, tableInfStoreID, pdbFileStoreID,
                    taxFileStoreID, sfamFileStoreIDs):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {} sfam {}".format(table, mol_sfam_id))

    pdbFilePath = get_file(job, "PDB.h5", pdbFileStoreID)
    taxFilePath = get_file(job, "pdb_chain_taxonomy.h5", taxFileStoreID)

    # obsFile = get_file(job, "IBIS_observed.h5", in_store)
    #
    # try:
    #     observed_interactome = filter_hdf_chunks("IBIS_observed.h5", "table", "obs_int_id", mol_sfam_id)
    # except (SystemExit, KeyboardInterrupt):
    #     raise
    # except:
    #     try:
    #         observed_interactome = filter_hdf_chunks("IBIS_observed.h5", "table", "obs_int_id", float(mol_sfam_id))
    #     except (SystemExit, KeyboardInterrupt):
    #         raise
    #     except:
    #         job.log("Failed reading IBIS_observed.h5")
    #         return

    sfamFileStoreID = sfamFileStoreIDs[mol_sfam_id]
    obsFilePath = get_file(job, "{}_obs.h5".format(int(mol_sfam_id)),
        sfamFileStoreID, work_dir=work_dir)
    observed_interactome = pd.read_hdf(obsFilePath, "table")
    RealtimeLogger.info("Obs has {} rows".format(observed_interactome.shape))

    # obsFilePath = os.path.join(work_dir, "{0}.observed_interactome".format(int(mol_sfam_id)))
    # out_store.read_input_file("{0}/{0}.observed_interactome".format(int(mol_sfam_id)), obsPath)

    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), tableInfStoreID)

    # skip_int = set([tuple(map(int, os.path.basename(f)[:-3].split("_"))) for f in out_store.list_input_directory(
    #     "{}/_infrows/Intrac{}".format(int(mol_sfam_id), table)) if f.endswith(".h5")])

    try:
        inf_int_ids = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table),
            chunksize=100, nbr_superfam_id=mol_sfam_id)
    except (RuntimeError, TypeError):
        job.log("Unable to find sfam {} in table {}, Skipping".format(mol_sfam_id, table))
        return

    #inf_int_ids = set([tuple(row) for row in inf_int_ids.itertuples()])
    #inf_int_ids -= skip_int
    #print "Starting table sfam", mol_sfam_id, inf_int_ids

    #would this be better to just run as a loop?
    #map_job(job, process_inferred_interaction, list(inf_int_ids), mol_sfam_id, table, tableInfStoreID, pdbFileStoreID, taxFileStoreID)
    try:
        fail_file = os.path.join(work_dir, "fail_file")
        for row in inf_int_ids.iterrows():
            #if tuple(row) in skip_int: continue
            nbr_obs_int_id = row[1]["nbr_obs_int_id"]
            nbr_sdi = row[1]["nbr_sdi_id"]
            mol_sdi = row[1]["mol_sdi_id"]
            inf_int_id_file = "{}_{}_{}_{}".format(row[0], nbr_obs_int_id,
                nbr_sdi, mol_sdi)
            if out_store.exists("{}/_infrows/Intrac{}/{}.inf.h5".format(
                    int(mol_sfam_id), table, inf_int_id_file)):
                continue
            RealtimeLogger.info("Running {}".format(row))
            out = process_inferred_interaction(
                job, row, mol_sfam_id, table, observed_interactome,
                pdbFilePath, taxFilePath, isrow=True, work_dir=work_dir)
            if out is not None:
                inf_int_id, tb = out
                with open(fail_file, "a") as f:
                    f.write(str(inf_int_id))
                    f.write("\n")
                    f.write(tb)

        if os.path.isfile(fail_file):
            out_store.write_output_file(fail_file,
                "{}/_infrows/Intrac{}/failed".format(int(mol_sfam_id), table))
            try:
                os.remove(fail_file)
            except OSError:
                pass

        #job.addFollowOnJobFn(merge_table_sfam, mol_sfam_id, table)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        raise
    finally:
        for f in [tableInfPath, obsFilePath, pdbFilePath, taxFilePath]:
            try:
                os.remove(f)
            except OSError:
                pass
def merge_table_sfam(job, sfam_id, table):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    print "Start merge", sfam_id
    status = "inferred (table {}) {}".format(table, sfam_id)
    resi_prefix = "Intrac{}_{}.inf.h5".format(table, int(sfam_id))
    new_cols = ["mol_res"]
    data_cols = ["nbr_obs_int_id", "nbr_sdi", "mol_sdi", "int_sdi"]

    resi_path = os.path.join(work_dir, resi_prefix)

    #Combine residues into dataframe
    possible_errors = []
    to_delete = []
    row_key = "{}/_infrows/Intrac{}".format(int(sfam_id), table)
    RealtimeLogger.info("Merging rows from {}".format(row_key))
    for row_prefix in out_store.list_input_directory(row_key):
        if not row_prefix.endswith(".inf.h5"):
            continue
        RealtimeLogger.info("Running {} {}".format(sfam_id, row_prefix))

        row_file = os.path.join(work_dir, os.path.basename(row_prefix))
        try:
            out_store.read_input_file(row_prefix, row_file)
            df = pd.read_hdf(row_file, "table")
            for col, _ in df.dtypes[df.dtypes == 'int64'].iteritems():
                df[col] = df[col].astype(float)
            df.to_hdf(unicode(resi_path), "table", table=True, format="table",
                append=True, mode="a", data_columns=data_cols,
                complib="bzip2", complevel=9, min_itemsize=1024)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            import traceback
            tb = traceback.format_exc()
            job.log("Failed writing {}: {} {}".format(sfam_id, resi_path, tb))
            possible_errors.append(tb)

        try:
            os.remove(row_file)
        except OSError:
            pass

        #to_delete.append(row_prefix)

    if os.path.isfile(resi_path):
        #Upload to S3
        outprefix = "{}/_inftables/{}".format(int(sfam_id), resi_prefix)
        out_store.write_output_file(resi_path, outprefix)

        #Cleanup
        os.remove(resi_path)
        for key in to_delete:
            out_store.remove_file(key)
        print "End merge", sfam_id, table
        return outprefix
    else:
        job.log("Failed merging: {}".format(resi_path))
        print "Failed merging: {}".format(resi_path)
        fail_file = os.path.join(work_dir, "fail_file")
        with open(fail_file, "w") as f:
            f.write("no rows?\n")
            for e in possible_errors:
                f.write(e)
                f.write("\n")
        out_store.write_output_file(fail_file,
            "{}/_inftables/{}.failed".format(int(sfam_id), resi_prefix))
def process_inferred_interaction(job, inf_int_id, nbr_sfam, table, tableInfStoreID,
                                 pdbFileStoreID, taxFileStoreID, isrow=False, work_dir=None):
    if work_dir is None:
        work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    inf_int_id_file = os.path.basename(job.fileStore.getLocalTempFileName())

    try:
        if not isrow:
            inf_int_id_file = "_".join(map(str, inf_int_id))
            inf_int_id, nbr_obs_int_id, nbr_sdi, mol_sdi = inf_int_id

            tableInfFile = get_file(job, "IBIS_inferred_{}.h5".format(table), tableInfStoreID)
            inferred_interfaces = filter_hdf_chunks(tableInfFile, "Intrac{}".format(table),
                nbr_obs_int_id=nbr_obs_int_id,
                nbr_sdi_id=nbr_sdi,
                mol_sdi_id=mol_sdi)
            inferred_interfaces = inferred_interfaces.loc[inf_int_id].to_frame().T
        else:
            inferred_interfaces = inf_int_id[1].to_frame().T
            #inf_int_id = inferred_interfaces[""]
            nbr_obs_int_id = inf_int_id[1]["nbr_obs_int_id"]
            nbr_sdi = inf_int_id[1]["nbr_sdi_id"]
            mol_sdi = inf_int_id[1]["mol_sdi_id"]
            inf_int_id_file = "{}_{}_{}_{}".format(inf_int_id[0], nbr_obs_int_id,
                nbr_sdi, mol_sdi)

        inferred_interfaces["mol_sdi_id"] = inferred_interfaces["mol_sdi_id"].astype(float)

        if inferred_interfaces.shape[0] == 0:
            return

        pdb_file = get_file(job, "PDB.h5", pdbFileStoreID) if not isrow else pdbFileStoreID
        tax_file = get_file(job, "pdb_chain_taxonomy.h5", taxFileStoreID) if not isrow else taxFileStoreID

        try:
            struct_domains = filter_hdf(
                pdb_file, "merged",
                columns=["sdi", "domNo", "gi", "pdbId", "chnLett", "from", "to", "sfam_id"],
                sdi=mol_sdi).drop_duplicates()
        except RuntimeError:
            job.log("SDI {} is obsolete".format(mol_sdi))
            return

        pdbs = struct_domains["pdbId"]
        taxa = pd.read_hdf(tax_file, "table", where="pdbId in pdbs")
        struct_domains = pd.merge(struct_domains, taxa, on=["pdbId", "chnLett"], how="left")

        #Add PDB, chain, and sdi information. Inner join to only allow sdi's
        #that are in databases, not obsolete
        inferred_interfaces = pd.merge(inferred_interfaces, struct_domains, how="inner",
            left_on="mol_sdi_id", right_on="sdi")

        #RealtimeLogger.info("INTERFACE: {}".format(inferred_interfaces.iloc[0]))

        if isrow:
            #RealtimeLogger.info("tableInfStoreID is {} {}".format(type(tableInfStoreID), nbr_obs_int_id))
            #RealtimeLogger.info("{}".format(tableInfStoreID.iloc[0]))
            observed_interactome = tableInfStoreID[tableInfStoreID["obs_int_id"] == nbr_obs_int_id]
            #RealtimeLogger.info("Got obs file from row")
        else:
            obspath = job.fileStore.getLocalTempFileName()
            out_store.read_input_file("{0}/{0}.observed_interactome".format(nbr_sfam), obspath)
            raise RuntimeError("Not a row!")
            try:
                observed_interactome = filter_hdf_chunks(obspath, "table", "obs_int_id", nbr_obs_int_id)
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                try:
                    observed_interactome = filter_hdf_chunks(obspath, "table", "obs_int_id", float(nbr_obs_int_id))
                except (SystemExit, KeyboardInterrupt):
                    raise
                except:
                    job.log("Failed reading {}".format(obspath))
                    raise

        observed_interactome = observed_interactome.rename(columns={
            'mol_pdb': 'nbr_pdb',
            'mol_chain': 'nbr_chain',
            'mol_domNo': 'nbr_domNo',
            'mol_res': 'nbr_res',
            'mol_gi_x': 'nbr_gi',
            #'mol_taxid': 'nbr_taxid',
            #'mol_superfam_id': 'nbr_superfam_id'
        })

        #Add in neighbor information from observed interactome
        inferred_interfaces["nbr_obs_int_id"] = inferred_interfaces["nbr_obs_int_id"].astype(int)
        inferred_interfaces = pd.merge(inferred_interfaces, observed_interactome, how="left",
            left_on="nbr_obs_int_id", right_on="obs_int_id", suffixes=["_inf", "_obs"])
        del observed_interactome

        #RealtimeLogger.info("INTERFACE: {}".format(inferred_interfaces.iloc[0]))

        #Select relevant columns
        if "int_superfam_id" in inferred_interfaces.columns:
            int_superfam_id_col = "int_superfam_id"
        elif "int_superfam_id_inf" in inferred_interfaces.columns:
            int_superfam_id_col = "int_superfam_id_inf"
        elif "int_superfam_id_x" in inferred_interfaces.columns:
            #Suffix not working?
            int_superfam_id_col = "int_superfam_id_x"
        else:
            raise RuntimeError("Merge failed for obs and inf: {}".format(inferred_interfaces.columns))

        try:
            inferred_interfaces = inferred_interfaces[[
                "sdi", "pdbId", "chnLett", "domNo", "from", "to", "resn", "resi",
                "taxid", "gi", "sfam_id", "nbr_sdi_id", "nbr_pdb", "nbr_chain",
                "nbr_domNo", "nbr_res", "nbr_taxid", "nbr_gi", "nbr_superfam_id",
                "nbr_obs_int_id", "nbr_score", "int_sdi_id", "int_pdb", "int_chain",
                "int_domNo", "int_res", "int_sdi_from", "int_sdi_to", "int_taxid",
                "int_gi_x", "int_gi_y", int_superfam_id_col]]
        except KeyError as e:
            job.log("Unable to filter df. Columns are: {}. Error is: {}".format(
                inferred_interfaces.columns, e))
            raise

        #Rename columns
        inferred_interfaces = inferred_interfaces.rename(columns={
            "sdi": "mol_sdi_id",
            "pdbId": "mol_pdb",
            "chnLett": "mol_chain",
            "domNo": "mol_domNo",
            "from": "mol_sdi_from",
            "to": "mol_sdi_to",
            "resn": "mol_resn",
            "resi": "mol_resi",
            "taxid": "mol_taxid",
            "gi": "mol_gi",
            "sfam_id": "mol_superfam_id",
            int_superfam_id_col: "int_superfam_id",
        })

        try:
            inferred_interfaces = inferred_interfaces[ \
                ~inferred_interfaces["mol_sdi_id"].isnull() & \
                ~inferred_interfaces["int_sdi_id"].isnull()]
        except KeyError as e:
            job.log("Unable to drop na. Columns are: {}. Error is: {}".format(
                inferred_interfaces.columns, e))
            raise

        # taxa = []
        #
        # for row in inferred_interfaces.itertuples():
        #     #Should only be one row, but maybe not
        #     try:
        #         taxas = filter_hdf(tax_file, "table",
        #             pdbId = row.mol_pdb,
        #             chnLett = row.mol_chain)
        #         taxa.append(float(taxas.iloc[0]["taxId"]))
        #     except (KeyboardInterrupt, SystemExit):
        #         raise
        #     else:
        #         taxa.append(np.NaN)
        # taxa = pd.Series(taxa, index=inferred_interfaces.index)
        # inferred_interfaces = inferred_interfaces.assign(mol_taxid=taxa)

        try:
            resi = pd.Series([decode_residues(job, row.mol_pdb, row.mol_chain, row.mol_resi, row) \
                for row in inferred_interfaces.itertuples()],
                index=inferred_interfaces.index)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            import traceback
            tb = traceback.format_exc()
            job.log("FAILED {}".format(tb))
            print tb
            raise
            resi = inferred_interfaces["mol_resi"].copy()

        inferred_interfaces = inferred_interfaces.assign(mol_res=resi)
        del resi
        del inferred_interfaces["mol_resi"]
        del inferred_interfaces["mol_resn"]

        str_cols = ["mol_pdb", "mol_chain", "mol_res", "int_pdb", "int_chain",
                    "int_res", "nbr_pdb", "nbr_chain", "nbr_res"]

        #RealtimeLogger.info("INTERFACE: {}".format(inferred_interfaces.iloc[0]))

        for col in inferred_interfaces.columns:
            inferred_interfaces[col] = inferred_interfaces[col].astype(
                str if col in str_cols else float)

        mol_sfams = inferred_interfaces["mol_superfam_id"].drop_duplicates()

        if len(mol_sfams) == 0:
            return
        elif len(mol_sfams) == 1:
            #Write to HDF file
            df_file = job.fileStore.getLocalTempFileName()
            inferred_interfaces.to_hdf(unicode(df_file), "table", format="table",
                table=True, complevel=9, complib="bzip2", min_itemsize=1024,
                data_columns=["nbr_obs_int_id", "nbr_sdi_id", "mol_sdi_id", "int_sdi_id"])
            job.log("Wrote "+df_file)
            df_files = [df_file]

            #Add ibis info into out store
            out_store.write_output_file(df_file, "{}/_infrows/Intrac{}/{}.inf.h5".format(
                int(mol_sfams[0]), table, inf_int_id_file))
        else:
            df_files = []
            for i, inf_row in inferred_interfaces.iterrows():
                mol_sfam = inf_row["mol_superfam_id"]
                inf_row = inf_row.to_frame().T
                for col in inf_row.columns:
                    inf_row[col] = inf_row[col].astype(str if col in str_cols else float)

                #Write to HDF file
                df_file = job.fileStore.getLocalTempFileName()
                inf_row.to_hdf(unicode(df_file), "table", format="table", table=True,
                    complevel=9, complib="bzip2", min_itemsize=1024,
                    data_columns=["nbr_obs_int_id", "nbr_sdi_id", "mol_sdi_id", "int_sdi_id"])
                job.log("Wrote "+df_file)
                df_files.append(df_file)

                #Add ibis info into out store
                out_store.write_output_file(df_file, "{}/_infrows/Intrac{}/{}.inf.h5".format(
                    int(mol_sfam), table, inf_int_id_file))
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        import traceback
        tb = traceback.format_exc()
        job.log("FAILED {} {}".format(nbr_sfam, tb))

        if not isrow:
            fail_file = os.path.join(work_dir, "fail_file")
            with open(fail_file, "w") as f:
                f.write(str(inf_int_id))
                f.write("\n")
                f.write(tb)
            out_store.write_output_file(fail_file, "{}/_infrows/Intrac{}/{}.failed".format(
                int(nbr_sfam), table, inf_int_id_file))
            try:
                os.remove(fail_file)
            except OSError:
                pass
        else:
            return inf_int_id, tb

        try:
            for f in (df_file, pdb_file, tableInfFile):
                try:
                    os.remove(f)
                except OSError:
                    pass
        except:
            pass
    finally:
        try:
            files = df_files if isrow else df_files+[pdb_file, tableInfFile]
            for f in files:
                try:
                    os.remove(f)
                except OSError:
                    pass
        except:
            pass
def calculate_features(job, pdb_or_key, sfam_id=None, chain=None, sdi=None,
                       domNo=None, work_dir=None):
    from molmimic.common.featurizer import ProteinFeaturizer

    if work_dir is None and job is not None:
        work_dir = job.fileStore.getLocalTempDir()

    if work_dir is None or not os.path.isdir(work_dir):
        work_dir = os.getcwd()

    in_store = IOStore.get("aws:us-east-1:molmimic-full-structures")
    out_store = IOStore.get("aws:us-east-1:molmimic-structure-features")

    if [sfam_id, chain, sdi, domNo].count(None) == 0:
        #pdb_or_key is pdb
        pdb = pdb_or_key
        key = "{}/{}/{}_{}_sdi{}_d{}".format(int(sfam_id), pdb.lower()[1:3],
            pdb.upper(), chain, sdi, domNo)
    else:
        #pdb_or_key is key
        assert pdb_or_key.count("_") == 3
        key = os.path.splitext(pdb_or_key)[0]
        pdb, chain, sdi, domNo = os.path.basename(key).split("_")
        sdi, domNo = sdi[3:], domNo[1:]

    try:
        pdb_path = os.path.join(work_dir, os.path.basename(key) + ".pdb")
        in_store.read_input_file(key + ".pdb", pdb_path)

        s = ProteinFeaturizer(pdb_path, pdb, chain, sdi=sdi, domNo=domNo,
            work_dir=work_dir, job=job)

        _, atom_features = s.calculate_flat_features()
        RealtimeLogger.info("Finished atom features")
        _, residue_features = s.calculate_flat_features(course_grained=True)
        RealtimeLogger.info("Finished residue features")
        graph_features = s.calculate_graph()
        RealtimeLogger.info("Finished edge features")

        out_store.write_output_file(atom_features, key + "_atom.npy")
        out_store.write_output_file(residue_features, key + "_residue.npy")
        out_store.write_output_file(graph_features, key + "_edges.gz")

        for f in (pdb_path, atom_features, residue_features, graph_features):
            try:
                os.remove(f)
            except OSError:
                pass
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        fail_key = "{}_error.fail".format(key)
        fail_file = os.path.join(work_dir, os.path.basename(key))
        with open(fail_file, "w") as f:
            f.write("{}\n".format(e))
        out_store.write_output_file(fail_file, fail_key)
        os.remove(fail_file)
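#Hedged sketch of the two calling conventions calculate_features accepts
#(hypothetical identifiers, not part of the original workflow).
def _example_calculate_features(job):
    #1) Pass a store key; pdb/chain/sdi/domNo are parsed from its basename
    calculate_features(job, "299845/ab/1ABC_A_sdi123456_d1.pdb")
    #2) Pass the PDB id plus explicit identifiers; the key is rebuilt internally
    calculate_features(job, "1abc", sfam_id=299845, chain="A", sdi=123456, domNo=1)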
def process_observed_interaction(job, int_id, sfam_id, ibisFileStoreID, pdbFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    mmdb_path = get_file(job, "PDB.h5", pdbFileStoreID)
    ibis_path = get_file(job, "IBIS_obs.h5", ibisFileStoreID)

    row = filter_hdf(ibis_path, "ObsInt", "obs_int_id", int_id)

    try:
        #Read in face1 residues
        face1 = filter_hdf(unicode(ibis_path), "MolResFace", "obs_int_id", int_id)
        face1.columns = ["obs_int_id", "mol_res"]
        print face1.shape

        #Keep entries from current CDD
        row = pd.merge(row, face1, how="left", on="obs_int_id")
        del face1

        #Read in face2 residues and convert gzipped asn1 into res numbers
        face2 = filter_hdf(unicode(ibis_path), "IntResFace", "obs_int_id", int_id)
        face2.columns = ["obs_int_id", "int_res"]
        print face2.shape

        #Keep entries from current CDD
        row = pd.merge(row, face2, how="left", on="obs_int_id")
        print row.shape
        del face2

        try:
            st_domain_mol = filter_hdf(unicode(mmdb_path), "StructuralDomains",
                "sdi", row.iloc[0]['mol_sdi_id'])
            st_domain_mol.columns = [
                'mol_sdi_id', 'mol_domNo', 'mol_gi', 'mol_pdb', 'mol_chain',
                'mol_sdi_from', 'mol_sdi_to'
            ]
            row = pd.merge(row, st_domain_mol, how="left", on="mol_sdi_id")
            del st_domain_mol

            st_domain_int = filter_hdf(unicode(mmdb_path), "StructuralDomains",
                "sdi", row.iloc[0]['int_sdi_id'])
            st_domain_int.columns = [
                'int_sdi_id', 'int_domNo', 'int_gi', 'int_pdb', 'int_chain',
                'int_sdi_from', 'int_sdi_to'
            ]
            row = pd.merge(row, st_domain_int, how="left", on="int_sdi_id")
            del st_domain_int
        except TypeError:
            #SDI's don't exist, must be obsolete
            print "Done row", int_id
            return

        updated_resi = {"mol_res": [], "int_res": []}

        for resi in row.itertuples():
            try:
                updated_resi["mol_res"].append(
                    decode_residues(job, resi.mol_pdb, resi.mol_chain, resi.mol_res, resi))
                updated_resi["int_res"].append(
                    decode_residues(job, resi.int_pdb, resi.int_chain, resi.int_res, resi))
            except InvalidSIFTS:
                #This row has failed converting binary to string, skip it
                updated_resi["mol_res"].append(np.NaN)
                updated_resi["int_res"].append(np.NaN)
                continue

        if pd.Series(updated_resi["mol_res"]).dropna().size > 0:
            row = row.assign(**updated_resi)
            row = row.dropna()
        else:
            #This entire interaction failed; return None
            return None

        path = "{}.h5".format(int_id)
        row.to_hdf(path, "table", table=True, format="table",
            complib="bzip2", complevel=9, min_itemsize=1024)
        out_store.write_output_file(
            path, "{}/_obsrows/{}".format(int(sfam_id), path))
        print "Done row", int_id

        try:
            os.remove(path)
        except OSError:
            pass
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        import traceback
        tb = traceback.format_exc()
        job.log("FAILED {} {} {}".format(int_id, e, tb))
        fail_file = os.path.join(work_dir, "fail_file")
        with open(fail_file, "w") as f:
            f.write(str(e))
            f.write(str(tb))
        out_store.write_output_file(
            fail_file, "{}/_obsrows/{}.failed".format(int(sfam_id), int_id))
        try:
            os.remove(fail_file)
        except OSError:
            pass