Example #1
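These excerpts come from molmimic's Toil-based pipeline modules, written in Python 2 (print statements, iteritems, unicode), and rely on module-level imports that the listing omits. A minimal sketch of what the functions below appear to assume; the molmimic-internal helper names are inferred from usage, not confirmed against the package:

import os
import re
import zipfile
import datetime
from shutil import copyfileobj
from StringIO import StringIO

import requests
import numpy as np
import pandas as pd
import dask.dataframe as dd
from joblib import Parallel, delayed
from Bio import SeqIO

from toil.realtimeLogger import RealtimeLogger

#Assumed molmimic-internal helpers, inferred from usage in the examples:
#IOStore, map_job, map_job_rv, get_file, filter_hdf, filter_hdf_chunks,
#decode_residues, InvalidSIFTS, copy_pdb_h5, setup_dask, PdbToMmtfFull,
#process_domain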
def convert_pdb_to_mmtf(job,
                        sfam_id,
                        jobStoreIDs=None,
                        clustered=True,
                        preemptable=True):
    raise NotImplementedError()  #NOTE: stubbed out; everything below is currently unreachable

    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    clustered = "clustered" if clustered else "full"

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    #Download all with same sfam
    if jobStoreIDs is None:
        in_store = IOStore.get("{}:molmimic-{}-structures".format(prefix,
                                                                  clustered))
        for f in in_store.list_input_directory(sfam_id):
            if f.endswith(".pdb"):
                in_store.read_input_file(f, os.path.join(work_dir, f))
    else:
        for jobStoreID in jobStoreIDs:
            job.fileStore.readGlobalFile(jobStoreID, userPath=pdb_path)

    mmtf_path = os.path.join(work_dir, "mmtf")
    if not os.path.isdir(mmtf_path):
        os.makedirs(mmtf_path)

    PdbToMmtfFull(pdb_path, mmtf_path, work_dir=work_dir, job=job)

    out_store = IOStore.get("{}:molmimic-{}-mmtf".format(prefix, clustered))
    out_store.write_output_directory(mmtf_path, sfam_id)
def start_toil(job):
    print "Starting job"
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    int_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    #Download PDB info
    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    #Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_file)

    #Download PDB Taxonomy information
    tax_file = os.path.join(work_dir, "pdb_chain_taxonomy.h5")
    in_store.read_input_file("pdb_chain_taxonomy.h5", tax_file)

    #Add tax info into local job store
    taxFileStoreID = job.fileStore.writeGlobalFile(tax_file)

    tables = set(range(1,87))-set([51])

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
        ["sfam_id"]).drop_duplicates().dropna()["sfam_id"].sort_values()
    #RealtimeLogger.info("SFAMS: {}".format(sfams.shape[0]))
    sfamFileStoreIDs = {}
    for s in sfams:
        k = "{0}/{0}.observed_interactome".format(int(s))
        if int_store.exists(k):
            RealtimeLogger.info("Loading {}".format(s))
            f = job.fileStore.getLocalTempFileName()
            int_store.read_input_file(k, f)
            sfamFileStoreIDs[int(s)] = job.fileStore.writeGlobalFile(f)
            os.remove(f)
        else:
            RealtimeLogger.info("FAILED Loading {}".format(s))

    assert len(sfamFileStoreIDs) > 0

    os.remove(tax_file)
    os.remove(pdb_file)

    job.log("Running tables: {}".format(tables))
    j = job
    for table in tables:
        #Chain the follow-ons so the final merge runs only after every table
        j = j.addFollowOnJobFn(get_inferred_structural_interactome_by_table,
            table, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    # map_job(job, get_inferred_structural_interactome_by_table, tables,
    #     pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)
    j.addFollowOnJobFn(merge_inferred_interactome, pdbFileStoreID)
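A minimal sketch of how a root job such as start_toil might be launched; this is stock Toil boilerplate (the Job.Runner argument parser plus the Toil context manager), nothing molmimic-specific:

from toil.common import Toil
from toil.job import Job

if __name__ == "__main__":
    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    with Toil(options) as workflow:
        workflow.start(Job.wrapJobFn(start_toil))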
def calculate_features_for_sfam(job, sfam_id, further_parallelize=False):
    work_dir = job.fileStore.getLocalTempDir()
    pdb_store = IOStore.get("aws:us-east-1:molmimic-full-structures")
    out_store = IOStore.get("aws:us-east-1:molmimic-structure-features")

    extensions = set(["atom.npy", "residue.npy", "edges.gz"])
    #Feature-file suffixes already present in the output store for a given key
    done_files = lambda k: set([f.rsplit("_", 1)[1] for f in \
        out_store.list_input_directory(k)])
    #Only queue PDBs that are missing at least one of the three feature files
    pdb_keys = [k for k in pdb_store.list_input_directory(str(int(sfam_id))) if \
        k.endswith(".pdb") and extensions != done_files(os.path.splitext(k)[0])]

    if further_parallelize:
        map_job(job, calculate_features, pdb_keys)
    else:
        for pdb_key in pdb_keys:  #pdb_store.list_input_directory(int(sfam_id)):
            calculate_features(job, pdb_key, work_dir=work_dir)
Example #4
def get_sfam_ddi_sizes(job, sfam_id, observed=True):
    int_type = "observed" if observed else "inferred"
    work_dir = job.fileStore.getLocalTempDir()
    interface_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    interfaces_key = "{s}/{s}.{o}_interactome".format(s=sfam_id, o=int_type)
    interfaces_file = os.path.join(work_dir, os.path.basename(interfaces_key))
    interface_store.read_input_file(interfaces_key, interfaces_file)

    interfaces = pd.read_hdf(interfaces_file, "table")

    RealtimeLogger.info("COLS: {}".format(interfaces.columns))
    counts = interfaces.fillna(-1.).groupby(
        ["mol_superfam_id",
         "int_superfam_id"]).size().reset_index(name="count")

    RealtimeLogger.info("SIZES :{}".format(counts))

    try:
        os.remove(interfaces_file)
    except OSError:
        pass

    return counts
Example #5
def best_sfams(job, all_counts, max_sfams=300):
    import json
    work_dir = job.fileStore.getLocalTempDir()
    out_store = IOStore.get("aws:us-east-1:molmimic-ddi")

    #Merge into one dataframe
    counts = pd.concat(all_counts)

    #mol->int should be same as int->mol: remove dupes
    ddi_counts = {}
    for row in counts.itertuples():
        ddi = tuple(
            map(int, sorted((row.mol_superfam_id, row.int_superfam_id))))
        if ddi in ddi_counts:
            RealtimeLogger.info("{} {}, are counts symmetrical? {}".format(
                ddi[0], ddi[1],
                "Yes" if ddi_counts[ddi] == row.count else "No"))
            continue
        ddi_counts[ddi] = row.count

    sfams = sorted(ddi_counts.iteritems(), key=lambda x: x[1], reverse=True)
    RealtimeLogger.info("sfams is {}".format(sfams))
    sfam_file = os.path.join(work_dir, "sorted_sfams.json")
    with open(sfam_file, "w") as f:
        json.dump(sfams, f)
    out_store.write_output_file(sfam_file, "sorted_sfams.json")

    return sfams[:max_sfams]
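best_sfams canonicalizes each domain-domain pair by sorting the two superfamily ids before using them as a dictionary key, so (A, B) and (B, A) collapse to one entry. A tiny illustration with made-up ids:

ddi = tuple(map(int, sorted((2001.0, 1001.0))))
assert ddi == (1001, 2001)  #same key regardless of mol/int order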
Example #6
def process_sfams(job, max_sfams=300, memory="1G"):
    import json
    from collections import defaultdict

    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-ddi")

    json_file = os.path.join(work_dir, "sorted_sfams.json")
    store.read_input_file("sorted_sfams.json", json_file)

    with open(json_file) as f:
        counts = json.load(f)

    sfams = defaultdict(set)
    num_ddi = 0
    for i, ((mol_sfam, int_sfam), count) in enumerate(counts):
        RealtimeLogger.info("{}: {}-{}".format(i, mol_sfam, int_sfam))
        if -1 in map(int, (mol_sfam, int_sfam)):
            RealtimeLogger.info("SKIPPED")
            continue

        if count < 90:
            RealtimeLogger.info("SKIPPED count is less than 90")
            continue

        #if i<30: continue
        # and (max_sfams is None or num_ddi<max_sfams) and \
        #mol_sfam not in sfams[int_sfam]:
        if mol_sfam != int_sfam:
            #The original first two branches were identical and the final
            #else branch was unreachable, so this collapses to two cases
            if mol_sfam not in sfams and int_sfam in sfams:
                sfams[int_sfam].add(mol_sfam)
                RealtimeLogger.info("Added {}: {}-{}".format(
                    i, int_sfam, mol_sfam))
            else:
                sfams[mol_sfam].add(int_sfam)
                RealtimeLogger.info("Added {}: {}-{}".format(
                    i, mol_sfam, int_sfam))
            num_ddi += 1
            #break

    RealtimeLogger.info("{} starting domains".format(len(sfams)))
    #mol_sfam = list(sfams.keys())[pd.np.random.randint(len(sfams))]
    #int_sfams = sfams[mol_sfam]
    #job.addChildJobFn(process_sfam, mol_sfam, int_sfams)

    for mol_sfam, int_sfams in sfams.iteritems():
        job.addChildJobFn(process_sfam, mol_sfam, int_sfams)
Example #7
def process_sfam(job, sfam_id, pdbFileStoreID, cores=1):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-full-structures".format(prefix))

    sdoms_file = copy_pdb_h5(job, pdbFileStoreID)

    sdoms = pd.read_hdf(unicode(sdoms_file),
                        "merged")  #, where="sfam_id == {}".format(sfam_id))
    # skip_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "keep.csv")
    # if os.path.isfile(skip_file):
    #     skip = pd.read_csv(skip_file)
    #     sdoms = sdoms[sdoms["sdi"].isin(skip["sdi"])]

    sdoms = sdoms[sdoms["sfam_id"] == float(
        sfam_id)]["sdi"].drop_duplicates().dropna()
    #sdoms = sdoms[:1]

    if cores > 2:
        #Only makes sense for slurm or other bare-metal clusters
        setup_dask(cores)
        d_sdoms = dd.from_pandas(sdoms, npartitions=cores)
        RealtimeLogger.info("Running sfam dask {}".format(sdoms))
        processed_domains = d_sdoms.apply(
            lambda row: process_domain(job, row.sdi, sdoms_file),
            axis=1).compute()
    else:
        processed_domains = job.addChildJobFn(map_job_rv,
                                              process_domain,
                                              sdoms,
                                              pdbFileStoreID,
                                              preemptable=True).rv()

    return processed_domains
Example #8
def start_toil(job, memory="1G"):
    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-ddi")
    if not store.exists("sorted_sfams.json"):
        interface_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
        observed = set(key.split("/")[0] for key in interface_store.list_input_directory() \
            if key.endswith("observed_interactome"))
        #inferred = set(key.split("/")[0] for key in keys if key.endswith("inferred_interactome"))

        interface_counts = [
            job.addChildJobFn(get_sfam_ddi_sizes, o).rv() for o in observed
        ]
        merge_job = job.addFollowOnJobFn(best_sfams, interface_counts)
    else:
        merge_job = job

    merge_job.addFollowOnJobFn(process_sfams)
Example #9
def get_missing(job, observed=True):
    import datetime
    work_dir = job.fileStore.getLocalTempDir()

    file = "missing_{}_{}.h5".format(
        "observed" if observed else "inferred",
        str(datetime.datetime.now()).replace(" ", "-"))
    outfile = os.path.join(work_dir, file)

    store = IOStore.get("aws:us-east-1:molmimic-missing-structures")

    to_remove = []
    for k in store.list_input_directory(
            "observed" if observed else "inferred"):
        path = os.path.join(work_dir, k)
        store.read_input_file(k, path)
        df = pd.read_hdf(path, "table", mode="r")
        df.to_hdf(outfile,
                  "table",
                  table=True,
                  format="table",
                  append=True,
                  mode="a",
                  complib="bzip2",
                  complevel=9)
        to_remove.append(k)
        try:
            os.remove(path)
        except OSError:
            pass

    if not os.path.isfile(outfile):
        df = pd.DataFrame()
        df.to_hdf(outfile,
                  "table",
                  table=True,
                  format="table",
                  append=True,
                  mode="a",
                  complib="bzip2",
                  complevel=9)

    store.write_output_file(outfile, file)

    try:
        os.remove(outfile)
    except OSError:
        pass

    for f in to_remove:
        try:
            store.remove_file(f)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            pass
Example #10
def start_toil(job, pdbFileStoreID=None):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-ibis".format(prefix))
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    if pdbFileStoreID is None:
        #Download PDB info
        pdb_path = os.path.join(work_dir, "PDB.h5")
        in_store.read_input_file("PDB.h5", pdb_path)

        #Add pdb info into local job store
        pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_path)
    else:
        pdb_path = job.fileStore.readGlobalFile(pdbFileStoreID)

    ibis_obs_prefix = "IBIS_observed.h5"
    ibis_obs_path = os.path.join(work_dir, ibis_obs_prefix)
    in_store.read_input_file(ibis_obs_prefix, ibis_obs_path)

    #Add ibis info into local job store
    ibisObsFileStoreID = job.fileStore.writeGlobalFile(ibis_obs_path)

    #Choose which superfamilies to run, skip those already present
    skip_sfam = set([float(f.split("/", 1)[0]) for f in out_store.list_input_directory() \
        if f.endswith(".observed_interactome")])
    pdb = filter_hdf_chunks(unicode(ibis_obs_path),
                            "ObsInt",
                            columns=["mol_superfam_id"]).drop_duplicates()
    sfams = pdb[~pdb["mol_superfam_id"].isin(skip_sfam)][
        "mol_superfam_id"].drop_duplicates().dropna().astype(int)
    print "Will run a total of {} SFAMS".format(len(sfams))

    #Run all superfamilies
    map_job(job, get_observed_structural_interactome, sfams, pdbFileStoreID,
            ibisObsFileStoreID)

    #Cleanup
    job.addFollowOnJobFn(cleanup)
    os.remove(ibis_obs_path)
    os.remove(pdb_path)
def cleanup(job, sfam_ids):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    for sfam_id in sfam_ids:
        infrows = store.list_input_directory("{}/_infrows".format(int(sfam_id)))
        inftables = store.list_input_directory("{}/_inftables".format(int(sfam_id)))
        finished_tables = [k.split("/")[-1].split("_")[0] for k in inftables]
        for k in infrows:
            if k.split("/")[2] in finished_tables:
                store.remove_file(k)
Example #12
def compare_sfams(job, useExisting=False, observed=True):
    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-missing-structures")
    all_missing = "missing_{}.h5".format(
        "observed" if observed else "inferred")
    all_missing_f = os.path.join(work_dir, all_missing)

    if not useExisting or not store.exists(all_missing):
        inf_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
        ending = ".{}_interactome".format(
            "observed" if observed else "inferred")
        sfams = [k.split("/",1)[0] for k in inf_store.list_input_directory() \
            if k.endswith(ending)]
    else:
        store.read_input_file(all_missing, all_missing_f)
        #The per-sfam missing tables are written with a "sfam_id" column (see compare_sfam)
        sfams = pd.read_hdf(all_missing_f, "table",
                            columns=["sfam_id"])["sfam_id"].drop_duplicates()

    map_job(job, compare_sfam, sfams)

    job.addFollowOnJobFn(get_missing, observed=observed)
Example #13
def cleanup(job):
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-interfaces".format(prefix))
    keys = list(in_store.list_input_directory())
    finished = set(
        key.split("/")[0] for key in keys
        if key.endswith("observed_interactome"))
    failed = set(
        key.split("/")[0] for key in keys
        if "_obsrows" in key and key.endswith("failed"))

    for key in failed - finished:
        in_store.remove_file(key)
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    #Read in H5 for entire table
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    sfams = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table),
        columns=["nbr_superfam_id"]).drop_duplicates().dropna()
    skip_sfam = set([s for s in sfams["nbr_superfam_id"] if \
        out_store.exists("{0}/{0}.inferred_interactome".format(int(s))) or \
        not out_store.exists("{0}/{0}.observed_interactome".format(int(s)))])

    # skip_sfam = set([int(f.split("/", 1)[0]) for f in out_store.list_input_directory() \
    #    if f.endswith(".inferred_interactome")])

    sfams = sfams[~sfams["nbr_superfam_id"].isin(skip_sfam)]
    sfams = sfams["nbr_superfam_id"].drop_duplicates().dropna().astype(int).tolist()

    # partial_sfams = set(int(k.split("/")[0]) for sfam in sfams for k in \
    #     out_store.list_input_directory(
    #         "{sfam}/_inftables/Intrac{table}_{sfam}.inferred_interactome".format( \
    #         sfam=sfam, table=table)) if not k.endswith("failed"))

    #sfams = list(set(sfams)-partial_sfams)

    if len(sfams) > 0:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    try:
        os.remove(tableInfPath)
    except OSError:
        pass
def start_toil(job):
    import pandas as pd
    work_dir = job.fileStore.getLocalTempDir()
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")

    pdb_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", pdb_file)

    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=[
        "sfam_id"
    ]).drop_duplicates().dropna()["sfam_id"].sort_values()

    #sfams = [299845.0]

    map_job(job, calculate_features_for_sfam, sfams)

    os.remove(pdb_file)
Example #16
def start_toil(job, name="prep_protein"):
    """Start the workflow to process PDB files"""
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-ibis".format(prefix))

    #Download PDB info
    sdoms_file = os.path.join(work_dir, "PDB.h5")
    in_store.read_input_file("PDB.h5", sdoms_file)

    #Add pdb info into local job store
    pdbFileStoreID = job.fileStore.writeGlobalFile(sdoms_file)

    #Get all unique superfamilies
    sdoms = pd.read_hdf(unicode(sdoms_file), "merged")

    # skip_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "keep.csv")
    # if os.path.isfile(skip_file):
    #     skip = pd.read_csv(skip_file)
    #     sdoms = sdoms[sdoms["sdi"].isin(skip["sdi"])]
    #     RealtimeLogger.info("SKIPPING {} sdis; RUNIING {} sdis".format(skip.shape[0], sdoms.shape[0]))
    #
    sfams = sdoms["sfam_id"].drop_duplicates().dropna()
    #sfams = sfams[:1]
    #sfams = ["653504"]

    # max_cores = job.fileStore.jobStore.config.maxCores if \
    #     job.fileStore.jobStore.config.maxCores > 2 else \
    #     job.fileStore.jobStore.config.defaultCores

    max_cores = job.fileStore.jobStore.config.defaultCores
    #Add jobs for each sfam
    job.addChildJobFn(map_job,
                      process_sfam,
                      sfams,
                      pdbFileStoreID,
                      cores=max_cores)

    #Add jobs to post-process each sfam
    #job.addFollowOnJobFn(map_job, post_process_sfam, sfams, pdbFileStoreID,
    #    cores=max_cores)

    del sdoms
    os.remove(sdoms_file)
Example #17
def get_sifts(pdb, job=None):
    if job is not None:
        work_dir = job.fileStore.getLocalTempDir()
        prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
        in_store = IOStore.get("{}:molmimic-sifts".format(prefix))
        sifts_prefix = "{}/{}.xml.gz".format(pdb[1:3].lower(), pdb.lower())
        sifts_path = os.path.join(work_dir, os.path.basename(sifts_prefix))
        job.log("Saving {}:molmimic-sifts :: {} to {}".format(
            prefix, sifts_prefix, sifts_path))

        try:
            in_store.read_input_file(sifts_prefix, sifts_path)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            raise InvalidSIFTS("Cannot open {}".format(pdb))

        with open(sifts_path) as f:
            yield f

        os.remove(sifts_path)
    else:
        path = os.path.join(
            os.environ.get("PDB_SNAPSHOT",
                           os.path.join(data_path_prefix, "pdb")), "sifts",
            pdb[1:3].lower(), "{}.xml.gz".format(pdb.lower()))
        try:
            with open(path) as f:
                yield f
        except IOError as e:
            url = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/xml/{}.xml.gz".format(
                pdb.lower())
            try:
                sifts = requests.get(url)
                file = StringIO(sifts.content)
                yield file
            except requests.exceptions.RequestException:
                raise InvalidSIFTS("Not found: " + url + " original error: " +
                                   str(e))
            finally:
                try:
                    file.close()
                except NameError:
                    #requests.get failed before the buffer was created
                    pass
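Because get_sifts is a generator, callers iterate over it to borrow the open handle and let the function clean up when iteration resumes. A usage sketch, assuming the handle holds gzip-compressed XML (the files are .xml.gz):

import gzip

for handle in get_sifts("1abc", job=job):
    with gzip.GzipFile(fileobj=handle) as xml:
        sifts_xml = xml.read()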
Example #18
def create_data_loader(job, sfam_id, preemptable=True):
    """Create H5 for Molmimic3dCNN to read

    Note: move this somewhere else
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]

    pdb_path = os.path.join(work_dir, "pdb")
    if not os.path.isdir(pdb_path):
        os.makedirs(pdb_path)

    id_format = re.compile(
        "^([A-Z0-9]{4})_([A-Za-z0-9]+)_sdi([0-9]+)_d([0-9]+)$")

    #Get all keys with the same sfam, but do not download

    in_store = IOStore.get("{}:molmimic-clustered-structures".format(prefix))
    keys = [id_format.match(f).groups() for f in in_store.list_input_directory(sfam_id) \
        if f.endswith(".pdb") and id_format.match(f)]

    pdb_path = os.path.join(PDB_PATH, dataset_name, "by_superfamily",
                            str(int(sfam_id)))
    clusters_file = os.path.join(pdb_path, "{}_nr.fasta".format(int(sfam_id)))

    try:
        pdb, chain, sdi, domain = zip(*[id_format.match(seq.id[:-2]).groups() \
            for seq in SeqIO.parse(clusters_file, "fasta")])
    except ValueError:
        RealtimeLogger.info(
            "Unable to create data loading file for {}.".format(sfam_id))
        return

    domains = pd.DataFrame({
        "pdb": pdb,
        "chain": chain,
        "domNo": domain,
        "sdi": sdi
    })

    data_loader = os.path.join(pdb_path, "{}.h5".format(int(sfam_id)))
    domains.to_hdf(unicode(data_loader), "table", complevel=9, complib="bzip2")
Example #19
def download_pdb(job,
                 sfam_id,
                 pdb,
                 chain,
                 sdi,
                 domNo,
                 extracted=False,
                 work_dir=None):
    if work_dir is None:
        work_dir = os.getcwd()

    in_store = IOStore.get("aws:us-east-1:molmimic-full-structures")

    prefix = "{}/{}/{}_{}_sdi{}_d{}.pdb".format(int(sfam_id), pdb[1:3].lower(),
                                                pdb.upper(), chain, sdi, domNo)
    if extracted:
        prefix += ".extracted"
    #Compute the local name after the suffix is appended so it matches the key
    file = os.path.join(work_dir, os.path.basename(prefix))
    assert in_store.exists(prefix)
    in_store.read_input_file(prefix, file)
    return file
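A usage sketch for download_pdb; the superfamily, sdi, and domain numbers below are hypothetical:

local_pdb = download_pdb(job, 299845, "1XYZ", "A", 123456, 1, work_dir=work_dir)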
Example #20
def download_consurf(pdb=None, chain=None, consurf_path=None):
    if consurf_path is None:
        consurf_path = os.getcwd()

    if (pdb, chain).count(None) == 0:
        pdb_id = pdb.upper()
        if chain != " ": pdb_id += "_" + chain
        consurf_db_file = os.path.join(consurf_path, pdb[1:3].upper(), pdb_id)
        download_all = False
    elif (pdb, chain).count(None) == 2:
        download_all = True
        pdb_id = None
    else:
        raise ValueError("Must specify both pdb and chain, or neither")

    store = IOStore.get("aws:us-east-1:molmimic-consurf")
    pdb_list = os.path.join(consurf_path, "pdbaa_list.nr")

    done_consurf = [os.path.splitext(os.path.basename(k))[0] for k in \
        store.list_input_directory() if k != "pdbaa_list.nr"]

    if not store.exists("pdbaa_list.nr"):
        r = requests.get(
            "http://bental.tau.ac.il/new_ConSurfDB/ConSurfDB_list_feature.zip")
        zip_path = os.path.join(consurf_path, "ConSurfDB_list_feature.zip")
        with open(zip_path, "wb") as zip:
            zip.write(r.content)
        with zipfile.ZipFile(zip_path) as z:
            with z.open("pdbaa_list.nr") as zf, open(pdb_list, 'wb') as f:
                copyfileobj(zf, f)
        store.write_output_file(pdb_list, "pdbaa_list.nr")
        os.remove(zip_path)
    else:
        store.read_input_file("pdbaa_list.nr", pdb_list)

    with open(pdb_list) as f:
        Parallel(n_jobs=-1)(delayed(parse_consurf_line)(line, pdb_id=pdb_id, \
            consurf_path=consurf_path, download_all=download_all, \
            done_consurf=done_consurf) for line in f)

    os.remove(pdb_list)
def merge_inferred_interactome(job, pdbFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    pdb_file = get_file(job, "PDB.h5", pdbFileStoreID)
    sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
        ["sfam_id"]).drop_duplicates()["sfam_id"]
    os.remove(pdb_file)

    skip_sfam = [s for s in sfams if out_store.exists(
        "{0}/{0}.inferred_interactome".format(s))]

    sfam_to_run = [s for s in sfams if out_store.exists(
        "{0}/{0}.observed_interactome".format(s)) and s not in skip_sfam]

    # all_sfam = [os.path.basename(f).split(".") for f in out_store.list_input_directory() if not f.endswith("failed")]
    # skip_sfam = [f[0] for f in all_sfam if f[1] == "inferred_interactome"]
    # sfam_to_run = [f[0] for f in all_sfam if f[1] == "observed_interactome" \
    #     and f[0] not in skip_sfam]
    map_job(job, merge_inferred_interactome_sfam, sfam_to_run)
    job.addFollowOnJobFn(cleanup, sfam_to_run)
Example #22
def get_observed_structural_interactome(job, sfam_id, pdbFileStoreID,
                                        ibisObsFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    ibis_obs_path = get_file(job, "IBIS_obs.h5", ibisObsFileStoreID)
    try:
        df = filter_hdf(ibis_obs_path,
                        "ObsInt",
                        "mol_superfam_id",
                        float(sfam_id),
                        columns=["obs_int_id"])
        int_ids = df["obs_int_id"].drop_duplicates().dropna().astype(int)
        if len(int_ids) == 0:
            job.log("EMPTY OBS SFAM {}".format(sfam_id))
            print "EMPTY OBS SFAM {}".format(sfam_id)
            return
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        job.log("FAILED OBS SFAM {} {}".format(sfam_id, e))
        print "FAILED OBS SFAM {} {}".format(sfam_id, e)
        return

    current_rows = set(
        int(os.path.basename(key)[:-3])
        for key in out_store.list_input_directory("{}/_obsrows".format(
            int(sfam_id))) if not key.endswith("failed"))
    int_ids = list(set(int_ids) - current_rows)
    print "Will run {} ids: {}".format(len(int_ids), int_ids)

    if len(int_ids) > 0:
        #Add jobs for each interaction
        map_job(job, process_observed_interaction, int_ids, sfam_id,
                ibisObsFileStoreID, pdbFileStoreID)

    #Merge converted residues
    job.addFollowOnJobFn(merge_interactome_rows, sfam_id)
Example #23
def compare_sfam(job, sfam, useExisting=False, observed=True):
    work_dir = job.fileStore.getLocalTempDir()
    out_store = IOStore.get("aws:us-east-1:molmimic-missing-structures")
    inf_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
    struc_store = IOStore.get("aws:us-east-1:molmimic-full-structures")

    all_missing = "missing_{}.h5".format(
        "observed" if observed else "inferred")
    all_missing_f = os.path.join(work_dir, all_missing)

    if not useExisting or not out_store.exists(all_missing):
        obs_key = "{sfam}/{sfam}.{type}_interactome".format(
            sfam=sfam, type="observed" if observed else "inferred")
        obs_f = os.path.join(work_dir, os.path.basename(obs_key))

        try:
            inf_store.read_input_file(obs_key, obs_f)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            job.log("Unable to open {} ({})".format(obs_key, e))
            return []

        st = pd.HDFStore(obs_f)
        df = st.get("/table")
        st.close()

        mol_ints = df[[
            "mol_pdb", "mol_chain", "mol_sdi_id", "mol_domNo",
            "mol_superfam_id"
        ]]
        mol_ints = mol_ints.rename(
            columns={
                "mol_pdb": "pdb",
                "mol_chain": "chain",
                "mol_sdi_id": "sdi",
                "mol_domNo": "domNo",
                "mol_superfam_id": "sfam_id"
            })
        int_ints = df[[
            "int_pdb", "int_chain", "int_sdi_id", "int_domNo",
            "int_superfam_id"
        ]]
        int_ints = int_ints.rename(
            columns={
                "int_pdb": "pdb",
                "int_chain": "chain",
                "int_sdi_id": "sdi",
                "int_domNo": "domNo",
                "int_superfam_id": "sfam_id"
            })
        pdbs = pd.concat((mol_ints, int_ints)).drop_duplicates()

    else:
        out_store.read_input_file(all_missing, all_missing_f)
        sfams = pd.read_hdf(all_missing_f, "table")
        ibis_store = IOStore.get("aws:us-east-1:molmimic-ibis")
        pdbf = ibis_store.read_input_file("PDB.h5",
                                          os.path.join(work_dir, "PDB.h5"))
        pdb = pd.read_hdf(os.path.join(work_dir, "PDB.h5"),
                          "merged",
                          columns=["sdi", "sfam_id"]).drop_duplicates()
        pdbs = pd.merge(sfams, pdb, on="sdi")

    missing = []

    for i, row in pdbs.iterrows():
        try:
            if not struc_store.exists(
                    get_key(row.sfam_id, row.pdb, row.chain, row.sdi,
                            row.domNo)):
                raise
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            missing.append(row)
            continue
        RealtimeLogger.info("    Found - {} {} {} {} {}".format(
            row.sfam_id, row.pdb, row.chain, row.sdi, row.domNo))

    if len(missing) > 0 and not useExisting:
        RealtimeLogger.info("{} Missing {} entries".format(sfam, len(missing)))

        missing = pd.DataFrame(missing)

        key = "{}_{}.h5".format("observed" if observed else "inferred",
                                int(sfam))
        path = os.path.join(work_dir, key)
        missing.to_hdf(path, "table")
        out_store.write_output_file(path, key)
    elif len(missing) > 0 and useExisting:
        missing = pd.DataFrame(missing)
        file = "missing_{}_{}.h5".format(
            "observed" if observed else "inferred",
            str(datetime.datetime.now()).replace(" ", "-"))
        outfile = os.path.join(work_dir, file)
        missing.to_hdf(outfile, "table")
        out_store.write_output_file(outfile, file)
def merge_inferred_interactome_sfam(job, sfam_id):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    iostore = IOStore.get("aws:us-east-1:molmimic-interfaces")

    sfam_prefix = "{}/_infrows".format(int(sfam_id))
    sfam_file = "{s}/{s}.inferred_interactome".format(s=int(sfam_id))
    merged_file = job.fileStore.getLocalTempFileName()

    to_delete = []
    done_tables = []
    for table_prefix in iostore.list_input_directory(sfam_prefix):
        if table_prefix.endswith("failed"): continue

        table = int(os.path.basename(os.path.dirname(table_prefix)).replace("Intrac", ""))

        if table in done_tables:
            continue

        RealtimeLogger.info("Running table sfam {}".format(table_prefix))

        try:
            RealtimeLogger.info("Merge {} {}".format(sfam_id, table))
            table_sfam_prefix = merge_table_sfam(job, sfam_id, table)
            if table_sfam_prefix is None:
                RealtimeLogger.info("Merging failed for {} {}".format(sfam_id, table))
                continue
            table_file = os.path.join(work_dir, os.path.basename(table_sfam_prefix))
            iostore.read_input_file(table_sfam_prefix, table_file)
            for df in pd.read_hdf(unicode(table_file), "table", chunksize=1000):
                df.to_hdf(unicode(merged_file), "table", mode="a", append=True, format="table",
                    table=True, complevel=9, complib="bzip2", min_itemsize=1024)
            to_delete.append(table_sfam_prefix)
        except (IOError, ValueError) as e:
            job.log("Failed to read {} bc {}".format(table_prefix, e))
            raise

        try:
            os.remove(table_file)
        except OSError:
            pass

        done_tables.append(table)


    if os.path.isfile(merged_file):
        #Write output file
        iostore.write_output_file(merged_file, sfam_file)

        #Cleanup
        try:
            os.remove(merged_file)
        except OSError:
            pass
        for key in to_delete:
            iostore.remove_file(key)

        #Remove any leftover row files once per superfamily, not once per key
        rows_prefix = "{}/_infrows".format(int(sfam_id))
        if iostore.exists(rows_prefix):
            for f in iostore.list_input_directory(rows_prefix):
                iostore.remove_file(f)

    else:
        failed_file = os.path.join(work_dir, "failed_file")
        with open(failed_file, "w") as f:
            print >>f, "No merged_file present"
        iostore.write_output_file(failed_file, sfam_file+".failed")
        try:
            os.remove(failed_file)
        except OSError:
            pass
Example #25
def merge_interactome_rows(job, sfam_id):
    print "Start merge", sfam_id
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    status = "observed"
    new_cols = ["mol_res", "int_res"]
    resi_prefix = "{}.observed_interactome".format(int(sfam_id))
    data_cols = ["obs_int_id", "mol_sdi", "int_sdi"]

    resi_path = os.path.join(work_dir, resi_prefix)

    #Combine residues into dataframe
    possible_errors = []
    nrows = None
    for nrows, row_prefix in enumerate(
            out_store.list_input_directory("{}/_obsrows/".format(
                int(sfam_id)))):
        if row_prefix.endswith("failed"): continue
        job.log("Running {} {}".format(sfam_id, row_prefix))

        row_file = os.path.join(work_dir, os.path.basename(row_prefix))
        out_store.read_input_file(row_prefix, row_file)

        df = pd.read_hdf(row_file, "table")
        try:
            df.to_hdf(unicode(resi_path),
                      "table",
                      table=True,
                      format="table",
                      append=True,
                      mode="a",
                      data_columns=data_cols,
                      complib="bzip2",
                      complevel=9,
                      min_itemsize=1024)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            import traceback
            tb = traceback.format_exc()
            job.log("Failed writing {}: {} {}".format(sfam_id, resi_path, tb))
            possible_errors.append(tb)
            continue

        try:
            os.remove(row_file)
        except OSError:
            pass

        out_store.remove_file(row_prefix)

    if os.path.isfile(resi_path):
        #Upload to S3
        out_store.write_output_file(
            resi_path, os.path.join(str(int(sfam_id)), resi_prefix))

        #Cleanup
        os.remove(resi_path)
        print "End merge", sfam_id
    elif nrows is not None:
        job.log("Failed merging: {}".format(resi_path))
        print "Failed merging: {}".format(resi_path)
        fail_file = os.path.join(work_dir, "fail_file")
        with open(fail_file, "w") as f:
            f.write("No rows?")
            for e in possible_errors:
                f.write(e)
                f.write("\n")
        out_store.write_output_file(
            fail_file, "{}/{}.failed".format(int(sfam_id), resi_prefix))
        try:
            os.remove(fail_file)
        except OSError:
            pass
def get_table_sfams(job, mol_sfam_id, table, tableInfStoreID, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {} sfam {}".format(table, mol_sfam_id))


    pdbFilePath = get_file(job, "PDB.h5", pdbFileStoreID)
    taxFilePath = get_file(job, "pdb_chain_taxonomy.h5", taxFileStoreID)
    # obsFile = get_file(job, "IBIS_observed.h5", in_store)
    #
    # try:
    #     observed_interactome = filter_hdf_chunks("IBIS_observed.h5", "table", "obs_int_id", mol_sfam_id)
    # except (SystemExit, KeyboardInterrupt):
    #     raise
    # except:
    #     try:
    #         observed_interactome = filter_hdf_chunks("IBIS_observed.h5", "table", "obs_int_id", float(mol_sfam_id))
    #     except (SystemExit, KeyboardInterrupt):
    #         raise
    #     except:
    #         job.log("Failed reading IBIS_observed.h5")
    #         return
    sfamFileStoreID = sfamFileStoreIDs[mol_sfam_id]
    obsFilePath = get_file(job, "{}_obs.h5".format(int(mol_sfam_id)),
        sfamFileStoreID, work_dir=work_dir)

    observed_interactome = pd.read_hdf(obsFilePath, "table")
    RealtimeLogger.info("Obs has {} rows".format(observed_interactome.shape))

    # obsFilePath = os.path.join(work_dir, "{0}.observed_interactome".format(int(mol_sfam_id)))
    # out_store.read_input_file("{0}/{0}.observed_interactome".format(int(mol_sfam_id)), obsPath)

    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), tableInfStoreID)
    # skip_int = set([tuple(map(int, os.path.basename(f)[:-3].split("_"))) for f in out_store.list_input_directory(
    #     "{}/_infrows/Intrac{}".format(int(mol_sfam_id),  table)) if f.endswith(".h5")])
    try:
        inf_int_ids = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table), chunksize=100,
            nbr_superfam_id=mol_sfam_id)
    except (RuntimeError, TypeError):
        job.log("Unable to find sfam {} in table {}, Skipping".format(mol_sfam_id, table))
        return

    #inf_int_ids = set([tuple(row) for row in inf_int_ids.itertuples()])
    #inf_int_ids -= skip_int
    #print "Starting table sfam", mol_sfam_id, inf_int_ids

    #Would this be better to just run as a loop?
    #map_job(job, process_inferred_interaction, list(inf_int_ids), mol_sfam_id, table, tableInfStoreID, pdbFileStoreID, taxFileStoreID)
    try:
        fail_file = os.path.join(work_dir, "fail_file")
        for row in inf_int_ids.iterrows():
            #if tuple(row) in skip_int: continue
            nbr_obs_int_id = row[1]["nbr_obs_int_id"]
            nbr_sdi = row[1]["nbr_sdi_id"]
            mol_sdi = row[1]["mol_sdi_id"]
            inf_int_id_file = "{}_{}_{}_{}".format(row[0], nbr_obs_int_id, nbr_sdi, mol_sdi)

            if out_store.exists("{}/_infrows/Intrac{}/{}.inf.h5".format(int(mol_sfam_id), table, inf_int_id_file)):
                continue

            RealtimeLogger.info("Running {}".format(row))
            out = process_inferred_interaction(
                job,
                row,
                mol_sfam_id,
                table,
                observed_interactome,
                pdbFilePath,
                taxFilePath,
                isrow=True, work_dir=work_dir)
            if out is not None:
                inf_int_id, tb = out
                with open(fail_file, "a") as f:
                    f.write(str(inf_int_id))
                    f.write("\n")
                    f.write(tb)
        if os.path.isfile(fail_file):
            out_store.write_output_file(fail_file, "{}/_infrows/Intrac{}/failed".format(int(mol_sfam_id), table))
            try:
                os.remove(fail_file)
            except OSError:
                pass
        #job.addFollowOnJobFn(merge_table_sfam, mol_sfam_id, table)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        raise
    finally:
        for f in [tableInfPath, obsFilePath, pdbFilePath, taxFilePath]:
            try:
                os.remove(f)
            except OSError:
                pass
def merge_table_sfam(job, sfam_id, table):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    print "Start merge", sfam_id

    status = "inferred (table {}) {}".format(table, sfam_id)
    resi_prefix = "Intrac{}_{}.inf.h5".format(table, int(sfam_id))
    new_cols = ["mol_res"]
    data_cols = ["nbr_obs_int_id", "nbr_sdi", "mol_sdi", "int_sdi"]

    resi_path = os.path.join(work_dir, resi_prefix)

    #Combine residues into dataframe
    possible_errors = []
    to_delete = []
    row_key = "{}/_infrows/Intrac{}".format(int(sfam_id), table)
    RealtimeLogger.info("Merginf rows from {}".format(row_key))
    for row_prefix in out_store.list_input_directory(row_key):
        if not row_prefix.endswith(".inf.h5"): continue
        RealtimeLogger.info("Running  {} {}".format(sfam_id, row_prefix))

        row_file = os.path.join(work_dir, os.path.basename(row_prefix))

        try:
            out_store.read_input_file(row_prefix, row_file)
            df = pd.read_hdf(row_file, "table")
            for col, _ in df.dtypes[df.dtypes == 'int64'].iteritems():
                df[col] = df[col].astype(float)
            df.to_hdf(unicode(resi_path), "table", table=True, format="table", append=True, mode="a",
                data_columns=data_cols, complib="bzip2", complevel=9, min_itemsize=1024)

        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            import traceback
            tb = traceback.format_exc()
            job.log("Failed writing {}: {} {}".format(sfam_id, resi_path, tb))
            possible_errors.append(tb)

        try:
            os.remove(row_file)
        except OSError:
            pass
        #to_delete.append(row_prefix)


    if os.path.isfile(resi_path):
        #Upload to S3
        outfile = "{}/_inftables/{}".format(int(sfam_id), resi_prefix)
        outprefix = "{}/_inftables/{}".format(int(sfam_id), resi_prefix)
        out_store.write_output_file(resi_path, outprefix)

        #Cleanup
        os.remove(resi_path)
        for key in to_delete:
            out_store.remove_file(key)

        print "End merge", sfam_id, table
        return outprefix
    else:
        job.log("Failed merging: {}".format(resi_path))
        print "Failed merging: {}".format(resi_path)
        fail_file = os.path.join(work_dir, "fail_file")
        with open(fail_file, "w") as f:
            f.write("no rows?\n")
            for e in possible_errors:
                f.write(e)
                f.write("\n")
        out_store.write_output_file(fail_file, "{}/_inftables/{}.failed".format(int(sfam_id), resi_prefix))
def process_inferred_interaction(job, inf_int_id, nbr_sfam, table, tableInfStoreID, pdbFileStoreID, taxFileStoreID, isrow=False, work_dir=None):
    if work_dir is None:
        work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    inf_int_id_file = os.path.basename(job.fileStore.getLocalTempFileName())
    try:
        if not isrow:
            inf_int_id_file = "_".join(map(str, inf_int_id))
            inf_int_id, nbr_obs_int_id, nbr_sdi, mol_sdi = inf_int_id
            tableInfFile = get_file(job, "IBIS_inferred_{}.h5".format(table), tableInfStoreID)
            inferred_interfaces = filter_hdf_chunks(tableInfFile, "Intrac{}".format(table),
                nbr_obs_int_id = nbr_obs_int_id,
                nbr_sdi_id = nbr_sdi,
                mol_sdi_id = mol_sdi)
            inferred_interfaces = inferred_interfaces.loc[inf_int_id].to_frame().T
        else:
            inferred_interfaces = inf_int_id[1].to_frame().T
            #inf_int_id = inferred_interfaces[""]
            nbr_obs_int_id = inf_int_id[1]["nbr_obs_int_id"]
            nbr_sdi = inf_int_id[1]["nbr_sdi_id"]
            mol_sdi = inf_int_id[1]["mol_sdi_id"]
            inf_int_id_file = "{}_{}_{}_{}".format(inf_int_id[0], nbr_obs_int_id, nbr_sdi, mol_sdi)

        inferred_interfaces["mol_sdi_id"] = inferred_interfaces["mol_sdi_id"].astype(float)

        if inferred_interfaces.shape[0] == 0:
            return

        pdb_file = get_file(job, "PDB.h5", pdbFileStoreID) if not isrow else pdbFileStoreID
        tax_file = get_file(job, "pdb_chain_taxonomy.h5", taxFileStoreID) if not isrow else taxFileStoreID

        try:
            struct_domains = filter_hdf(
                pdb_file, "merged",
                columns = ["sdi", "domNo", "gi", "pdbId", "chnLett", "from", "to", "sfam_id"],
                sdi = mol_sdi).drop_duplicates()
        except RuntimeError:
            job.log("SDI {} is obsolete".format(mol_sdi))
            return

        pdbs = struct_domains["pdbId"]
        taxa = pd.read_hdf(tax_file, "table", where="pdbId in pdbs")
        struct_domains = pd.merge(struct_domains, taxa, on=["pdbId", "chnLett"], how="left")

        #Add PDB, chain, and sdi information. Inner join to only allow sdi's that are in databases, not obsolete
        inferred_interfaces = pd.merge(inferred_interfaces, struct_domains, how="inner", left_on="mol_sdi_id", right_on="sdi")
        #RealtimeLogger.info("INTERFACE: {}".format(inferred_interfaces.iloc[0]))

        if isrow:
            #RealtimeLogger.info("tableInfStoreID is {} {}".format(type(tableInfStoreID), nbr_obs_int_id))
            #RealtimeLogger.info("{}".format(tableInfStoreID.iloc[0]))
            observed_interactome = tableInfStoreID[tableInfStoreID["obs_int_id"]==nbr_obs_int_id]
            #RealtimeLogger.info("Got obs file from row")

        else:
            obspath = job.fileStore.getLocalTempFileName()
            out_store.read_input_file("{0}/{0}.observed_interactome".format(nbr_sfam), obspath)
            raise RuntimeError("Not a row!")

            try:
                observed_interactome = filter_hdf_chunks(obspath, "table", "obs_int_id", nbr_obs_int_id)
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                try:
                    observed_interactome = filter_hdf_chunks(obspath, "table", "obs_int_id", float(nbr_obs_int_id))
                except (SystemExit, KeyboardInterrupt):
                    raise
                except:
                    job.log("Failed reading {}".format(obspath))
                    raise

        observed_interactome = observed_interactome.rename(columns={
            'mol_pdb': 'nbr_pdb',
            'mol_chain': 'nbr_chain',
            'mol_domNo': 'nbr_domNo',
            'mol_res': 'nbr_res',
            'mol_gi_x': 'nbr_gi',
            #'mol_taxid': 'nbr_taxid',
            #'mol_superfam_id': 'nbr_superfam_id'
            })

        #Add in neighbor information from the observed interactome
        inferred_interfaces["nbr_obs_int_id"] = inferred_interfaces["nbr_obs_int_id"].astype(int)
        inferred_interfaces = pd.merge(inferred_interfaces, observed_interactome,
            how="left", left_on="nbr_obs_int_id", right_on="obs_int_id",
            suffixes=["_inf", "_obs"])
        del observed_interactome
        #RealtimeLogger.info("INTERFACE: {}".format(inferred_interfaces.iloc[0]))

        #Select relevant columns
        if "int_superfam_id" in inferred_interfaces.columns:
            int_superfam_id_col = "int_superfam_id"
        elif "int_superfam_id_inf" in inferred_interfaces.columns:
            int_superfam_id_col = "int_superfam_id_inf"
        elif "int_superfam_id_x" in inferred_interfaces.columns:
            #Suffix not working?
            int_superfam_id_col = "int_superfam_id_x"
        else:
            raise RuntimeError("Merge faled for obs and inf: {}".format(inferred_interfaces.columns))
        try:
            inferred_interfaces = inferred_interfaces[[
                "sdi", "pdbId", "chnLett", "domNo", "from", "to", "resn", "resi",
                "taxid", "gi", "sfam_id",
                "nbr_sdi_id", "nbr_pdb", "nbr_chain", "nbr_domNo", "nbr_res",
                "nbr_taxid", "nbr_gi", "nbr_superfam_id", "nbr_obs_int_id", "nbr_score",
                "int_sdi_id", "int_pdb", "int_chain", "int_domNo", "int_res",
                "int_sdi_from", "int_sdi_to", "int_taxid", "int_gi_x", "int_gi_y",
                int_superfam_id_col]]
        except KeyError as e:
            job.log("Unable to filter df. Columns are: {}. Error is: {}".format(inferred_interfaces.columns, e))
            raise

        #Rename columns
        inferred_interfaces = inferred_interfaces.rename(columns={
            "sdi":"mol_sdi_id",
            "pdbId":"mol_pdb",
            "chnLett":"mol_chain",
            "domNo":"mol_domNo",
            "from":"mol_sdi_from",
            "to":"mol_sdi_to",
            "resn":"mol_resn",
            "resi":"mol_resi",
            "taxid":"mol_taxid",
            "gi":"mol_gi",
            "sfam_id":"mol_superfam_id",
            int_superfam_id_col:"int_superfam_id",
        })

        try:
            inferred_interfaces = inferred_interfaces[ \
                ~inferred_interfaces["mol_sdi_id"].isnull() & \
                ~inferred_interfaces["int_sdi_id"].isnull()]
        except KeyError as e:
            job.log("Unable to drop na. Columns are: {}. Error is: {}".format(inferred_interfaces.columns, e))
            raise

        # taxa = []
        #
        # for row in inferred_interfaces.itertuples():
        #     #Should only be one row, but maybe not
        #     try:
        #         taxas = filter_hdf(tax_file, "table",
        #             pdbId = row.mol_pdb,
        #             chnLett = row.mol_chain)
        #         taxa.append(float(taxas.iloc[0]["taxId"]))
        #     except (KeyboardInterrupt, SystemExit):
        #         raise
        #     else:
        #         taxa.append(np.NaN)
        # taxa = pd.Series(taxa, index=inferred_interfaces.index)
        # inferred_interfaces = inferred_interfaces.assign(mol_taxid=taxa)

        try:
            resi = pd.Series([decode_residues(job, row.mol_pdb, row.mol_chain, row.mol_resi, row) \
                for row in inferred_interfaces.itertuples()], index=inferred_interfaces.index)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            import traceback
            tb = traceback.format_exc()
            job.log("FAILED {}".format(tb))
            print tb
            #A stray raise made this fallback unreachable; fall back to the
            #raw residue strings when decoding fails
            resi = inferred_interfaces["mol_resi"].copy()


        inferred_interfaces = inferred_interfaces.assign(mol_res=resi)
        del resi
        del inferred_interfaces["mol_resi"]
        del inferred_interfaces["mol_resn"]

        str_cols = ["mol_pdb", "mol_chain", "mol_res",
                    "int_pdb",  "int_chain", "int_res",
                    "nbr_pdb", "nbr_chain", "nbr_res"]

        #RealtimeLogger.info("INTERFACE: {}".format(inferred_interfaces.iloc[0]))

        for col in inferred_interfaces.columns:
            inferred_interfaces[col] = inferred_interfaces[col].astype(
                str if col in str_cols else float)

        mol_sfams = inferred_interfaces["mol_superfam_id"].drop_duplicates()
        if len(mol_sfams) == 0:
            return
        elif len(mol_sfams) == 1:
            #Write to HDF file
            df_file = job.fileStore.getLocalTempFileName()
            inferred_interfaces.to_hdf(unicode(df_file), "table", format="table",
                table=True, complevel=9, complib="bzip2", min_itemsize=1024,
                data_columns=["nbr_obs_int_id", "nbr_sdi_id", "mol_sdi_id", "int_sdi_id"])
            job.log("Wrote "+df_file)
            df_files = [df_file]

            #Add ibis info into out store
            out_store.write_output_file(df_file, "{}/_infrows/Intrac{}/{}.inf.h5".format(int(mol_sfams.iloc[0]), table, inf_int_id_file))
        else:
            df_files = []
            for i, inf_row in inferred_interfaces.iterrows():
                mol_sfam = inf_row["mol_superfam_id"]
                inf_row = inf_row.to_frame().T
                for col in inf_row.columns:
                    inf_row[col] = inf_row[col].astype(str if col in str_cols else float)

                #Write to HDF file
                df_file = job.fileStore.getLocalTempFileName()
                inf_row.to_hdf(unicode(df_file), "table", format="table",
                    table=True, complevel=9, complib="bzip2", min_itemsize=1024,
                    data_columns=["nbr_obs_int_id", "nbr_sdi_id", "mol_sdi_id", "int_sdi_id"])
                job.log("Wrote "+df_file)
                df_files.append(df_file)

                #Add ibis info into out store
                out_store.write_output_file(df_file, "{}/_infrows/Intrac{}/{}.inf.h5".format(int(mol_sfam), table, inf_int_id_file))


    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        import traceback
        tb = traceback.format_exc()
        job.log("FAILED {} {}".format(nbr_sfam, tb))

        if not isrow:
            fail_file = os.path.join(work_dir, "fail_file")
            with open(fail_file, "w") as f:
                f.write(str(inf_int_id))
                f.write("\n")
                f.write(tb)
            out_store.write_output_file(fail_file, "{}/_infrows/Intrac{}/{}.failed".format(int(nbr_sfam), table, inf_int_id_file))
            try:
                os.remove(fail_file)
            except OSError:
                pass
        else:
            return inf_int_id, tb


        try:
            for f in (df_file, pdb_file, tableInfFile):
                try:
                    os.remove(f)
                except OSError:
                    pass
        except:
            pass
    finally:
        try:
            files = df_files if isrow else df_files+[pdb_file, tableInfFile]
            for f in files:
                try:
                    os.remove(f)
                except OSError:
                    pass
        except:
            pass
def calculate_features(job,
                       pdb_or_key,
                       sfam_id=None,
                       chain=None,
                       sdi=None,
                       domNo=None,
                       work_dir=None):
    from molmimic.common.featurizer import ProteinFeaturizer

    if work_dir is None and job is not None:
        work_dir = job.fileStore.getLocalTempDir()

    if work_dir is None or not os.path.isdir(work_dir):
        work_dir = os.getcwd()

    in_store = IOStore.get("aws:us-east-1:molmimic-full-structures")
    out_store = IOStore.get("aws:us-east-1:molmimic-structure-features")

    if [sfam_id, chain, sdi, domNo].count(None) == 0:
        #pdb_or_key is pdb
        pdb = pdb_or_key
        key = "{}/{}/{}_{}_sdi{}_d{}".format(int(sfam_id),
                                             pdb.lower()[1:3], pdb.upper(),
                                             chain, sdi, domNo)
    else:
        #pdb_or_key is key
        assert pdb_or_key.count("_") == 3
        key = os.path.splitext(pdb_or_key)[0]
        pdb, chain, sdi, domNo = os.path.basename(key).split("_")
        sdi, domNo = sdi[3:], domNo[1:]

    try:
        pdb_path = os.path.join(work_dir, os.path.basename(key) + ".pdb")
        in_store.read_input_file(key + ".pdb", pdb_path)

        s = ProteinFeaturizer(pdb_path,
                              pdb,
                              chain,
                              sdi=sdi,
                              domNo=domNo,
                              work_dir=work_dir,
                              job=job)

        _, atom_features = s.calculate_flat_features()
        RealtimeLogger.info("Finished atom features")
        _, residue_features = s.calculate_flat_features(course_grained=True)
        RealtimeLogger.info("Finished residue features")
        graph_features = s.calculate_graph()
        RealtimeLogger.info("Finished edge features")

        out_store.write_output_file(atom_features, key + "_atom.npy")
        out_store.write_output_file(residue_features, key + "_residue.npy")
        out_store.write_output_file(graph_features, key + "_edges.gz")

        for f in (pdb_path, atom_features, residue_features, graph_features):
            try:
                os.remove(f)
            except OSError:
                pass
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        #A stray raise made the failure-recording path below unreachable
        fail_key = "{}_error.fail".format(key)
        fail_file = os.path.join(work_dir, os.path.basename(key))
        with open(fail_file, "w") as f:
            f.write("{}\n".format(e))
        out_store.write_output_file(fail_file, fail_key)
        os.remove(fail_file)
Example #30
def process_observed_interaction(job, int_id, sfam_id, ibisFileStoreID,
                                 pdbFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    mmdb_path = get_file(job, "PDB.h5", pdbFileStoreID)

    ibis_path = get_file(job, "IBIS_obs.h5", ibisFileStoreID)
    row = filter_hdf(ibis_path, "ObsInt", "obs_int_id", int_id)

    try:
        #Read in face1 residues
        face1 = filter_hdf(unicode(ibis_path), "MolResFace", "obs_int_id",
                           int_id)
        face1.columns = ["obs_int_id", "mol_res"]
        print face1.shape
        #Keep entries from current CDD
        row = pd.merge(row, face1, how="left", on="obs_int_id")
        del face1

        #Read in face2 residues and convert gzipped asn1 into res numbers
        face2 = filter_hdf(unicode(ibis_path), "IntResFace", "obs_int_id",
                           int_id)
        face2.columns = ["obs_int_id", "int_res"]
        print face2.shape

        #Keep entries from current CDD
        row = pd.merge(row, face2, how="left", on="obs_int_id")
        print row.shape
        del face2

        try:
            st_domain_mol = filter_hdf(unicode(mmdb_path), "StructuralDomains",
                                       "sdi", row.iloc[0]['mol_sdi_id'])
            st_domain_mol.columns = [
                'mol_sdi_id', 'mol_domNo', 'mol_gi', 'mol_pdb', 'mol_chain',
                'mol_sdi_from', 'mol_sdi_to'
            ]
            row = pd.merge(row, st_domain_mol, how="left", on="mol_sdi_id")
            del st_domain_mol

            st_domain_int = filter_hdf(unicode(mmdb_path), "StructuralDomains",
                                       "sdi", row.iloc[0]['int_sdi_id'])
            st_domain_int.columns = [
                'int_sdi_id', 'int_domNo', 'int_gi', 'int_pdb', 'int_chain',
                'int_sdi_from', 'int_sdi_to'
            ]
            row = pd.merge(row, st_domain_int, how="left", on="int_sdi_id")
            del st_domain_int
        except TypeError:
            #SDIs don't exist; must be obsolete
            print "Done row", int_id
            return

        updated_resi = {"mol_res": [], "int_res": []}

        for resi in row.itertuples():
            try:
                updated_resi["mol_res"].append(
                    decode_residues(job, resi.mol_pdb, resi.mol_chain,
                                    resi.mol_res, resi))
                updated_resi["int_res"].append(
                    decode_residues(job, resi.int_pdb, resi.int_chain,
                                    resi.int_res, resi))
            except InvalidSIFTS:
                #This row failed converting binary to string; skip it
                updated_resi["mol_res"].append(np.NaN)
                updated_resi["int_res"].append(np.NaN)
                continue

        if any(pd.notnull(updated_resi["mol_res"])):
            row = row.assign(**updated_resi)
            row = row.dropna()
        else:
            #This entire interaction failed; return None
            return None

        path = "{}.h5".format(int_id)
        row.to_hdf(path,
                   "table",
                   table=True,
                   format="table",
                   complib="bzip2",
                   complevel=9,
                   min_itemsize=1024)
        out_store.write_output_file(
            path, "{}/_obsrows/{}".format(int(sfam_id), path))
        print "Done row", int_id
        try:
            os.remove(fail_file)
        except OSError:
            pass
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        import traceback
        tb = traceback.format_exc()
        job.log("FAILED {} {}".format(int_id, e, tb))
        fail_file = os.path.join(work_dir, "fail_file")
        with open(fail_file, "w") as f:
            f.write(str(e))
            f.write(str(tb))
        out_store.write_output_file(
            fail_file, "{}/_obsrows/{}.failed".format(int(sfam_id), int_id))
        try:
            os.remove(fail_file)
        except OSError:
            pass