# NOTE: These examples are excerpted from a larger module. The imports and
# module-level globals below are reconstructed so the snippets are
# self-contained; placeholder values are marked as assumptions.
import json
import os
import pickle
import re
import shutil
import subprocess
import time
from functools import partial
from multiprocessing import Pool
from urllib.request import urlretrieve

import coreapi
from Bio.Alphabet import IUPAC  # legacy; requires Biopython < 1.78
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from prody.database import pfam, uniprot
from tqdm import tqdm

cwd = os.getcwd()
out_dir = "."                          # placeholder; set by the real module
files_dir = "./files"                  # placeholder; set by the real module
uniprot_file_ext = ".uniprot.json"     # assumed suffix
pfam_file_ext = ".pfam.json"           # assumed suffix
profiles_file_ext = ".profiles.json"   # assumed suffix
jaspar_url = "http://jaspar.genereg.net/"  # assumed JASPAR base URL
bar_format = "{l_bar}{bar:20}{r_bar}"      # assumed tqdm bar format
client = coreapi.Client()
codec = coreapi.codecs.CoreJSONCodec()
# Jglobals, hmmscan and hmmalign are project-local helpers (not shown), as
# are __load_CisBP_models, __load_JASPAR_files_n_models and
# infer_SeqRecord_profiles, which are referenced below.


def __make_seq_file(seq_record, file_name=".seq.fa"):

    # Remove the seq file if it already exists
    if os.path.exists(file_name):
        os.remove(file_name)

    # Write
    Jglobals.write(file_name, seq_record.format("fasta"))
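
A minimal usage sketch for the helper above, with a hypothetical accession and sequence (assumes Biopython; no alphabet is passed, so it works on both old and new Biopython):

record = SeqRecord(Seq("MEEVGAQV"), id="P00001", name="P00001",
                   description="P00001")
__make_seq_file(record, ".seq.fa")
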
Example #2
def __get_Pfam_alignments(taxon, out_dir=out_dir):

    # Skip if Pfam JSON file already exists
    pfam_json_file = os.path.join(out_dir, taxon + pfam_file_ext)
    if not os.path.exists(pfam_json_file):

        # Change dir
        os.chdir(out_dir)

        # Initialize
        pfams = {}
        seq_file = ".seq.fasta"
        hmm_db = os.path.join("pfam", "All.hmm")
        uniprot_json_file = taxon + uniprot_file_ext

        # Load JSON file
        with open(uniprot_json_file) as f:
            uniaccs = json.load(f)

        # For each uniacc...
        for u in uniaccs:

            # Initialize
            pfams.setdefault(u, [])

            # Make seq file
            seq = Seq(uniaccs[u][1], IUPAC.protein)
            record = SeqRecord(seq, id=u, name=u, description=u)
            __make_seq_file(record, seq_file)

            # For each DBD...
            for pfam_id_std, start, end, evalue in hmmscan(
                    seq_file, hmm_db, non_overlapping_domains=True):

                # Initialize
                hmm_file = os.path.join("pfam", "%s.hmm" % pfam_id_std)

                # Make seq file
                sub_seq = seq[start:end]
                record = SeqRecord(sub_seq, id=u, name=u, description=u)
                __make_seq_file(record, seq_file)

                # Add DBDs
                alignment = hmmalign(seq_file, hmm_file)
                pfams[u].append(
                    (pfam_id_std, alignment, start + 1, end, evalue))

        # Write
        Jglobals.write(pfam_json_file,
                       json.dumps(pfams, sort_keys=True, indent=4))

        # Remove seq file
        if os.path.exists(seq_file):
            os.remove(seq_file)

        # Change dir
        os.chdir(cwd)
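
The project-local hmmscan and hmmalign helpers used above are not shown on this page; from the call sites, their assumed contracts look roughly like this (a sketch of the expected interface, not the actual implementation):

def hmmscan(seq_file, hmm_db, non_overlapping_domains=False):
    """Yield (pfam_id_std, start, end, evalue) tuples, one per domain hit
    of seq_file against hmm_db; 0-based, end-exclusive coordinates are
    assumed, since the caller slices seq[start:end]."""
    raise NotImplementedError


def hmmalign(seq_file, hmm_file):
    """Return the alignment of the sequence in seq_file against the
    profile HMM in hmm_file (e.g. as a Stockholm-formatted string)."""
    raise NotImplementedError
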
Example #3
def __format_BLAST_database(taxon, out_dir=out_dir):

    # Skip if taxon FASTA file already exists
    fasta_file = os.path.join(out_dir, "%s.fa" % taxon)
    if not os.path.exists(fasta_file):

        # Load JSON file (unlike other helpers here, this function does not
        # chdir into out_dir, so build the full path explicitly)
        uniprot_json_file = os.path.join(out_dir, taxon + uniprot_file_ext)
        with open(uniprot_json_file) as f:
            uniaccs = json.load(f)

        # For each UniProt Accession...
        for uniacc in sorted(uniaccs):
            seq = uniaccs[uniacc][1]
            Jglobals.write(fasta_file, ">%s\n%s" % (uniacc, seq))

        # Make BLAST+ database
        cmd = "makeblastdb -in %s -dbtype prot" % fasta_file
        process = subprocess.run(cmd,
                                 shell=True,
                                 stdout=subprocess.DEVNULL,
                                 stderr=subprocess.DEVNULL)
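
Once formatted, the database can be searched with blastp; a minimal sketch (the helper name, query file, and tabular output format are illustrative):

def __run_blastp(query_fasta, db_fasta, out_file="blast.out.tsv"):
    # Tabular output (-outfmt 6): qseqid sseqid pident length mismatch
    # gapopen qstart qend sstart send evalue bitscore
    cmd = "blastp -query %s -db %s -outfmt 6 -out %s" % (
        query_fasta, db_fasta, out_file)
    subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL)

Example #4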
def infer_profiles(fasta_file, dummy_dir="/tmp/", files_dir=files_dir,
    output_file=None, threads=1, latest=False, n=5, taxons=Jglobals.taxons):

    # Initialize
    base_name = os.path.basename(__file__)
    pid = os.getpid()

    # Load data
    cisbp = __load_CisBP_models(files_dir)
    # jaspar = __load_JASPAR_files_n_models(files_dir, models_dir, taxons)
    jaspar = __load_JASPAR_files_n_models(files_dir, taxons)

    # Create dummy dir
    dummy_dir = os.path.join(dummy_dir, "%s.%s" % (base_name, pid))
    dummy_file = os.path.join(dummy_dir, "inferred_profiles.tsv")
    if not os.path.exists(dummy_dir):
        os.makedirs(dummy_dir)

    # Get sequences as SeqRecords
    # Note: https://biopython.org/wiki/SeqRecord
    seq_records = []
    for seq_record in Jglobals.parse_fasta_file(fasta_file):
        seq_records.append(seq_record)

    # Write
    # columns = ["Query", "TF Name", "TF Matrix", "E-value", "Query Start-End",
    #     "TF Start-End", "DBD %ID", "Cis-BP", "JASPAR"]
    columns = ["Query", "TF Name", "TF Matrix", "E-value", "Query Start-End",
        "TF Start-End", "DBD %ID"]
    Jglobals.write(dummy_file, "\t".join(columns))

    # Infer SeqRecord profiles
    kwargs = {"total": len(seq_records), "bar_format": bar_format}
    pool = Pool(min([threads, len(seq_records)]))
    p = partial(infer_SeqRecord_profiles, cisbp=cisbp, dummy_dir=dummy_dir,
        files_dir=files_dir, jaspar=jaspar, latest=latest, n=n, taxons=taxons)
    for inferences in tqdm(pool.imap(p, seq_records), **kwargs):
        for inference in inferences:
            Jglobals.write(dummy_file, "\t".join(map(str, inference)))
    pool.close()
    pool.join()

    # Write
    if output_file:
        shutil.copy(dummy_file, output_file)
    else:
        with open(dummy_file) as f:
            # For each line...
            for line in f:
                Jglobals.write(None, line.strip("\n"))

    # Remove dummy dir
    shutil.rmtree(dummy_dir)
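
Called as a library function, the entry point above might be used like this (file names are illustrative):

infer_profiles("query_TFs.fa", dummy_dir="/tmp/", threads=4,
               output_file="inferred_profiles.tsv")
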
Example #5
def __get_profile_info(taxon, out_dir=out_dir):

    # Skip if taxon profiles JSON file already exists
    profiles_json_file = os.path.join(out_dir, taxon + profiles_file_ext)
    if not os.path.exists(profiles_json_file):

        # Initialize
        profiles = {}
        url = os.path.join(jaspar_url, "api", "v1", "taxon", taxon)
        response = client.get(url)
        json_obj = json.loads(codec.encode(response))

        # While there are more pages...
        while json_obj["next"] is not None:

            # For each profile...
            for profile in json_obj["results"]:

                # Add profiles from the CORE collection...
                if profile["collection"] == "CORE":
                    profiles.setdefault(profile["matrix_id"], profile["name"])

            # Go to next page
            response = client.get(json_obj["next"])
            json_obj = json.loads(codec.encode(response))

        # Do last page
        for profile in json_obj["results"]:

            # Add profiles from the CORE collection...
            if profile["collection"] == "CORE":
                profiles.setdefault(profile["matrix_id"], profile["name"])

        # Write
        Jglobals.write(profiles_json_file,
                       json.dumps(profiles, sort_keys=True, indent=4))
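
The pagination pattern above (follow "next" until it is null) is how the JASPAR REST API is walked; for comparison, the same loop written with the requests library, folding the last page into the loop condition so the per-profile block is not duplicated (a sketch; assumes the public JASPAR endpoint):

import requests

def fetch_core_profiles(taxon, base_url="https://jaspar.genereg.net/api/v1"):
    profiles = {}
    url = "%s/taxon/%s/" % (base_url, taxon)
    while url is not None:
        json_obj = requests.get(url).json()
        for profile in json_obj["results"]:
            if profile["collection"] == "CORE":
                profiles.setdefault(profile["matrix_id"], profile["name"])
        url = json_obj["next"]
    return profiles
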
Example #6
def __download_UniProt_sequences(taxon, out_dir=out_dir):

    # Initialize
    faulty_profiles = {
        "MA1826.1": ["B4FU91"],
    }
    faulty_sequences = {
        "B9GPL8": [
            "MEEVGAQVAAPIFIHEALSSRYCDMTSMAKKHDLSYQSPNSQLQQHQFLQASREKNWNSK",
            "AWDWDSVDDDGLGLNLGGSLTSVEEPVSRPNKRVRSGSPGNGSYPMCQVDNCKEDLSKAK",
            "DYHRRHKVCQVHSKATKALVGKQMQRFCQQCSRFHPLTEFDEGKRSCRRRLAGHNRRRRK",
            "TQPEDVTSRLLLPGNPDMNNNGNLDIVNLLTALARSQGKTYLPMIDFYVPPFVLTNCPTV",
            "PDKDQLIQILNKINSLPLPMDLAAKLSNIASLNVKNPNQPYLGHQNRLNGTASSPSTNDL",
            "LAVLSTTLAASAPDALAILSQRSSQSSDNDKSKLPGPNQVTVPHLQKRSNVEFPAVGVER",
            "ISRCYESPAEDSDYQIQESRPNLPLQLFSSSPENESRQKPASSGKYFSSDSSNPIEERSP",
            "SSSPPVVQKLFPLQSTAETMKSEKMSVSREVNANVEGDRSHGCVLPLELFRGPNREPDHS",
            "SFQSFPYRGGYTSSSGSDHSPSSQNSDPQDRTGRIIFKLFDKDPSHFPGTLRTKIYNWLS",
            "NSPSEMESYIRPGCVVLSVYLSMPSASWEQLERNLLQLVDSLVQDSDSDLWRSGRFLLNT",
            "GRQLASHKDGKVRLCKSWRTWSSPELILVSPVAVIGGQETSLQLKGRNLTGPGTKIHCTY",
            "MGGYTSKEVTDSSSPGSMYDEINVGGFKIHGPSPSILGRCFIEVENGFKGNSFPVIIADA",
            "SICKELRLLESEFDENAVVSNIVSEEQTRDLGRPRSREEVMHFLNELGWLFQRKSMPSMH",
            "EAPDYSLNRFKFLLIFSVERDYCVLVKTILDMLVERNTCRDELSKEHLEMLYEIQLLNRS",
            "VKRRCRKMADLLIHYSIIGGDNSSRTYIFPPNVGGPGGITPLHLAACASGSDGLVDALTN",
            "DPHEIGLSCWNSVLDANGLSPYAYAVMTKNHSYNLLVARKLADKRNGQISVAIGNEIEQA",
            "ALEQEHVTISQFQRERKSCAKCASVAAKMHGRFLGSQGLLQRPYVHSMLAIAAVCVCVCL",
            "FFRGAPDIGLVAPFKWENLNYGTI"
        ]
    }

    # Change dir
    os.chdir(out_dir)

    # Skip if pickle file already exists
    pickle_file = ".%s.uniaccs.pickle" % taxon
    if not os.path.exists(pickle_file):

        # Initialize
        uniaccs = {}

        # Load JSON file
        profiles_json_file = taxon + profiles_file_ext
        with open(profiles_json_file) as f:
            profiles = json.load(f)

        # For each profile...
        for profile in sorted(profiles):

            # Get profile detailed info
            url = os.path.join(jaspar_url, "api", "v1", "matrix", profile)
            response = client.get(url)
            json_obj = json.loads(codec.encode(response))

            # Fix faulty profiles
            if json_obj["matrix_id"] in faulty_profiles:
                json_obj["uniprot_ids"] = faulty_profiles[
                    json_obj["matrix_id"]]

            # For each UniProt Accession...
            for uniacc in json_obj["uniprot_ids"]:

                # Skip empty accessions
                if uniacc == "":
                    continue

                # Initialize
                uniacc = uniacc.strip(" ")
                uniaccs.setdefault(uniacc, [[], None])

                # Add uniacc
                if profile not in uniaccs[uniacc][0]:
                    uniaccs[uniacc][0].append(profile)

        # Write pickle file
        with open(pickle_file, "wb") as f:
            pickle.dump(uniaccs, f)

    # Skip if taxon uniprot JSON file already exists
    uniprot_json_file = taxon + uniprot_file_ext
    if not os.path.exists(uniprot_json_file):

        # Load pickle file
        with open(pickle_file, "rb") as f:
            uniaccs = pickle.load(f)

        # For each UniProt Accession...
        for uniacc in uniaccs:

            # Fix faulty sequences
            if uniacc in faulty_sequences:
                uniaccs[uniacc][1] = "".join(faulty_sequences[uniacc])
                continue

            # Get UniProt sequence (ProDy's queryUniprot indexes repeated
            # XML tags, hence the unusual "sequence   0" key)
            u = uniprot.queryUniprot(uniacc)
            uniaccs[uniacc][1] = "".join(u["sequence   0"].split("\n"))

        # Write
        Jglobals.write(uniprot_json_file,
                       json.dumps(uniaccs, sort_keys=True, indent=4))

    # Change dir
    os.chdir(cwd)
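
For reference, the per-accession lookup can also be done against UniProt's REST API directly, without ProDy (a sketch; uses the rest.uniprot.org FASTA endpoint):

from urllib.request import urlopen

def fetch_uniprot_sequence(uniacc):
    url = "https://rest.uniprot.org/uniprotkb/%s.fasta" % uniacc
    fasta = urlopen(url).read().decode("utf-8")
    # Drop the ">..." header line and join the sequence lines
    return "".join(fasta.split("\n")[1:])
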
Example #7
def __download_Pfam_DBD_HMMs(out_dir=out_dir):

    # Skip if Pfam file already exists
    json_file = os.path.join(out_dir, "pfam.json")
    if not os.path.exists(json_file):

        # Initialize
        pfams = {}
        pfam_ids = set()
        url = "http://cisbp.ccbr.utoronto.ca/data/2.00/" + \
              "DataFiles/Bulk_downloads/EntireDataset/"
        cisbp_file = "TF_Information_all_motifs.txt.zip"

        # Create Pfam dir
        pfam_dir = os.path.join(out_dir, "pfam")
        if not os.path.isdir(pfam_dir):
            os.makedirs(pfam_dir)

        # Change dir
        os.chdir(pfam_dir)

        # Skip if Cis-BP file already exists
        if not os.path.exists(cisbp_file):
            urlretrieve(os.path.join(url, cisbp_file), cisbp_file)

        # Get DBD/cut-off pairs
        cmd = "unzip -p %s | cut -f 11 | sort | uniq | grep -v DBDs" % \
            cisbp_file
        process = subprocess.run(cmd,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

        # For each line...
        for line in process.stdout.decode("utf-8").split("\n"):

            # For each Pfam ID...
            for pfam_id in line.split(","):

                # Skip if not Pfam ID
                if pfam_id == "UNKNOWN" or pfam_id == "":
                    continue

                # Add Pfam ID
                pfam_ids.add(pfam_id)

        # For each Pfam ID...
        for pfam_id in pfam_ids:

            # Fetch MSA from Pfam
            attempts = 0
            while attempts < 5:
                try:
                    msa_file = pfam.fetchPfamMSA(pfam_id, alignment="seed")
                    break
                except Exception:
                    # i.e. try again in 5 seconds
                    attempts += 1
                    time.sleep(5)

            # For each line...
            for line in Jglobals.parse_file(msa_file):

                m = re.search(r"^#=GF\sID\s+(\S+)$", line)
                if m:
                    pfam_id_std = m.group(1)

                m = re.search(r"^#=GF\sAC\s+(PF\d{5})\.\d+$", line)
                if m:
                    pfam_ac = m.group(1)
                    break

            # HMM build
            hmm_file = "%s.hmm" % pfam_id_std
            cmd = "hmmbuild %s %s" % (hmm_file, msa_file)
            process = subprocess.run(cmd,
                                     shell=True,
                                     stdout=subprocess.DEVNULL,
                                     stderr=subprocess.DEVNULL)

            # HMM press
            cmd = "hmmpress -f %s" % hmm_file
            process = subprocess.run(cmd,
                                     shell=True,
                                     stdout=subprocess.DEVNULL,
                                     stderr=subprocess.DEVNULL)

            # Add Pfam
            pfams.setdefault(pfam_ac, pfam_id_std)

            # Remove MSA file
            os.remove(msa_file)

        # Skip if HMM database of all DBDs already exists
        hmm_db = "All.hmm"
        if not os.path.exists(hmm_db):

            # For each HMM file...
            for hmm_file in os.listdir("."):

                # Skip if not HMM file
                if not hmm_file.endswith(".hmm"): continue

                # Add HMM to database
                for line in Jglobals.parse_file(hmm_file):
                    Jglobals.write(hmm_db, line)

            # HMM press
            cmd = "hmmpress -f %s" % hmm_db
            process = subprocess.run(cmd,
                                     shell=True,
                                     stdout=subprocess.DEVNULL,
                                     stderr=subprocess.DEVNULL)

        # Remove Cis-BP file
        if os.path.exists(cisbp_file):
            os.remove(cisbp_file)

        # Write
        Jglobals.write(json_file, json.dumps(pfams, sort_keys=True, indent=4))

        # Change dir
        os.chdir(cwd)
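
The resulting All.hmm database is what the hmmscan helper in Example #2 is expected to search. A minimal sketch of the underlying HMMER call (the wrapper shown here is hypothetical; --domtblout writes one parseable row per domain hit):

def __run_hmmscan(seq_file, hmm_db="All.hmm", out_file="hmmscan.domtab"):
    cmd = "hmmscan --domtblout %s %s %s" % (out_file, hmm_db, seq_file)
    subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL)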