コード例 #1
0
ファイル: countgenbank.py プロジェクト: bcpd/EdwardsLab
def count_feats(gbkf, verbose=False):
    """
    Tally how many features of each type appear in a GenBank file.

    :param gbkf: GenBank file, handed to genbank_seqio
    :param verbose: when true, announce the file through message() before reading
    :return: dict mapping each feature type to the number of features of that type
    """
    if verbose:
        message(f"Reading {gbkf}", "BLUE")

    tallies = {}
    for record in genbank_seqio(gbkf):
        for feature in record.features:
            kind = feature.type
            if kind in tallies:
                tallies[kind] += 1
            else:
                tallies[kind] = 1
    return tallies
コード例 #2
0
ファイル: run_hmmer.py プロジェクト: bcpd/EdwardsLab
def hmmsearch_print_then_parse(gbkf, hmmf):
    """
    Write the protein of every CDS feature to a temporary FASTA file, run
    hmmsearch against that file, and print a summary of the parsed hits.

    The protein is taken from the feature's 'translation' qualifier when it is
    present; otherwise the feature is extracted from the record and translated.
    The temporary file is always removed before the function returns or exits.

    :param gbkf: GenBank file, handed to genbank_seqio
    :param hmmf: HMM file, handed to hmmsearch
    :return: None; the summary line is printed to stdout
    :raises SystemExit: with status -1 if hmmsearch cannot be started
    """
    # Imported here so this function does not rely on a module-level `import os`.
    import os

    # delete=False: hmmsearch opens the file by name after we have closed it,
    # so we remove it ourselves in the finally clause below.
    aaout = NamedTemporaryFile(mode='w+t', delete=False)
    try:
        with aaout:
            sys.stderr.write(f"Writing the amino acids to {aaout.name}\n")
            for seq in genbank_seqio(gbkf):
                for feat in seq.features:
                    if feat.type != 'CDS':
                        continue
                    if 'translation' in feat.qualifiers:
                        aa = feat.qualifiers['translation'][0]
                    else:
                        aa = str(feat.extract(seq).translate().seq)
                    myid = feature_id(seq, feat)
                    aaout.write(f">{myid}\n{aa}\n")

        sys.stderr.write("Searching\n")
        try:
            search = subprocess.Popen(
                [
                    "hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
                    '--noali', hmmf, aaout.name
                ],
                stdout=subprocess.PIPE)
        except OSError as e:
            # Popen raises OSError (e.g. FileNotFoundError) when the program
            # cannot be launched; it never raises CalledProcessError.
            sys.stderr.write(f"Error running hmmsearch:\n{e}\n")
            sys.exit(-1)

        sys.stderr.write("Parsing\n")
        hmmresult = search.communicate()[0]
        if search.returncode != 0:
            # Keep going as before, but make the failure visible.
            sys.stderr.write(
                f"hmmsearch exited with status {search.returncode}\n")
    finally:
        # Runs on normal return, on exceptions and on sys.exit().
        os.remove(aaout.name)

    results = SearchIO.parse(StringIO(hmmresult.decode()), 'hmmer3-text')
    allhits = {}
    hitcount = 0
    rescount = 0
    for res in results:
        allhits[res.id] = {}
        rescount += 1
        for hit in res:
            allhits[res.id][hit.id] = hit.evalue
            hitcount += 1

    print(
        f"Using hmmsearch and tempfiles there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
コード例 #3
0
ファイル: run_hmmer.py プロジェクト: bcpd/EdwardsLab
def stream_hmmsearch(gbkf, hmmf):
    """
    NOTE: THIS DOES NOT WORK!!

    hmmsearch cannot rewind its sequence input, so the sequences cannot be
    streamed to it on stdin. Use hmmscan (slow) or a temporary file (fast!)
    instead.

    For each record, the CDS proteins are joined into FASTA text and piped to
    one hmmsearch process; a summary of the parsed hits is printed at the end.

    :param gbkf: GenBank file, handed to genbank_seqio
    :param hmmf: HMM file, handed to hmmsearch
    :return: None; the summary line is printed to stdout
    """
    command = [
        "hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
        '--noali', hmmf, '-'
    ]
    allhits = {}
    rescount = 0
    hitcount = 0

    for record in genbank_seqio(gbkf):
        # Build one FASTA entry per CDS feature of this record.
        fasta_entries = []
        for cds in record.features:
            if cds.type == 'CDS':
                if 'translation' in cds.qualifiers:
                    protein = cds.qualifiers['translation'][0]
                else:
                    protein = str(cds.extract(record).translate().seq)
                entry_id = feature_id(record, cds)
                fasta_entries.append(f">{entry_id}\n{protein}")

        # One hmmsearch process per record, fed on stdin.
        proc = subprocess.Popen(command,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        fasta_bytes = "\n".join(fasta_entries).encode()
        stdout_bytes = proc.communicate(input=fasta_bytes)[0]

        report = StringIO(stdout_bytes.decode())
        for query in SearchIO.parse(report, 'hmmer3-text'):
            rescount += 1
            evalues = allhits[query.id] = {}
            for hit in query:
                evalues[hit.id] = hit.evalue
                hitcount += 1

    print(
        f"Using hmmsearch and streaming all at once there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
コード例 #4
0
ファイル: run_hmmer.py プロジェクト: bcpd/EdwardsLab
def run_hmmscan_aao(gbkf, hmmf):
    """
    Run hmmscan once per record, piping that record's CDS proteins to it on
    stdin, and print a summary of the parsed hits.

    The protein is taken from the feature's 'translation' qualifier when it is
    present; otherwise the feature is extracted from the record and translated.

    :param gbkf: GenBank file, handed to genbank_seqio
    :param hmmf: HMM file, handed to hmmscan
    :return: None; the summary line is printed to stdout
    """
    hmmscan_cmd = [
        "hmmscan", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
        '--noali', hmmf, '-'
    ]
    allhits = {}
    n_results = 0
    n_hits = 0

    for record in genbank_seqio(gbkf):
        fasta = []
        for feature in record.features:
            if feature.type != 'CDS':
                continue
            quals = feature.qualifiers
            protein = (quals['translation'][0] if 'translation' in quals
                       else str(feature.extract(record).translate().seq))
            fasta.append(f">{feature_id(record, feature)}\n{protein}")

        # All proteins of this record go to a single hmmscan process.
        scan = subprocess.Popen(hmmscan_cmd,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        raw_output, _ = scan.communicate(input="\n".join(fasta).encode())

        for query in SearchIO.parse(StringIO(raw_output.decode()),
                                    'hmmer3-text'):
            per_query = {}
            allhits[query.id] = per_query
            n_results += 1
            for hit in query:
                per_query[hit.id] = hit.evalue
                n_hits += 1

    print(
        f"Using hmmscan and streaming all at once there were {n_results} results and {n_hits} hits, and our dict has {len(allhits)} entries"
    )
コード例 #5
0
ファイル: genbank_has_phage.py プロジェクト: bcpd/EdwardsLab
Test a directory of genbank files and note whether they have the is_phage qualifier for their genomes
"""

import os
import sys
import argparse

from roblib import genbank_seqio, message

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    # Command line: -d names the directory to scan, -v turns on progress messages.
    cli = argparse.ArgumentParser(description=" ")
    cli.add_argument('-d', help='directory of genbank files', required=True)
    cli.add_argument('-v', help='verbose output', action='store_true')
    opts = cli.parse_args()

    for fname in os.listdir(opts.d):
        if opts.v:
            message(f"Reading {fname}", "GREEN")
        # Count the features, across every record of this file, that carry an
        # is_phage qualifier.
        phage_feats = sum(
            1
            for record in genbank_seqio(os.path.join(opts.d, fname))
            for feature in record.features
            if 'is_phage' in feature.qualifiers
        )
        print(f"{fname}\t{phage_feats}")