def count_feats(gbkf, verbose=False):
    """
    Tally how often each feature type occurs in a GenBank file.

    :param gbkf: the GenBank file, handed straight to genbank_seqio()
    :param verbose: when true, announce the file being read via message()
    :return: a dict mapping each feature type to the number of times it was seen
    """
    if verbose:
        message(f"Reading {gbkf}", "BLUE")

    tally = {}
    for record in genbank_seqio(gbkf):
        for feature in record.features:
            kind = feature.type
            if kind in tally:
                tally[kind] += 1
            else:
                tally[kind] = 1
    return tally
def hmmsearch_print_then_parse(gbkf, hmmf):
    """
    Write every CDS translation to a temporary fasta file, run hmmsearch
    against that file, and print a summary of the parsed output.

    For each CDS the 'translation' qualifier is used when present; otherwise
    the feature is extracted from the record and translated.

    :param gbkf: the GenBank file, handed to genbank_seqio()
    :param hmmf: the HMM file given to hmmsearch
    :return: None. A one-line summary is printed to stdout; progress messages
        go to stderr.
    :raises SystemExit: with status -1 if hmmsearch cannot be started or
        exits with a non-zero status.
    """
    import os  # local import: needed only to remove the temporary file

    # delete=False because hmmsearch must reopen the file by name after we
    # have closed it. We remove the file ourselves in the finally clause.
    aaout = NamedTemporaryFile(mode='w+t', delete=False)
    try:
        with aaout:
            sys.stderr.write(f"Writing the amino acids to {aaout.name}\n")
            for seq in genbank_seqio(gbkf):
                for feat in seq.features:
                    if feat.type != 'CDS':
                        continue
                    if 'translation' in feat.qualifiers:
                        aa = feat.qualifiers['translation'][0]
                    else:
                        aa = str(feat.extract(seq).translate().seq)
                    myid = feature_id(seq, feat)
                    aaout.write(f">{myid}\n{aa}\n")

        sys.stderr.write("Searching\n")
        try:
            # check=True turns a non-zero exit status into CalledProcessError;
            # a missing executable surfaces as OSError (FileNotFoundError).
            search = subprocess.run(
                [
                    "hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
                    '--noali', hmmf, aaout.name
                ],
                stdout=subprocess.PIPE,
                check=True,
            )
        except (OSError, subprocess.CalledProcessError) as e:
            sys.stderr.write(f"Error running hmmsearch:\n{e}\n")
            sys.exit(-1)
    finally:
        # Also runs on sys.exit(), so the temporary file is never left behind.
        os.remove(aaout.name)

    sys.stderr.write("Parsing\n")
    results = SearchIO.parse(StringIO(search.stdout.decode()), 'hmmer3-text')
    allhits = {}
    hitcount = 0
    rescount = 0
    for res in results:
        allhits[res.id] = {}
        rescount += 1
        for hit in res:
            allhits[res.id][hit.id] = hit.evalue
            hitcount += 1

    print(
        f"Using hmmsearch and tempfiles there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
def stream_hmmsearch(gbkf, hmmf):
    """
    Pipe the CDS proteins of each GenBank record to hmmsearch on stdin.

    NOTE: THIS DOES NOT WORK!!
    hmmsearch can't rewind the sequences, so you can't stream against it.
    Use hmmscan (slow) or a temp file (fast!) instead.

    :param gbkf: the GenBank file, handed to genbank_seqio()
    :param hmmf: the HMM file given to hmmsearch
    :return: None. A one-line summary is printed to stdout.
    """
    # '-' as the final argument asks hmmsearch to read the sequences from stdin
    command = [
        "hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
        '--noali', hmmf, '-'
    ]

    allhits = {}
    rescount = 0
    hitcount = 0

    for record in genbank_seqio(gbkf):
        # one fasta entry per CDS of this record
        prots = []
        for feature in record.features:
            if feature.type == 'CDS':
                if 'translation' in feature.qualifiers:
                    aa = feature.qualifiers['translation'][0]
                else:
                    aa = str(feature.extract(record).translate().seq)
                myid = feature_id(record, feature)
                prots.append(f">{myid}\n{aa}")

        fasta = "\n".join(prots).encode()
        proc = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        captured, _ = proc.communicate(input=fasta)

        for res in SearchIO.parse(StringIO(captured.decode()), 'hmmer3-text'):
            pairs = [(hit.id, hit.evalue) for hit in res]
            allhits[res.id] = dict(pairs)
            rescount += 1
            hitcount += len(pairs)

    print(
        f"Using hmmsearch and streaming all at once there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
def run_hmmscan_aao(gbkf, hmmf):
    """
    Run hmmscan once per GenBank record, streaming all of that record's CDS
    proteins to it on stdin, and print a summary of the parsed output.

    For each CDS the 'translation' qualifier is used when present; otherwise
    the feature is extracted from the record and translated.

    :param gbkf: the GenBank file, handed to genbank_seqio()
    :param hmmf: the HMM database given to hmmscan
    :return: None. A one-line summary is printed to stdout; a warning is
        written to stderr if hmmscan exits with a non-zero status.
    """
    allhits = {}
    hitcount = 0
    rescount = 0
    for seq in genbank_seqio(gbkf):
        prots = []
        for feat in seq.features:
            if feat.type != 'CDS':
                continue
            if 'translation' in feat.qualifiers:
                aa = feat.qualifiers['translation'][0]
            else:
                aa = str(feat.extract(seq).translate().seq)
            myid = feature_id(seq, feat)
            prots.append(f">{myid}\n{aa}")

        if not prots:
            # No CDS in this record: nothing to search, so don't start
            # hmmscan on an empty input stream.
            continue

        # '-' as the final argument asks hmmscan to read the sequences from stdin
        search = subprocess.Popen(
            [
                "hmmscan", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
                '--noali', hmmf, '-'
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        hmmresult = search.communicate(input="\n".join(prots).encode())[0]
        if search.returncode != 0:
            # Keep going (best effort), but don't let a failed search pass
            # silently as "no hits".
            sys.stderr.write(f"hmmscan exited with status {search.returncode}\n")

        results = SearchIO.parse(StringIO(hmmresult.decode()), 'hmmer3-text')
        for res in results:
            allhits[res.id] = {}
            rescount += 1
            for hit in res:
                allhits[res.id][hit.id] = hit.evalue
                hitcount += 1

    print(
        f"Using hmmscan and streaming all at once there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
"""
Test a directory of genbank files and note whether they have the is_phage qualifier for their genomes
"""

import os
import sys
import argparse
from roblib import genbank_seqio, message

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-d', help='directory of genbank files', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # os.listdir() returns entries in arbitrary order; sort them so the
    # report is reproducible from run to run.
    for f in sorted(os.listdir(args.d)):
        path = os.path.join(args.d, f)
        if not os.path.isfile(path):
            # sub-directories are not genbank files; skip them
            continue
        if args.v:
            message(f"Reading {f}", "GREEN")
        # number of features in this file that carry an 'is_phage' qualifier
        pc = 0
        for s in genbank_seqio(path):
            for feat in s.features:
                if 'is_phage' in feat.qualifiers:
                    pc += 1
        # one tab-separated line per file: file name, then the count
        print(f"{f}\t{pc}")