def blast(db_seq, qry_seq, seqtype='nucl', blast_out=None, blast_outfmt=None, ncpu=4):
    """Format *db_seq* as a BLAST database and search *qry_seq* against it.

    Args:
        db_seq: Path of the sequence file handed to ``makeblastdb -in`` and
            then used as ``-db``.
        qry_seq: Path of the query sequence file.
        seqtype: ``'nucl'`` selects ``blastn``, ``'prot'`` selects ``blastp``.
            The same value is passed to ``makeblastdb -dbtype``.
        blast_out: Output path; defaults to ``qry_seq + '.blastout'``.
        blast_outfmt: Value for ``-outfmt``; defaults to ``'6'``.
        ncpu: Value for ``-num_threads``.

    Returns:
        The path the BLAST output was written to.

    Raises:
        ValueError: If *seqtype* is neither ``'nucl'`` nor ``'prot'``.
    """
    known_apps = (('nucl', 'blastn'), ('prot', 'blastp'))
    blast_app = next(
        (app for kind, app in known_apps if seqtype == kind), None)
    if blast_app is None:
        raise ValueError(
            'Unknown molecule type "{}" for blast'.format(seqtype))

    out_path = qry_seq + '.blastout' if blast_out is None else blast_out
    out_format = '6' if blast_outfmt is None else blast_outfmt

    # The database has to be formatted before the search can use it.
    makedb_cmd = 'makeblastdb -in {} -dbtype {}'.format(db_seq, seqtype)
    run_cmd(makedb_cmd, logger=logger)

    search_cmd = '{} -query {} -db {} -out {} -outfmt {} -num_threads {}'.format(
        blast_app, qry_seq, db_seq, out_path, out_format, ncpu)
    run_cmd(search_cmd, logger=logger)
    return out_path
def hmmscan(inSeq, hmmdb='rexdb.hmm', hmmout=None, ncpu=4):
    """Run ``hmmscan`` on *inSeq* against *hmmdb* and return the table path.

    The command uses an E-value cutoff of 0.01 for both sequences (``-E``)
    and domains (``--domE``), writes the per-domain table with
    ``--domtblout`` and sends standard output to ``/dev/null``.

    Args:
        inSeq: Path of the sequence file to scan.
        hmmdb: Path of the HMM database.
        hmmout: Path for the ``--domtblout`` table; defaults to
            ``inSeq + '.domtbl'``.
        ncpu: Value for ``--cpu``.

    Returns:
        The path of the domain table.
    """
    if hmmout is None:
        # The default used to be built from `prefix`, a name that is never
        # bound in this function, so omitting `hmmout` raised NameError.
        # Derive it from the input path instead, the same way blast()
        # derives its default output name from the query path.
        hmmout = inSeq + '.domtbl'
    cmd = 'hmmscan --notextw -E 0.01 --domE 0.01 --noali --cpu {} --domtblout {} {} {} > /dev/null'.format(
        ncpu, hmmout, hmmdb, inSeq)
    run_cmd(cmd, logger=logger)
    return hmmout
def main(inSeq, domains, outSeq=sys.stdout, tmpdir='/tmp'):
    """Align, per domain, the sequences that carry every requested domain.

    Each FASTA record in *inSeq* is expected to name its domain as
    ``gene=<domain>`` in the description and to have an id whose last
    ``#``-separated field is stripped to obtain the "raw" sequence id.
    Raw ids that occur for every entry of *domains* are kept; their records
    are written to one FASTA file per domain under *tmpdir*, each file is
    aligned with ``mafft --auto``, and the alignment files are handed to
    ``catAln`` together with *outSeq*.

    The intermediate FASTA and alignment files are left in *tmpdir*.

    Args:
        inSeq: Path (or handle) of the domain FASTA file; it is parsed twice.
        domains: Sequence of domain names; iterated more than once.
        outSeq: Destination passed through to ``catAln``.
        tmpdir: Directory for the intermediate files.

    Raises:
        ValueError: If *domains* is empty (previously a NameError on the
            unbound intersection).
    """
    ids_by_domain = {domain: set() for domain in domains}
    if not ids_by_domain:
        raise ValueError('No domains given to intersect')

    uid = uuid.uuid1()
    gene_pattern = re.compile(r'gene=([^;\s]+)')

    def domain_and_raw_id(rc):
        """Return (domain, raw id) for a record, or (None, None) if untagged."""
        match = gene_pattern.search(rc.description)
        if match is None:
            # Records without a gene= tag cannot belong to any requested
            # domain; skip them instead of failing on None.groups().
            return None, None
        return match.group(1), '#'.join(rc.id.split('#')[:-1])

    # intersect
    for rc in SeqIO.parse(inSeq, 'fasta'):
        domain, raw_id = domain_and_raw_id(rc)
        if domain is not None and domain in ids_by_domain:
            ids_by_domain[domain].add(raw_id)
    intersect = set.intersection(*ids_by_domain.values())
    sys.stderr.write('{} sequences contain {} domains\n'.format(
        len(intersect), domains))

    # open files, write, and always close every handle that was opened
    d_file = {}
    files = []
    handles = []
    try:
        for i, domain in enumerate(domains):
            outfile = '{}/{}.{}.fa'.format(tmpdir, uid, i)
            fout = open(outfile, 'w')
            # Tracked separately from d_file so that a handle replaced by a
            # repeated domain name is still closed.
            handles.append(fout)
            d_file[domain] = fout
            files.append(outfile)

        for rc in SeqIO.parse(inSeq, 'fasta'):
            domain, raw_id = domain_and_raw_id(rc)
            if domain is None or domain not in d_file:
                continue
            if raw_id not in intersect:
                continue
            rc.id = raw_id
            SeqIO.write(rc, d_file[domain], 'fasta')
    finally:
        for fout in handles:
            fout.close()

    # align
    alnfiles = []
    for seqfile in files:
        alnfile = seqfile + '.aln'
        cmd = 'mafft --auto {} > {}'.format(seqfile, alnfile)
        run_cmd(cmd, log=True)
        alnfiles.append(alnfile)

    # concatenate
    catAln(alnfiles, outSeq)
def update_hmmer(self, db):
    """Back up an HMM database, then rebuild and re-index it.

    ``backup_file`` is called on *db* and on four index-file names; the
    database is then regenerated from the backup with ``hmmconvert`` and
    indexed with ``hmmpress``. The process exits with status 1 unless both
    commands report success.

    Args:
        db: Path of the HMM database file.
    """
    from small_tools import backup_file
    # presumably returns (backup path, original path) — confirm in small_tools
    bk_db, db = backup_file(db)
    # NOTE(review): the index suffixes are appended to the backup path, not
    # to the original database path — confirm this is what backup_file expects.
    for suffix in ['.h3f', '.h3i', '.h3m', '.h3p']:
        backup_file(bk_db + suffix)

    cmd = 'hmmconvert {} > {}'.format(bk_db, db)
    _out, _err, status0 = run_cmd(cmd, logger=logger)
    cmd = 'hmmpress {}'.format(db)
    _out, _err, status1 = run_cmd(cmd, logger=logger)

    # Check each status on its own: summing them would let a negative status
    # (e.g. a signal-terminated child) cancel out a positive one.
    if status0 == 0 and status1 == 0:
        logger.info('HMM converted. it will continue')
    else:
        logger.error('HMM failed to convert. exit')
        sys.exit(1)
def check_presence(self, program):
    """Return True when ``which <program>`` exits with status 0, else False."""
    _out, _err, status = run_cmd('which {}'.format(program))
    return status == 0
def pipeline(self):
    """Build one profile HMM per clade from the binned sequences.

    For every ``(clade, records)`` pair yielded by
    ``self.BinSeqsByClade().iteritems()`` (Python 2 style mapping API):
    the records are written as FASTA under ``self.tmpdir``, aligned with
    ``mafft --auto``, and the alignment is passed to ``hmmbuild``, which is
    given the original clade name via ``-n``.
    """
    for clade, records in self.BinSeqsByClade().iteritems():
        # '/' and ':' are replaced so the clade name can serve as a file name.
        file_stem = clade.replace('/', '__').replace(':', '--')
        fasta_path = '{}/{}.fa'.format(self.tmpdir, file_stem)
        aln_path = fasta_path + '.aln'
        hmm_path = fasta_path + '.hmm'

        with open(fasta_path, 'w') as handle:
            for record in records:
                SeqIO.write(record, handle, 'fasta')

        run_cmd('mafft --auto {} > {} 2> /dev/null'.format(
            fasta_path, aln_path))
        run_cmd('hmmbuild -n {} {} {} > /dev/null'.format(
            clade, hmm_path, aln_path))
def check_blast_version(self, program):
    """Return the version string printed by ``<program> -version``.

    The version is the run of digits, dots and plus signs that follows the
    first ``blast...`` word in the command's standard output.

    Raises:
        AttributeError: If the output contains no such pattern.
    """
    stdout, _stderr, _status = run_cmd('{} -version'.format(program))
    found = re.search(r'blast\S* ([\d\.\+]+)', stdout)
    return found.group(1)
def check_hmmer_verion(self, program):
    """Return the HMMER version reported by ``<program> -h``.

    The version is the first whitespace-free token following ``HMMER `` in
    the command's standard output. (The method name is spelled as callers
    know it.)

    Raises:
        AttributeError: If the output contains no ``HMMER <token>`` text.
    """
    stdout, _stderr, _status = run_cmd('{} -h'.format(program))
    found = re.search(r'HMMER (\S+)', stdout)
    return found.group(1)