Exemple #1
0
def blast(db_seq,
          qry_seq,
          seqtype='nucl',
          blast_out=None,
          blast_outfmt=None,
          ncpu=4):
    """Build a BLAST database from `db_seq` and search `qry_seq` against it.

    `seqtype` selects the program: 'nucl' runs blastn, 'prot' runs blastp;
    any other value raises ValueError before any command is executed.
    `blast_out` defaults to `qry_seq` + '.blastout' and `blast_outfmt`
    defaults to '6'. `ncpu` is passed as -num_threads.

    Returns the path of the BLAST output file.
    """
    # Reject unsupported molecule types up front.
    if seqtype not in ('nucl', 'prot'):
        raise ValueError(
            'Unknown molecule type "{}" for blast'.format(seqtype))
    program = 'blastn' if seqtype == 'nucl' else 'blastp'

    out_path = qry_seq + '.blastout' if blast_out is None else blast_out
    out_format = '6' if blast_outfmt is None else blast_outfmt

    # The database is (re)built on every call, next to db_seq.
    makedb_cmd = 'makeblastdb -in {} -dbtype {}'.format(db_seq, seqtype)
    run_cmd(makedb_cmd, logger=logger)

    search_cmd = '{} -query {} -db {} -out {} -outfmt {} -num_threads {}'.format(
        program, qry_seq, db_seq, out_path, out_format, ncpu)
    run_cmd(search_cmd, logger=logger)
    return out_path
Exemple #2
0
def hmmscan(inSeq, hmmdb='rexdb.hmm', hmmout=None, ncpu=4):
    """Run hmmscan on `inSeq` against `hmmdb` and return the domain table path.

    The command uses E-value cutoffs of 0.01 (-E and --domE), suppresses
    alignment output (--noali), runs with `ncpu` CPUs and discards stdout;
    only the --domtblout file is kept.

    When `hmmout` is None the output name is derived as <prefix>.domtbl.
    """
    if hmmout is None:
        # `prefix` is not a parameter of this function: the original code
        # only worked when a module-level `prefix` happened to exist and
        # raised NameError otherwise. Keep honouring that global when it is
        # defined, and fall back to the input path (as `blast` does with its
        # query) when it is not.
        try:
            hmmout = prefix + '.domtbl'
        except NameError:
            hmmout = inSeq + '.domtbl'
    cmd = 'hmmscan --notextw -E 0.01 --domE 0.01 --noali --cpu {} --domtblout {} {} {} > /dev/null'.format(
        ncpu, hmmout, hmmdb, inSeq)
    run_cmd(cmd, logger=logger)
    return hmmout
def main(inSeq, domains, outSeq=sys.stdout, tmpdir='/tmp'):
    """Align, per domain, the sequences that carry every requested domain.

    Records in the FASTA file `inSeq` are assigned to a domain through the
    `gene=<name>` tag in their description; the sequence they belong to is
    the record id with its last '#'-separated field removed. Only sequences
    present in all of `domains` are kept. For each domain the kept records
    are written to a temporary FASTA file under `tmpdir`, aligned with
    `mafft --auto`, and the resulting alignments are handed to
    `catAln(alnfiles, outSeq)`.

    Records without a `gene=` tag are ignored. An empty `domains` yields an
    empty selection instead of failing.
    """
    gene_tag = re.compile(r'gene=([^;\s]+)')  # compiled once, used per record

    def parse(rc):
        """Return (domain, raw id) for a record, or (None, None) if untagged."""
        match = gene_tag.search(rc.description)
        if match is None:
            return None, None
        return match.group(1), '#'.join(rc.id.split('#')[:-1])

    uid = uuid.uuid1()

    # intersect: collect the raw ids seen for each requested domain
    d_domain = {domain: [] for domain in domains}
    for rc in SeqIO.parse(inSeq, 'fasta'):
        domain, raw_id = parse(rc)
        if domain is None:
            continue
        if domain in d_domain:
            d_domain[domain].append(raw_id)

    id_sets = [set(rawids) for rawids in d_domain.values()]
    # set.intersection() needs at least one operand; no domains -> nothing kept
    intersect = set.intersection(*id_sets) if id_sets else set()
    sys.stderr.write('{} sequences contain {} domains\n'.format(
        len(intersect), domains))

    d_file = {}
    files = []
    handles = []  # every opened handle, so all are closed even on error
    try:
        # open files
        for i, domain in enumerate(domains):
            outfile = '{}/{}.{}.fa'.format(tmpdir, uid, i)
            fout = open(outfile, 'w')
            handles.append(fout)
            d_file[domain] = fout
            files.append(outfile)

        # write: second pass over the input, keeping only shared sequences
        for rc in SeqIO.parse(inSeq, 'fasta'):
            domain, raw_id = parse(rc)
            if domain is None or domain not in d_file:
                continue
            if raw_id not in intersect:
                continue
            rc.id = raw_id
            SeqIO.write(rc, d_file[domain], 'fasta')
    finally:
        # close files
        for fout in handles:
            fout.close()

    # align
    alnfiles = []
    for seqfile in files:
        alnfile = seqfile + '.aln'
        cmd = 'mafft --auto {} > {}'.format(seqfile, alnfile)
        run_cmd(cmd, log=True)
        alnfiles.append(alnfile)

    # concatenate
    catAln(alnfiles, outSeq)
Exemple #4
0
 def update_hmmer(self, db):
     """Regenerate the HMM database `db` with hmmconvert and hmmpress.

     `db` and its four hmmpress index files (.h3f/.h3i/.h3m/.h3p) are passed
     through `backup_file`; hmmconvert then reads the backup and writes `db`,
     and hmmpress is run on the result. Logs success when both commands
     report status 0 in total; otherwise logs an error and exits with
     status 1.
     """
     from small_tools import backup_file
     # backup_file returns the backup name and the name to (re)create.
     backup, db = backup_file(db)
     for index_ext in ('.h3f', '.h3i', '.h3m', '.h3p'):
         backup_file(backup + index_ext)

     convert_cmd = 'hmmconvert {} > {}'.format(backup, db)
     _, _, convert_status = run_cmd(convert_cmd, logger=logger)
     press_cmd = 'hmmpress {}'.format(db)
     _, _, press_status = run_cmd(press_cmd, logger=logger)

     if convert_status + press_status != 0:
         logger.error('HMM failed to convert. exit')
         sys.exit(1)
     logger.info('HMM converted. it will continue')
Exemple #5
0
 def check_presence(self, program):
     """Return True when `which <program>` exits with status 0, else False."""
     _, _, status = run_cmd('which {}'.format(program))
     return bool(status == 0)
Exemple #6
0
 def pipeline(self):
     """Write, align and build an HMM for each clade's sequences.

     For every (clade, records) pair from `self.BinSeqsByClade()`, the
     records are written to <tmpdir>/<clade>.fa (with '/' and ':' in the
     clade name replaced for the file name), aligned with `mafft --auto`
     into <file>.aln, and turned into <file>.hmm by `hmmbuild`, which is
     given the unmodified clade name via -n.
     """
     for clade, records in self.BinSeqsByClade().iteritems():
         # '/' and ':' are replaced so the clade name can be used as a file name
         file_stem = clade.replace('/', '__')
         file_stem = file_stem.replace(':', '--')
         fasta_path = '{}/{}.fa'.format(self.tmpdir, file_stem)
         with open(fasta_path, 'w') as handle:
             for record in records:
                 SeqIO.write(record, handle, 'fasta')

         aln_path = fasta_path + '.aln'
         run_cmd('mafft --auto {} > {} 2> /dev/null'.format(
             fasta_path, aln_path))

         hmm_path = fasta_path + '.hmm'
         run_cmd('hmmbuild -n {} {} {} > /dev/null'.format(
             clade, hmm_path, aln_path))
Exemple #7
0
 def check_blast_version(self, program):
     """Return the version string printed by `<program> -version`.

     The version is the run of digits, dots and plus signs following the
     first 'blast...' word in the command's output.
     """
     out, _, _ = run_cmd('{} -version'.format(program))
     match = re.search(r'blast\S* ([\d\.\+]+)', out)
     return match.group(1)
Exemple #8
0
 def check_hmmer_verion(self, program):
     """Return the token following 'HMMER ' in the output of `<program> -h`."""
     out, _, _ = run_cmd('{} -h'.format(program))
     match = re.search(r'HMMER (\S+)', out)
     return match.group(1)