def trf_run_CMD(fas_file, seq, seqtype, softmask=False, tmpdir=None):
    """Mask tandem repeats in a FASTA temp file with the external ``trf`` tool.

    Protein input (``seqtype == 'p'``) is returned untouched.  For any other
    sequence type, ``trf`` is run on ``fas_file`` with a fixed parameter set,
    its ``.mask`` output is read back, the ``.mask`` and ``.dat`` by-products
    are removed, ``fas_file`` is closed, and the masked sequences are written
    to a fresh named temporary FASTA file.

    :param fas_file: open named temporary file holding the input FASTA; it is
        closed by this function unless ``seqtype == 'p'``.
    :param seq: the original sequences, in the same order as in ``fas_file``;
        only used when ``softmask`` is true.
    :param seqtype: sequence type code; ``'p'`` skips masking entirely.
    :param softmask: if true, positions that ``trf`` replaced by ``'N'`` are
        written as the lower-cased original character instead of ``'N'``.
    :param tmpdir: directory for the new temporary file (passed to
        ``tempfile.NamedTemporaryFile``).
    :returns: ``fas_file`` itself for protein input, otherwise a new, flushed
        ``NamedTemporaryFile`` containing the masked sequences.
    """
    if seqtype == 'p':
        return fas_file

    olddir = os.getcwd()
    # trf is run from the directory of the input file so that the output
    # paths computed below (input name + suffix) point at what it wrote.
    os.chdir(os.path.dirname(fas_file.name))
    try:
        # Discard trf's stdout; the context manager guarantees that the
        # devnull handle is released even when the command fails.
        with open(os.devnull, 'w') as devnull:
            util.run_cmd("trf %s 2 7 7 80 10 50 2000 -m -h" % fas_file.name,
                         verbose=False, stdout=devnull)

        fileres = fas_file.name + '.2.7.7.80.10.50.2000.mask'
        fileres2 = fas_file.name + '.2.7.7.80.10.50.2000.dat'

        result = Read(fileres, sep='\t')
        util.run_cmd('rm %s' % fileres, shell=True)
        util.run_cmd('rm %s' % fileres2, shell=True)
        fas_file.close()

        fas_new = tempfile.NamedTemporaryFile(suffix='.fasta', dir=tmpdir)
        # NOTE(review): result.Get(0)() is presumably the column of sequence
        # titles read from the mask file -- confirm against Read().
        if softmask:
            nresult = []
            for seqold, seqnew in zip(seq, result.seq()):
                # Keep the original character unless trf newly masked it
                # (mask has 'N' where the original did not); those positions
                # become lower case.
                nresult.append(''.join(
                    [c1 if c2 != 'N' or c1 == 'N' else c1.lower()
                     for c1, c2 in zip(seqold, seqnew)]))
            write_fasta_text(result.Get(0)(), nresult, len(nresult), fas_new)
        else:
            write_fasta_text(result.Get(0)(), result.seq(), len(result), fas_new)
        fas_new.flush()
    finally:
        # Always restore the caller's working directory, also on failure.
        os.chdir(olddir)
    return fas_new
def last(data, type, folder=None, pos_mode='last', dbargs='', alargs='', lsargs='', probs=0, trf=False, last_split=True, calc_evalue=False, softmask=False, tmpdir=None):
    """Align two sets of sequences with the external LAST tools.

    The sequences are written to temporary FASTA files, optionally masked
    with ``trf_run_CMD``, a LAST database is built with the command from
    ``last_make_db_CMD``, the alignment command from ``last_run_CMD`` is run
    with its stdout captured in a temporary ``.maf`` file, and that file is
    parsed by ``last_result2``.

    :param data: pair ``(seq_1, seq_2)`` of sequence collections.
    :param type: pair of sequence type codes (``'n'`` / ``'p'``).  For
        ``('n', 'p')`` the two inputs are swapped (with a warning).
    :param folder: unused by this function.
    :param pos_mode: passed through to ``last_result2``.
    :param dbargs: extra argument string for the database command.
    :param alargs: extra argument string for the alignment command.
    :param lsargs: extra argument string, passed to ``last_run_CMD``.
    :param probs: if non-zero, ``-j <probs + 4>`` is added to the alignment
        arguments and ``last_result2`` is told that probabilities are present.
    :param trf: if true, sequences are upper-cased, ``-c`` / ``-u2`` are added
        to the database / alignment arguments when ``not_contains`` reports
        them missing, and both FASTA files are passed through ``trf_run_CMD``.
    :param last_split: passed to ``last_run_CMD`` and ``last_result2``.
    :param calc_evalue: if true (and both types are equal) a database is also
        built for the second sequence set.
    :param softmask: passed to ``trf_run_CMD``.
    :param tmpdir: directory in which the temporary files are created.
    :returns: whatever ``last_result2`` returns for the alignment output.
    """
    alargs = [alargs]
    dbargs = [dbargs]
    lsargs = [lsargs]
    if probs:
        alargs.append('-j %d' % (probs + 4))

    seq_1, seq_2 = data[0], data[1]

    if type[0] == 'n' and type[1] == 'p':
        util.warning('Reversing order, last supports only Prot to DNA and not DNA to Prot')
        seq_1, seq_2 = seq_2, seq_1
        type = (type[1], type[0])

    if trf:
        seq_1 = [s.upper() for s in seq_1]
        seq_2 = [s.upper() for s in seq_2]
        if not_contains(dbargs, '-c'):
            dbargs.append('-c')
        if not_contains(alargs, '-u'):
            alargs.append('-u2')

    title_1 = ["%d" % i for i in range(len(seq_1))]
    title_2 = ["%d" % i for i in range(len(seq_2))]

    fas_1 = fas_2 = res = None
    built_dbs = []  # database prefixes whose files must be removed afterwards
    try:
        fas_1 = tempfile.NamedTemporaryFile(suffix='.fasta', dir=tmpdir)
        fas_2 = tempfile.NamedTemporaryFile(suffix='.fasta', dir=tmpdir)
        res = tempfile.NamedTemporaryFile(suffix='.maf', dir=tmpdir)

        # Database prefixes are derived from the temp-file names.
        # NOTE(review): db_1 drops 4 characters while db_2 drops the whole
        # 6-character '.fasta' suffix; both give usable prefixes, but confirm
        # whether the asymmetry is intended.
        db_1 = fas_1.name[:-4]
        db_2 = fas_2.name[:-6]

        write_fasta_text(title_1, seq_1, len(seq_1), fas_1)
        write_fasta_text(title_2, seq_2, len(seq_2), fas_2)
        fas_1.flush()
        fas_2.flush()

        if trf:
            # trf_run_CMD closes the file it is given and returns a new one
            # (or returns the same file for protein input).
            fas_1 = trf_run_CMD(fas_1, seq_1, type[0], softmask=softmask, tmpdir=tmpdir)
            fas_2 = trf_run_CMD(fas_2, seq_2, type[1], softmask=softmask, tmpdir=tmpdir)

        # E-value calculation is disabled for mixed-type comparisons.
        if type[0] != type[1]:
            calc_evalue = False

        util.run_cmd(last_make_db_CMD(db_1, fas_1.name, type[0], dbargs), verbose=False)
        built_dbs.append(db_1)
        if calc_evalue:
            util.run_cmd(last_make_db_CMD(db_2, fas_2.name, type[1], dbargs), verbose=False)
            built_dbs.append(db_2)

        util.run_cmd(last_run_CMD(db_1, type[0], db_2, fas_2.name, type[1], alargs, lsargs, last_split, calc_evalue),
                     shell=True, stdout=res, verbose=False)
        res.flush()

        result = last_result2(res.name, pos_mode, probs > 0, last_split, calc_evalue)
    finally:
        # Release temp files and database files on success and on failure.
        # Closing an already closed NamedTemporaryFile is a no-op.
        for handle in (fas_1, fas_2, res):
            if handle is not None:
                handle.close()
        for db in built_dbs:
            util.run_cmd('rm %s*' % db, shell=True)
    return result
def blast(data, type, folder, reciprocal=True, normalize=False, overwrite=False, blastopts='-num_threads %d' % multiprocessing.cpu_count()):
    """Run BLAST between two sets of sequences and return the hits as columns.

    Both sequence sets are written to temporary FASTA files, BLAST databases
    are built with the commands from ``blast_make_db_CMD``, and the searches
    from ``blast_run_CMD`` write their results to
    ``<folder>/<md5_query>-<md5_db>.tsv``.  The temporary FASTA/database files
    are removed afterwards and the result files are parsed with
    ``blast_res_to_dict``.

    :param data: pair ``(seq_1, seq_2)`` of sequence collections.
    :param type: pair of sequence type codes, one per sequence set.
    :param folder: directory in which the ``.tsv`` result files are placed.
    :param reciprocal: also search set 2 against set 1 and combine both
        directions with ``blast_reciprocal``.
    :param normalize: also search each set against itself and normalise the
        scores with ``blast_bitscore_normalize``.
    :param overwrite: passed through to ``blast_run_CMD``.
    :param blastopts: extra option string passed to ``blast_run_CMD``.
    :returns: tuple of 14 ``util.darray`` columns, in the order
        qseqid, sseqid, qlen, qstart, qend, slen, sstart, send, length,
        mismatch, gapopen, pident, evalue, bitscore.
    """
    seq_1, seq_2 = data[0], data[1]
    title_1 = ["%d" % i for i in range(len(seq_1))]
    title_2 = ["%d" % i for i in range(len(seq_2))]

    # delete=False: the files must outlive close() so the BLAST tools can
    # read them; they are removed explicitly in the cleanup step below.
    fas_1 = tempfile.NamedTemporaryFile(delete=False)
    fas_2 = tempfile.NamedTemporaryFile(delete=False)
    db_1 = "%s.blastdb" % fas_1.name
    db_2 = "%s.blastdb" % fas_2.name

    try:
        md5_1 = write_fasta_text(title_1, seq_1, len(seq_1), fas_1)
        md5_2 = write_fasta_text(title_2, seq_2, len(seq_2), fas_2)
        fas_1.close()
        fas_2.close()

        # Search 1 -> 2 (always).
        file_12 = "%s/%s-%s.tsv" % (folder, md5_1, md5_2)
        mkdb_cmds = [blast_make_db_CMD(fas_2.name, db_2, type[1])]
        blast_cmds = [blast_run_CMD(fas_1.name, db_2, type[0], file_12, blastopts, overwrite)]

        # db_1 is searched by both the reciprocal (2 -> 1) and the
        # normalising (1 -> 1) run, so it is needed if either is requested.
        # It holds sequence set 1 and is therefore built with type[0].
        if reciprocal or normalize:
            mkdb_cmds.append(blast_make_db_CMD(fas_1.name, db_1, type[0]))

        if reciprocal:
            # Search 2 -> 1; the query is set 2, hence type[1], matching how
            # every other search pairs a query with its own type code.
            file_21 = "%s/%s-%s.tsv" % (folder, md5_2, md5_1)
            blast_cmds.append(blast_run_CMD(fas_2.name, db_1, type[1], file_21, blastopts, overwrite))

        if normalize:
            # Self searches 1 -> 1 and 2 -> 2.
            file_11 = "%s/%s-%s.tsv" % (folder, md5_1, md5_1)
            blast_cmds.append(blast_run_CMD(fas_1.name, db_1, type[0], file_11, blastopts, overwrite))
            file_22 = "%s/%s-%s.tsv" % (folder, md5_2, md5_2)
            blast_cmds.append(blast_run_CMD(fas_2.name, db_2, type[1], file_22, blastopts, overwrite))

        util.run_par_cmds(mkdb_cmds)
        util.run_seq_cmds(blast_cmds)
    finally:
        fas_1.close()
        fas_2.close()
        # Remove the FASTA files and everything derived from their names
        # (the database files).  The directory and base names are taken from
        # the temp files themselves instead of assuming '/tmp/<name>'.
        tmp_dir = os.path.dirname(fas_1.name)
        patterns = ["*%s*" % os.path.basename(f.name) for f in (fas_1, fas_2)]
        del_cmds = ["rm -f '%s'" % os.path.join(tmp_dir, f)
                    for f in os.listdir(tmp_dir)
                    if any(fnmatch.fnmatch(f, p) for p in patterns)]
        util.run_seq_cmds(del_cmds)

    ab = blast_res_to_dict(file_12)
    if reciprocal:
        ba = blast_res_to_dict(file_21)
        ab = blast_reciprocal(ab, ba)
    if normalize:
        aa = blast_res_to_dict(file_11, max=True)
        bb = blast_res_to_dict(file_22, max=True)
        ab = blast_bitscore_normalize(ab, aa, bb)

    # qseqid sseqid qlen qstart qend slen sstart send length mismatch gapopen pident evalue bitscore
    sp_types = (int, int, int, int, int, int, int, int, int, int, int, float, float, float)

    # Flatten {key: [hit, ...]} into one row per hit, prefixed by the key.
    rows = [list(key) + hit for key, hits in ab.items() for hit in hits]
    if not rows:
        return tuple([util.darray([], dtype) for dtype in sp_types])

    # Transpose the rows into columns and convert each to its column type.
    return tuple([util.darray(list(column), dtype)
                  for dtype, column in zip(sp_types, zip(*rows))])