def runBlast(self, ref, qry):
    """Run BLASTn of the query sequences against the reference sequences.

    The reference is dumped as FASTA and indexed with ``makeblastdb``; the
    queries are sorted by decreasing length and dealt round-robin into at
    most ``self.n_thread`` FASTA files, each handed to ``poolBlast`` through
    ``self.pool``.

    Parameters:
        ref: reference file, read with ``readFastq`` if ``self.refSeq`` is empty.
        qry: query file, read with ``readFastq`` if ``self.qrySeq`` is empty.

    Returns:
        The stacked arrays loaded from the files reported by the workers, or
        an empty ``(0, 15)`` object array when no worker reported a file.
    """
    logger('Run BLASTn starts')
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    # The FASTA dump and the BLAST database share the same path prefix.
    refNA = os.path.join(self.dirPath, 'refNA')
    refDb = refNA
    with open(refNA, 'w') as ref_out:
        for name, seq in self.refSeq.items():
            ref_out.write('>{0}\n{1}\n'.format(name, seq))
    db_cmd = '{makeblastdb} -dbtype nucl -in {refNA} -out {refDb}'.format(
        makeblastdb=makeblastdb, refNA=refNA, refDb=refDb)
    Popen(db_cmd.split(), stderr=PIPE, stdout=PIPE,
          universal_newlines=True).communicate()

    # Longest queries first, then interleave them across the chunk files.
    by_length = sorted(self.qrySeq.items(), key=lambda rec: -len(rec[1]))
    n_chunk = min(len(by_length), self.n_thread)
    chunk_files = [os.path.join(self.dirPath, 'qryNA.{0}'.format(idx))
                   for idx in range(n_chunk)]
    for idx, chunk_file in enumerate(chunk_files):
        with open(chunk_file, 'w') as qry_out:
            for name, seq in by_length[idx::self.n_thread]:
                qry_out.write('>{0}\n{1}\n'.format(name, seq))

    jobs = [[blastn, refDb, chunk_file, self.min_id, self.min_cov, self.min_ratio]
            for chunk_file in chunk_files]
    tables = []
    for saved in self.pool.imap_unordered(poolBlast, jobs):
        if saved is None:
            continue
        tables.append(np.load(saved, allow_pickle=True))
        os.unlink(saved)

    if len(tables):
        blastab = np.vstack(tables)
    else:
        blastab = np.empty([0, 15], dtype=object)
    logger('Run BLASTn finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def reScore(self, ref, qry, blastab, mode, perBatch=10000):
    """Recompute two columns of ``blastab`` with ``cigar2score``.

    Both sequence dictionaries are first re-encoded in place through
    ``nucEncoder`` (so after this call ``self.qrySeq`` / ``self.refSeq`` hold
    encoded arrays, not strings).  The table is then processed in slices of
    ``perBatch`` rows; for every row ``cigar2score`` receives column 14, the
    reference segment (reversed and subtracted from 4 when column 8 is not
    smaller than column 9), the query segment, column 6, ``mode`` and the
    constants 6 and 1.  The two values returned per row are written back to
    columns 2 and 11.

    Returns:
        The same ``blastab`` object, updated in place.
    """
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)
    for name, seq in self.qrySeq.items():
        self.qrySeq[name] = nucEncoder[np.array(list(seq)).view(asc2int)]
    for name, seq in self.refSeq.items():
        self.refSeq[name] = nucEncoder[np.array(list(seq)).view(asc2int)]

    def ref_segment(row):
        # Reference slice in alignment orientation.
        encoded = self.refSeq[str(row[1])]
        if row[8] < row[9]:
            return encoded[row[8] - 1:row[9]]
        return 4 - encoded[row[9] - 1:row[8]][::-1]

    total = len(blastab)
    for start in xrange(0, blastab.shape[0], perBatch):
        logger('Update scores: {0} / {1}'.format(start, total))
        batch = blastab[start:start + perBatch]
        scored = [
            cigar2score([
                row[14],
                ref_segment(row),
                self.qrySeq[str(row[0])][row[6] - 1:row[7]],
                row[6], mode, 6, 1,
            ])
            for row in batch
        ]
        scores = np.array(scored)
        # batch is a view of blastab, so this updates the caller's table.
        batch.T[2], batch.T[11] = scores.T
    return blastab
def runDiamond(self, ref, qry, nhits=10, frames='7'):
    """Align translated queries against translated references with DIAMOND.

    Queries are translated in the forward frames and, per query, the frame
    whose translation splits into the fewest pieces at 'X' is written to
    ``qryAA``.  Reference translations are cut into pieces (at an 'X' after
    at least 1000 residues) and dealt into five interleaved FASTA files; one
    ``diamond blastp`` run is made per file and the outputs are parsed by
    ``parseDiamond`` through ``self.pool``.

    Parameters:
        ref, qry: files read with ``readFastq`` when the cached sequences
            on ``self`` are empty.
        nhits: value passed to diamond's ``-k`` option.
        frames: frame selector forwarded to ``transeq`` for the reference.

    Returns:
        The stacked arrays loaded from the files reported by the workers, or
        an empty ``(0, 15)`` object array when none was reported (previously
        ``np.vstack`` raised ``ValueError`` in that case).
    """
    logger('Run diamond starts')
    n_chunk = 5  # number of interleaved reference batches

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')

    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            # Keep the frame with the fewest 'X'-separated pieces.
            _, frame, s = min([(len(s[:-1].split('X')), frame, s)
                               for frame, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, frame + 1, s))

    diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
        diamond=diamond, qryAA=qryAA)
    Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE,
          universal_newlines=True).communicate()

    refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for frame, s in enumerate(ss):
            # A sentinel 'X' guarantees a final match; it is stripped again.
            cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
            cdss[-1] = cdss[-1][:-1]
            # Offset of every piece within the translated frame.
            cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
            for ci, cs in zip(cdsi, cdss):
                if len(cs):
                    toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(n, frame + 1, ci, cs))

    for chunk in xrange(n_chunk):
        with open('{0}.{1}'.format(refAA, chunk), 'w') as fout:
            for line in toWrite[chunk::n_chunk]:
                fout.write(line)

        diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
            diamond=diamond,
            refAA='{0}.{1}'.format(refAA, chunk),
            qryAA=qryAA,
            aaMatch='{0}.{1}'.format(aaMatch, chunk),
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio * 100.)
        Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE,
              universal_newlines=True).communicate()

    jobs = [['{0}.{1}'.format(aaMatch, chunk), self.refSeq, self.qrySeq,
             self.min_id, self.min_cov, self.min_ratio]
            for chunk in xrange(n_chunk)]
    blastab = []
    for r in self.pool.imap_unordered(parseDiamond, jobs):
        if r is not None:
            blastab.append(np.load(r, allow_pickle=True))
            os.unlink(r)

    # np.vstack refuses an empty list; mirror runBlast's empty table instead.
    if blastab:
        blastab = np.vstack(blastab)
    else:
        blastab = np.empty([0, 15], dtype=object)
    logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def prepReference(prefix, ref_tag, reference, aligner, pilercr, trf, **args):
    """Index the reference, self-align it and annotate repetitive regions.

    Steps, all skipped when ``reference`` is falsy:
      1. build ``<prefix>.mmi`` with ``aligner`` (or ``aligner[0]`` with
         different options when ``aligner`` is a list);
      2. dump the reference as FASTA into a temporary file and collect
         regions reported by ``trf`` and ``pilercr``;
      3. align the reference against itself via ``alignAgainst``;
      4. append one ``unsure`` line per collected region to the file named
         by the second element of that result.

    Parameters:
        prefix: path prefix for generated files.
        ref_tag: tag of the reference; its last dotted suffix is dropped when
            naming the self-alignment output.
        reference: path of the reference sequence file.
        aligner: aligner executable, or a list whose first item is one.
        pilercr, trf: executables used for masking.
        **args: ignored.

    Returns:
        The value returned by ``alignAgainst``, or ``None`` when
        ``reference`` is falsy.
    """
    def mask_tandem(fasta_file):
        # Parse trf's "-ngs" stream: '@name' lines introduce a sequence, the
        # other lines start with "<start> <end> ...".
        cmd = '{0} {1} 2 4 7 80 10 60 2000 -d -h -ngs'.format(trf, fasta_file)
        trf_run = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
                                   universal_newlines=True)
        region = []
        try:
            for line in iter(trf_run.stdout.readline, r''):
                if line[0] == '@':
                    cont_name = line[1:].strip().split()[0]
                else:
                    part = line.split(' ', 2)[:2]
                    # Pad each repeat by two positions on both sides.
                    region.append([cont_name, int(part[0]) - 2, int(part[1]) + 2])
        finally:
            # Reap the child and release the pipe; the exit status is not
            # inspected, exactly as before.
            trf_run.stdout.close()
            trf_run.wait()
        return region

    def mask_crispr(fasta_file, prefix):
        # Run pilercr and read the block after 'SUMMARY BY POSITION'.
        cmd = '{0} -in {1} -out {2}.crispr'.format(pilercr, fasta_file, prefix)
        subprocess.Popen(cmd.split(), stderr=subprocess.PIPE).communicate()
        summary_trigger = 0
        region = []
        with open('{0}.crispr'.format(prefix)) as fin:
            for line in fin:
                if line.startswith('SUMMARY BY POSITION'):
                    summary_trigger = 1
                elif summary_trigger:
                    if line[0] == '>':
                        cont_name = line[1:].strip().split()[0]
                    elif len(line) > 10 and line.strip()[0] in '0123456789':
                        # Columns after offset 24 start with position and length.
                        part = line[24:].strip().split()
                        region.append([cont_name, int(part[0]),
                                       int(part[0]) + int(part[1]) - 1])
        os.unlink('{0}.crispr'.format(prefix))
        return region

    if not reference:
        return None

    # prepare reference
    if not isinstance(aligner, list):
        subprocess.Popen(
            '{0} -k15 -w5 -d {2}.mmi {1}'.format(aligner, reference, prefix).split(),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    else:
        subprocess.Popen(
            '{0} -cR01 {2}.mmi {1}'.format(aligner[0], reference, prefix).split()
        ).communicate()

    import tempfile
    with tempfile.NamedTemporaryFile(dir='.') as tf:
        # The temporary file only reserves a unique name; the FASTA copy
        # lives next to it as '<name>.fasta'.
        seq, _ = readFastq(reference)
        tf_fas = '{0}.fasta'.format(tf.name)
        with open(tf_fas, 'wt') as fout:
            for n, s in seq.items():
                fout.write('>{0}\n{1}\n'.format(n, s))
        repeats = mask_tandem(tf_fas) + mask_crispr(tf_fas, tf.name)
        os.unlink(tf_fas)

    alignments = alignAgainst([
        prefix + '.' + ref_tag.rsplit('.', 1)[0] + '.0',
        aligner,
        prefix + '.mmi',
        [ref_tag, reference],
        [ref_tag, reference],
    ])
    with uopen(alignments[1], 'a') as fout:
        for r in repeats:
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="repetitive_regions"\n'.format(
                r[0], r[1], r[2],
            ))
    return alignments
def runBlast(self, ref, qry):
    """Run BLASTn of the queries against the reference; return a DataFrame.

    The database is built from a FASTA dump of ``self.refSeq`` when
    ``self.refQual`` is not ``None``, otherwise directly from ``ref``.
    Queries are sorted by decreasing length and dealt round-robin into at
    most ``self.n_thread`` files, each processed by ``poolBlast`` through
    ``self.pool.map``.  Every file reported by a worker is read with
    ``pd.read_msgpack``, the tables are stacked, column 14 is converted to
    nested lists and the worker files are deleted.

    NOTE(review): ``pd.read_msgpack`` is not available in recent pandas
    releases; the on-disk format is decided by ``poolBlast``, so it has to
    be changed there as well.
    """
    logger('Run BLASTn starts')
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    refDb = os.path.join(self.dirPath, 'refNA')
    if self.refQual is None:
        # No quality strings: index the input file as it is.
        refNA = ref
    else:
        refNA = refDb
        with open(refNA, 'w') as ref_out:
            for name, seq in self.refSeq.items():
                ref_out.write('>{0}\n{1}\n'.format(name, seq))
    db_cmd = '{makeblastdb} -dbtype nucl -in {refNA} -out {refDb}'.format(
        makeblastdb=makeblastdb, refNA=refNA, refDb=refDb)
    Popen(db_cmd.split(), stderr=PIPE, stdout=PIPE,
          universal_newlines=True).communicate()

    by_length = sorted(self.qrySeq.items(), key=lambda rec: -len(rec[1]))
    n_chunk = min(len(by_length), self.n_thread)
    chunk_files = [os.path.join(self.dirPath, 'qryNA.{0}'.format(idx))
                   for idx in range(n_chunk)]
    for idx, chunk_file in enumerate(chunk_files):
        with open(chunk_file, 'w') as qry_out:
            for name, seq in by_length[idx::self.n_thread]:
                qry_out.write('>{0}\n{1}\n'.format(name, seq))

    jobs = [[blastn, refDb, chunk_file, self.min_id, self.min_cov, self.min_ratio]
            for chunk_file in chunk_files]
    saved_files = [saved for saved in self.pool.map(poolBlast, jobs)
                   if saved is not None]

    stacked = np.vstack([pd.read_msgpack(saved).values for saved in saved_files])
    blastab = pd.DataFrame(stacked)
    # Turn the per-row sequences of pairs into plain nested lists.
    blastab[14] = [[list(op) for op in ops] for ops in blastab[14].tolist()]
    for saved in saved_files:
        os.unlink(saved)

    logger('Run BLASTn finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def run_lastal(refdb, query, output, lastal):
    """Run ``lastal`` on ``query`` against ``refdb`` and save stdout to ``output``.

    If the first run exits with a non-zero status, the query is rewritten to
    ``output + '.qry'`` in FASTQ layout with the quality characters
    ``!"#$%&'`` replaced by ``(`` and lastal is run again with ``-Q1``; the
    temporary file is removed afterwards.

    Parameters:
        refdb: database argument passed to lastal.
        query: query file passed to lastal (and to ``readFastq`` on retry).
        output: path of the file receiving lastal's standard output.
        lastal: lastal executable.

    Returns:
        ``output``.
    """
    cmd = '{0} -j4 -r1 -q2 -a7 -b1 {1} {2}'.format(lastal, refdb, query)
    lastal_run = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
                                  universal_newlines=True)
    with open(output, 'w') as fout:
        fout.write(lastal_run.communicate()[0])

    if lastal_run.returncode != 0:
        # presumably readFastq returns {name: (sequence, quality)} in this
        # module -- verify against its definition.
        fastq = readFastq(query)
        tmp_query = output + '.qry'
        try:
            with open(tmp_query, 'w') as fout:
                for n, (s, q) in fastq.items():
                    fout.write('@{0}\n{1}\n+\n{2}\n'.format(
                        n, s, re.sub(r'[!"#$%&\']', '(', q)))
            cmd = '{0} -Q1 -j4 -r1 -q2 -a7 -b1 {1} {2}'.format(
                lastal, refdb, tmp_query)
            # Text mode is required here as well: the output file is opened
            # in text mode, and writing bytes to it fails on Python 3.
            lastal_run = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
                                          universal_newlines=True)
            with open(output, 'w') as fout:
                fout.write(lastal_run.communicate()[0])
        finally:
            if os.path.exists(tmp_query):
                os.unlink(tmp_query)
    return output
def getMatrix(prefix, reference, alignments, core, matrixOut, alignmentOut):
    """Combine per-genome mapping results into a variant matrix and/or alignment.

    Parameters:
        prefix: path prefix of the output files.
        reference: reference file, read with ``readFastq``.
        alignments: sequence of ``(tag, file)`` pairs; each is passed to
            ``readMap`` (through the module-level ``pool``), which is
            expected to return ``(presences, absences, mutations)``.
        core: fraction; a site is kept when its presence count is at least
            ``len(alignments) * core``.
        matrixOut: when true, write ``<prefix>.matrix.gz``.
        alignmentOut: when true, write ``<prefix>.fasta.gz``.

    Returns:
        dict with the keys ``'matrix'`` and/or ``'alignment'`` mapped to the
        written file names.

    Per site, ``matrix[site]`` holds two rows (single-character calls and
    longer calls), each either empty or one entry per alignment:
    ``'-'`` = not covered, ``'.'`` = covered without a call, otherwise the
    call itself.
    """
    refSeq, refQual = readFastq(reference)
    coreSites = {n: np.zeros(len(refSeq[n]), dtype=int) for n in refSeq}
    matSites = {n: np.zeros(len(refSeq[n]), dtype=int) for n in refSeq}
    alnId = {aln[0]: idx for idx, aln in enumerate(alignments)}
    n_aln = len(alnId)
    res = pool.map(readMap, alignments)

    # 1. Register every reported mutation.
    matrix = {}
    for presences, absences, mutations in res:
        for mut in mutations:
            j = alnId[mut[0]]
            site = tuple(mut[1:3])
            if site not in matrix:
                matrix[site] = [[], []]
                # Remember the 1-based coordinate of sites that carry calls.
                matSites[mut[1]][mut[2] - 1] = mut[2]
            row = 0 if len(mut[4]) == 1 else 1
            if len(matrix[site][row]) == 0:
                matrix[site][row] = ['-'] * n_aln
            matrix[site][row][j] = mut[4]

    # 2. Count coverage and mark covered/uncovered entries.
    for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res):
        j = alnId[mTag]
        for n, s, e in presences:
            coreSites[n][s - 1:e] += 1
            called = matSites[n][s - 1:e]
            for kk in called[called > 0]:
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '-':
                    matrix[k][0][j] = '.'
                if len(matrix[k][1]) and matrix[k][1][j] == '-':
                    matrix[k][1][j] = '.'
        for n, s, e, m in absences:
            coreSites[n][s - 1:e] -= 1
            called = matSites[n][s - 1:e]
            for kk in called[called > 0]:
                k = (n, kk)
                if len(matrix[k][0]) and matrix[k][0][j] == '.':
                    matrix[k][0][j] = '-'
                if len(matrix[k][1]) and matrix[k][1][j] == '.':
                    matrix[k][1][j] = '-'

    # 3. Report the coverage histogram on stderr.
    pres = np.unique(np.concatenate(list(coreSites.values())), return_counts=True)
    pres = [pres[0][pres[0] > 0], pres[1][pres[0] > 0]]
    coreNum = len(alignments) * core
    for p, n in zip(*pres):
        sys.stderr.write('#{2} {0} {1}\n'.format(p, n, '' if p > coreNum else '#'))

    # 4. Apply the core threshold.
    missings = []
    coreBases = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    for n in sorted(coreSites):
        sites = coreSites[n]
        for site, num in enumerate(sites):
            cSite = (n, site + 1)
            if num < coreNum and cSite in matrix and len(matrix[cSite][1]) > 0:
                # Recount presence from the second row.  The row is a plain
                # list, so the entries must be compared one by one; comparing
                # the list itself to '-' is always True and yielded 1.
                num = sum(1 for call in matrix[cSite][1] if call != '-')
                matrix[cSite][0] = []
            if num < coreNum:
                matrix.pop(cSite, None)
                # Extend the current missing region or open a new one.
                if len(missings) == 0 or missings[-1][0] != n or missings[-1][2] + 1 < cSite[1]:
                    missings.append([n, cSite[1], cSite[1]])
                else:
                    missings[-1][2] = cSite[1]
            else:
                b = refSeq[n][cSite[1] - 1]
                if cSite in matrix and len(matrix[cSite][0]):
                    matrix[cSite][0] = [(b if s == '.' else s) for s in matrix[cSite][0]]
                else:
                    coreBases[b] = coreBases.get(b, 0) + 1

    outputs = {}
    if matrixOut:
        outputs['matrix'] = prefix + '.matrix.gz'
        with uopen(prefix + '.matrix.gz', 'w') as fout:
            fout.write('## Constant_bases: {A} {C} {G} {T}\n'.format(**coreBases))
            for n in refSeq:
                fout.write('## Sequence_length: {0} {1}\n'.format(n, len(refSeq[n])))
            for region in missings:
                fout.write('## Missing_region: {0} {1} {2}\n'.format(*region))
            fout.write('\t'.join(['#Seq', '#Site'] + [mTag for mTag, mFile in alignments]) + '\n')
            for site in sorted(matrix):
                bases = matrix[site]
                if len(bases[0]):
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[0])))
                if len(bases[1]):
                    fout.write('{0}\t{1}\t{2}\n'.format(site[0], site[1], '\t'.join(bases[1])))

    if alignmentOut:
        outputs['alignment'] = prefix + '.fasta.gz'
        sequences = []
        for (mTag, mFile), (presences, absences, mutations) in zip(alignments, res):
            j = alnId[mTag]
            if j > 0:
                # Start from gaps and copy the reference over covered regions.
                seq = {n: ['-'] * len(s) for n, s in refSeq.items()}
                for n, s, e in presences:
                    seq[n][s - 1:e] = refSeq[n][s - 1:e]
                for n, s, e, c in absences:
                    seq[n][s - 1:e] = '-' * (e - s + 1)
            else:
                # The first alignment starts from the full reference.
                seq = {n: list(s) for n, s in refSeq.items()}
            for site in matrix:
                bases = matrix[site]
                if len(bases[0]):
                    seq[site[0]][site[1] - 1] = bases[0][j]
            sequences.append(seq)
        with uopen(prefix + '.fasta.gz', 'w') as fout:
            for idx, n in enumerate(sorted(refSeq)):
                if idx:
                    fout.write('=\n')
                for (mTag, mFile), seq in zip(alignments, sequences):
                    fout.write('>{0}:{1}\n{2}\n'.format(mTag, n, ''.join(seq[n])))
    return outputs
def alignAgainst(data):
    """Align one query to the reference and write the result as gzipped GFF.

    Parameters:
        data: ``[prefix, minimap2, db, (rtag, reference), (tag, query)]``.

    Returns:
        ``[tag, query]`` when the query cannot be read, otherwise
        ``[tag, prefix + '.gff.gz']``.

    Working layout of one alignment record (tab-separated aligner output,
    rewritten in place): 0 query name, 2-3 query start/end, 4 strand,
    5 reference name, 7-8 reference start/end, 9-10 integer columns used for
    the length filter and the score ratio, 11 score parsed from the second
    tag, 12 input line number (record id), 13 score / column 10,
    14 mutations, 15 repeats on the reference, 16 repeats on the query,
    last column the CIGAR tag.
    """
    prefix, minimap2, db, (rtag, reference), (tag, query) = data
    try:
        qrySeq, qryQual = readFastq(query)
    except Exception:
        # Unreadable query: hand the input back untouched (best effort).
        return [tag, query]
    refSeq, refQual = readFastq(reference)

    proc = subprocess.Popen('{0} -c -t1 --frag=yes -A2 -B8 -O20,40 -E3,2 -r20 -g200 -p.000001 -N5000 -f1000,5000 -n2 -m30 -s30 -z200 -2K10m --heap-sort=yes --secondary=yes {1} {2}'.format(
        minimap2, db, query).split(), stdout=subprocess.PIPE, universal_newlines=True)
    alignments = []
    for lineId, line in enumerate(proc.stdout):
        part = line.strip().split('\t')
        part[1:4] = [int(p) for p in part[1:4]]
        part[6:11] = [int(p) for p in part[6:11]]
        part[11] = float(part[13][5:])
        part[12], part[13] = lineId, part[11] / part[10]
        part[14:17] = [[], [], []]
        alignments.append(part)
    proc.wait()

    # ---- 1. find alignments dominated by a better overlapping alignment ----
    # First pass on query coordinates, second on reference coordinates; a
    # record is dominated when >= 90% of it is overlapped by a record whose
    # normalised score is better by the given margin.
    deleteChain = {}
    nItem = len(alignments)
    for name, start, end, margin in ((0, 2, 3, 0.1), (5, 7, 8, 0.05)):
        alignments.sort(key=lambda x: x[name:name + 4])
        for i1, p1 in enumerate(alignments):
            for i2 in xrange(i1 + 1, nItem):
                p2 = alignments[i2]
                if p1[name] != p2[name]:
                    break
                s, e = max(p1[start], p2[start]), min(p1[end], p2[end])
                if s > e + 10:
                    break
                if (e - s) >= 0.9 * (p1[end] - p1[start]) and p2[13] - margin >= p1[13]:
                    deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
                if (e - s) >= 0.9 * (p2[end] - p2[start]) and p1[13] - margin >= p2[13]:
                    deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])

    # Best score first: a record is dropped only if one of its dominators
    # has itself survived.
    deleted = {}
    for p in sorted(alignments, key=lambda x: x[11], reverse=True):
        rec_id = p[12]
        if rec_id in deleteChain:
            for jd in deleteChain[rec_id]:
                if jd not in deleted:
                    deleted[rec_id] = 1
                    break
    alignments = [p for p in alignments if p[12] not in deleted]

    # ---- 2. overlaps between surviving alignments are repeats ----
    # Query-side overlaps go to column 16, reference-side ones to column 15.
    nItem = len(alignments)
    for name, start, end, store in ((0, 2, 3, 16), (5, 7, 8, 15)):
        alignments.sort(key=lambda x: x[name:name + 4])
        for i1, p1 in enumerate(alignments):
            for i2 in xrange(i1 + 1, nItem):
                p2 = alignments[i2]
                if p1[name] != p2[name]:
                    break
                s, e = max(p1[start], p2[start]), min(p1[end], p2[end])
                if e > s:
                    p1[store].append([s, e])
                    p2[store].append([s, e])
                else:
                    break

    # ---- 3. walk every CIGAR: collect mutations, masks, projected repeats ----
    maskedRegion = {}
    refRepeat = []
    for p in alignments:
        # prepare a unique set of repeat region
        qryRepeat = []
        if len(p[16]) > 0:
            qryRepeat.append(p[16][0])
            for pp in p[16][1:]:
                if pp[0] > qryRepeat[-1][1] + 20:
                    qryRepeat.append(pp)
                elif pp[1] > qryRepeat[-1][1]:
                    qryRepeat[-1][1] = pp[1]

        ref = [refSeq[p[5]], refQual[p[5]]]
        qry = [qrySeq[p[0]], qryQual[p[0]]]
        cigar = p[-1][5:]
        d = 1 if p[4] == '+' else -1
        # Order the repeats so that the next one to be reached is last.
        if d < 0:
            qryRepeat = [[q[1], q[0], -1, -1] for q in qryRepeat]
        else:
            qryRepeat = [[q[0], q[1], -1, -1] for q in reversed(qryRepeat)]

        def query_segment(pos, length):
            # Bases and qualities of `length` query positions starting at
            # `pos`, in alignment orientation.  On the reverse strand the
            # bases are reverse-complemented and the qualities reversed;
            # the qualities must come from qry[1], not from the bases.
            if d > 0:
                return qry[0][pos:pos + length], qry[1][pos:pos + length]
            lo, hi = pos - length + 1, pos + 1
            return rc(qry[0][lo:hi]), ''.join(reversed(qry[1][lo:hi]))

        mut = []
        # [reference position, query position]; on the reverse strand the
        # query is walked downwards from its last aligned base.
        alnSite = [p[7], p[2] if d > 0 else p[3] - 1]
        for cl, ct in re.findall(r'(\d+)([MID])', cigar):
            cl = int(cl)
            if ct == 'M':
                # extract aligned sequences
                r = ref[0][alnSite[0]:alnSite[0] + cl]
                r1 = ref[1][alnSite[0]:alnSite[0] + cl]
                q, q1 = query_segment(alnSite[1], cl)
                e = [alnSite[0] + cl, alnSite[1] + cl * d]
                # Project query repeats reached in this block onto the reference.
                for qid in xrange(len(qryRepeat) - 1, -1, -1):
                    qr = qryRepeat[qid]
                    if d * qr[0] <= d * e[1]:
                        if qr[2] == -1:
                            qr[2] = alnSite[0] + d * (qr[0] - alnSite[1])
                        if d * qr[1] <= d * e[1]:
                            qr[3] = alnSite[0] + d * (qr[1] - alnSite[1])
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else:
                        break
                columns = np.array([list(r), list(r1), list(q), list(q1)]).T
                for offset, (rr, rr1, qq, qq1) in enumerate(columns):
                    # Quality characters below chr(43) mark uncertain bases.
                    if ord(rr1) < 43 or ord(qq1) < 43:
                        maskedRegion[(p[5], alnSite[0] + offset)] = 0
                    if rr != qq and rr != 'N' and qq != 'N':
                        mut.append([alnSite[0] + offset, alnSite[1] + offset * d, rr, qq, p[4]])
                alnSite = e
            elif ct == 'I':
                # The strand test for the inserted bases was inverted here
                # (d < 0) with respect to its quality slice and to the 'M'
                # branch.
                q, q1 = query_segment(alnSite[1], cl)
                e = alnSite[1] + cl * d
                for qid in xrange(len(qryRepeat) - 1, -1, -1):
                    qr = qryRepeat[qid]
                    if d * qr[0] <= d * e:
                        if qr[2] == -1:
                            qr[2] = alnSite[0]
                        if d * qr[1] <= d * e:
                            qr[3] = alnSite[0]
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else:
                        break
                if ord(min(list(q1))) >= 43:
                    mut.append([alnSite[0], min(alnSite[1], e), '.', '+' + q, p[4]])
                for site in xrange(alnSite[0], alnSite[0] + 2):
                    maskedRegion[(p[5], site)] = 0
                alnSite[1] = e
            elif ct == 'D':
                r = ref[0][alnSite[0]:alnSite[0] + cl]
                r1 = ref[1][alnSite[0]:alnSite[0] + cl]
                if ord(min(list(r1))) >= 43:
                    mut.append([alnSite[0], int(alnSite[1] + 0.5 * d), '.', '-' + r, p[4]])
                for site in xrange(alnSite[0], alnSite[0] + 2):
                    maskedRegion[(p[5], site)] = 0
                alnSite[0] += cl
        p[14] = mut
        refRepeat.extend([[p[5], pp[0], pp[1]] for pp in p[15]])

    # ---- 4. merge reference repeats closer than 20 bases; mask them as 1 ----
    repeats = []
    if len(refRepeat):
        refRepeat.sort()
        repeats = [refRepeat[0]]
        for p in refRepeat[1:]:
            if p[0] != repeats[-1][0] or p[1] - 20 > repeats[-1][2]:
                repeats.append(p)
            elif p[2] > repeats[-1][2]:
                repeats[-1][2] = p[2]
    for p in repeats:
        for site in xrange(p[1], p[2]):
            maskedRegion[(p[0], site)] = 1

    # Collapse all masked sites (both kinds) into contiguous runs.
    repeats = []
    for cont, site in sorted(maskedRegion):
        if len(repeats) == 0 or repeats[-1][0] != cont or repeats[-1][2] + 1 < site:
            repeats.append([cont, site, site])
        else:
            repeats[-1][2] = site

    # ---- 5. keep mutations outside masked regions ----
    mutations = []
    alignments = [aln for aln in alignments if aln[9] >= 100]
    for aln in alignments:
        for m in aln[14]:
            if len(m[3]) == 1:
                # Substitutions are dropped at any masked site.
                if (aln[5], m[0]) not in maskedRegion:
                    mutations.append([aln[5], aln[0]] + m)
            elif maskedRegion.get((aln[5], m[0]), 0) != 1:
                # Indels are dropped inside repeats, and deletions also when
                # they end at a masked site with a positive mask value.
                if m[3].startswith('-') and maskedRegion.get((aln[5], m[0] + len(m[3]) - 2), 0) > 0:
                    continue
                mutations.append([aln[5], aln[0]] + m)

    # ---- 6. write the GFF ----
    with uopen(prefix + '.gff.gz', 'w') as fout:
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))

        for aln in alignments:
            if aln[5] == aln[0] and aln[2] == aln[7] and aln[3] == aln[8]:
                template = '{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Self%20Alignments"\n'
            else:
                template = '{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Aligned%20with%20{5}:{6}-{7}"\n'
            fout.write(template.format(
                aln[5], aln[7] + 1, aln[8], aln[9], aln[4], aln[0], aln[2] + 1, aln[3],
            ))

        for p in repeats:
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="Uncertain%20base%20calling%20or%20ambigious%20alignment"\n'.format(
                p[0], p[1] + 1, p[2] + 1,
            ))

        for mut in mutations:
            # End coordinates: deletions span the reference, insertions the query.
            e1 = mut[2] if not mut[5].startswith('-') else mut[2] + len(mut[5]) - 2
            e2 = mut[3] if not mut[5].startswith('+') else mut[3] + len(mut[5]) - 2
            if len(mut[5]) > 26:
                # Abbreviate long indels as "<sign>[<n>bps]".
                mut[5] = '{0}[{1}bps]'.format(mut[5][0], len(mut[5]) - 1)
            fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t/replace="{7}";/compare="{3}:{4}-{5}:{8}";/origin="{6}"\n'.format(
                mut[0], mut[2] + 1, e1 + 1, mut[1], mut[3] + 1, e2 + 1, mut[4], mut[5], mut[6]
            ))
    return [tag, prefix + '.gff.gz']
def runMMseq(self, ref, qry):
    """Search translated queries against the reference with MMseqs.

    Databases are created for ``ref`` and ``qry``, the query database is
    translated, and ``mmseqs search`` is attempted up to nine times (the
    temporary directory is wiped before every attempt; from the fourth
    attempt on a second search against extracted ORFs is tried as well).
    The result is exported with ``convertalis`` and parsed into a table.

    Returns:
        DataFrame with 15 positional columns, filtered by ``self.min_id``,
        ``self.min_cov`` and ``self.min_ratio``.
    """
    logger('Run MMSeqs starts')

    def parseMMSeq(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        # Convert the tab-separated convertalis output to the table layout;
        # the first pair of coordinates and the CIGAR lengths are scaled by 3.
        blastab = pd.read_csv(fin, sep='\t', header=None)
        blastab = blastab[blastab[2] >= min_id]
        qlen = blastab[0].apply(lambda r: len(qryseq[r]))
        rlen = blastab[1].apply(lambda r: len(refseq[r]))
        cigar = blastab[14].apply(
            lambda x: [[int(n) * 3, t] for n, t in re.findall(r'(\d+)([A-Z])', x)])
        ref_sites = pd.concat([3 * (blastab[6] - 1) + 1, 3 * blastab[7]],
                              keys=[0, 1], axis=1)
        # Amount by which the scaled end runs past the sequence length.
        d = ref_sites[1] - qlen
        d[d < 0] = 0

        # Trim the last CIGAR operation by the overshoot.  A plain loop is
        # used on purpose: np.vectorize probes the function on the first
        # element to learn the output type, which applied the in-place
        # subtraction twice to the first row (and fails on empty input).
        for ops, overshoot in zip(cigar, d):
            ops[-1][0] -= overshoot

        ref_sites[1] -= d
        direction = (blastab[8] < blastab[9])
        qry_sites = pd.concat([blastab[8], blastab[9] - d], axis=1)
        qry_sites[~direction] = pd.concat([blastab[8] - d, blastab[9]],
                                          axis=1)[~direction]
        blastab = pd.DataFrame(
            np.hstack([
                blastab[[0, 1, 2]],
                np.apply_along_axis(lambda x: x[1] - x[0] + 1, 1,
                                    ref_sites.values)[:, np.newaxis],
                pd.DataFrame(np.zeros([blastab.shape[0], 2], dtype=int)),
                ref_sites,
                qry_sites,
                blastab[[10, 11]],
                # .values: two-dimensional indexing of a Series is rejected
                # by current pandas; the resulting column vectors are the same.
                qlen.values[:, np.newaxis],
                rlen.values[:, np.newaxis],
                cigar.values[:, np.newaxis],
            ]))
        return blastab[(blastab[3] >= min_cov) &
                       (blastab[3] >= blastab[12] * min_ratio)]

    tmpDir = os.path.join(self.dirPath, 'tmp')
    refNA = os.path.join(self.dirPath, 'refNA')
    qryNA = os.path.join(self.dirPath, 'qryNA')
    refCDS = os.path.join(self.dirPath, 'refCDS')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch2')

    Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
        mmseqs, ref, refNA).split(), stdout=PIPE).communicate()
    Popen('{0} createdb {1} {2} --dont-split-seq-by-len'.format(
        mmseqs, qry, qryNA).split(), stdout=PIPE).communicate()
    Popen('{0} translatenucs {1} {2}'.format(mmseqs, qryNA, qryAA).split(),
          stdout=PIPE).communicate()

    for ite in range(9):
        if os.path.isdir(tmpDir):
            shutil.rmtree(tmpDir)
        p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(
            mmseqs, qryAA, refNA, aaMatch, tmpDir, self.n_thread, self.min_id,
            self.min_ratio).split(), stdout=PIPE)
        p.communicate()
        if p.returncode == 0:
            break
        if ite > 2:
            # Repeated failures: fall back to searching the extracted ORFs.
            Popen('{0} extractorfs {2} {3}'.format(
                mmseqs, qryAA, refNA, refCDS).split(), stdout=PIPE).communicate()
            p = Popen('{0} search {1} {2} {3} {4} -a --alt-ali 30 -s 6 --translation-table 11 --threads {5} --min-seq-id {6} -e 10 --cov-mode 2 -c {7}'.format(
                mmseqs, qryAA, refCDS, aaMatch, tmpDir, self.n_thread,
                self.min_id, self.min_ratio).split(), stdout=PIPE)
            p.communicate()
            if p.returncode == 0:
                break
        time.sleep(1)

    Popen('{0} convertalis {1} {2} {3} {3}.tab --threads {4} --format-output'.format(
        mmseqs, qryAA, refNA, aaMatch, self.n_thread).split() +
        ['query,target,pident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,raw,qlen,tlen,cigar'],
        stdout=PIPE).communicate()

    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    # Close the result file deterministically instead of leaking the handle.
    with open(aaMatch + '.tab') as fin:
        blastab = parseMMSeq(fin, self.refSeq, self.qrySeq, self.min_id,
                             self.min_cov, self.min_ratio)
    logger('Run MMSeqs finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def runUBlast(self, ref, qry, nhits=6, frames='7'):
    """Search translated queries against translated references with usearch.

    Queries are translated in the forward frames (the frame with the fewest
    'X'-separated pieces is kept); references are translated according to
    ``frames`` and searched in four interleaved batches.

    Parameters:
        ref, qry: files read with ``readFastq`` when the cached sequences
            on ``self`` are empty.
        nhits: value passed to ``-maxhits``.
        frames: frame selector forwarded to ``transeq`` for the reference.

    Returns:
        DataFrame of the hits of all batches (columns 0-14); empty with the
        same columns when no batch produced output.
    """
    logger('Run uBLAST starts')
    # The reference records are dealt out with stride 4.  The loop used to
    # run five times, so the fifth batch re-searched part of the first one
    # and its hits were reported twice.
    n_chunk = 4

    def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        # Convert usearch's -userout table: amino-acid coordinates become
        # nucleotide coordinates using the frame encoded after the last ':'
        # of the sequence labels.
        blastab = pd.read_csv(fin, sep='\t', header=None)
        blastab[2] /= 100.
        blastab = blastab[blastab[2] >= min_id]
        blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3

        qf = blastab[0].str.rsplit(':', n=1, expand=True)
        rf = blastab[1].str.rsplit(':', n=1, expand=True)
        if np.all(qf[0].str.isdigit()):
            qf[0] = qf[0].astype(int)
        if np.all(rf[0].str.isdigit()):
            rf[0] = rf[0].astype(int)
        blastab[0], qf = qf[0], qf[1].astype(int)
        blastab[1], rf = rf[0], rf[1].astype(int)

        blastab[6], blastab[7] = blastab[6] * 3 + qf - 3, blastab[7] * 3 + qf - 1
        blastab[14] = [[[3 * vv[0], vv[1]] for vv in v]
                       for v in map(getCIGAR, zip(blastab[15], blastab[14]))]
        blastab[12], blastab[13] = (
            blastab[0].apply(lambda x: len(qryseq[str(x)])),
            blastab[1].apply(lambda x: len(refseq[str(x)])))

        # Frames 1-3 are handled as forward, larger frame numbers are
        # mirrored against the sequence length (column 13).
        rf3 = (rf <= 3)
        blastab.loc[rf3, 8], blastab.loc[rf3, 9] = (
            blastab.loc[rf3, 8] * 3 + rf[rf3] - 3,
            blastab.loc[rf3, 9] * 3 + rf[rf3] - 1)
        blastab.loc[~rf3, 8], blastab.loc[~rf3, 9] = (
            blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 - 3) + 1,
            blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 9] * 3 + rf[~rf3] - 3 - 1) + 1)

        # Overshoot of the alignment end beyond either sequence.
        d = np.max([
            blastab[7] - blastab[12],
            blastab[9] - blastab[13],
            1 - blastab[9],
            np.zeros(blastab.shape[0], dtype=int),
        ], axis=0)
        blastab[7] -= d

        # Trim the last CIGAR operation.  np.vectorize would call the
        # mutating function twice on the first row (type probe) and raises
        # on empty input, hence the explicit loop.
        for ops, overshoot in zip(blastab[14], d):
            ops[-1][0] -= overshoot

        d[~rf3] *= -1
        blastab[9] -= d
        blastab = blastab[
            (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12]) &
            (blastab[7] - blastab[6] + 1 >= min_cov)]
        return blastab.drop(columns=[15, 16])

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')

    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    qryAASeq = transeq(self.qrySeq, frame='F')
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            _, frame, s = min([(len(s[:-1].split('X')), frame, s)
                               for frame, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, frame + 1, s))

    refAASeq = transeq(self.refSeq, frames)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for frame, s in enumerate(ss):
            toWrite.append('>{0}:{1}\n{2}\n'.format(n, frame + 1, s))

    blastab = []
    for chunk in xrange(n_chunk):
        with open(refAA, 'w') as fout:
            for line in toWrite[chunk::n_chunk]:
                fout.write(line)
        ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
            usearch=usearch,
            refAA=refAA,
            qryAA=qryAA,
            aaMatch=aaMatch,
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio)
        Popen(ublast_cmd.split(), stderr=PIPE, stdout=PIPE,
              universal_newlines=True).communicate()
        if os.path.getsize(aaMatch) > 0:
            with open(aaMatch) as fin:
                blastab.append(parseUBlast(fin, self.refSeq, self.qrySeq,
                                           self.min_id, self.min_cov,
                                           self.min_ratio))

    # pd.concat raises on an empty list; report "no hits" as an empty table.
    if blastab:
        blastab = pd.concat(blastab)
    else:
        blastab = pd.DataFrame(columns=list(range(15)))
    logger('Run uBLAST finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def runDiamond(self, ref, qry, nhits=10, frames='7'):
    """Align translated queries against translated references with DIAMOND.

    Queries are translated in the forward frames (the frame with the fewest
    'X'-separated pieces is kept).  Reference translations are cut into
    pieces (at an 'X' after at least 1000 residues) and searched in five
    interleaved batches; each batch output is parsed and removed.

    Parameters:
        ref, qry: files read with ``readFastq`` when the cached sequences
            on ``self`` are empty.
        nhits: value passed to diamond's ``-k`` option.
        frames: frame selector forwarded to ``transeq`` for the reference.

    Returns:
        DataFrame with 15 positional columns; empty with the same columns
        when no batch yielded a hit.
    """
    logger('Run diamond starts')
    n_chunk = 5  # number of interleaved reference batches

    def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        # Parse diamond's "--outfmt 101" output.  Returns None when no
        # record passes the filters.
        blastab = []
        for line in fin:
            if line.startswith('@'):
                continue
            part = line.strip().split('\t')
            if part[2] == '*':
                continue
            qn, qf = part[0].rsplit(':', 1)
            rn, rf, rx = part[2].rsplit(':', 2)
            # rx is the offset of the reference piece within its frame.
            rs = int(part[3]) + int(rx)
            ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
            qm = len(part[9])
            if qm * 3 < min_cov:
                continue
            cov_ratio = qm * 3. / ql
            if cov_ratio < min_ratio:
                continue
            cigar = [[int(n) * 3, t]
                     for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
            cl = np.sum([c[0] for c in cigar])
            # Tags are read from their usual column, else searched by name.
            if part[12].startswith('NM:'):
                variation = float(part[12][5:]) * 3
            else:
                variation = float(re.findall(r'NM:i:(\d+)', line)[0]) * 3
            iden = 1 - round(variation / cl, 3)
            if iden < min_id:
                continue
            qf, rf = int(qf), int(rf)
            if part[18].startswith('ZS:'):
                qs = int(part[18][5:])
            else:
                qs = int(re.findall(r'ZS:i:(\d+)', line)[0])
            rm = int(np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)

            # Amino-acid to nucleotide coordinates; frames above 3 are
            # mirrored against the sequence length.
            if rf <= 3:
                rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
            else:
                rs, r_e = (rl - (rs * 3 + rf - 6) + 1,
                           rl - ((rs + rm - 1) * 3 + rf - 4) + 1)
            if qf <= 3:
                qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
            else:
                qs, qe = (ql - (qs * 3 + qf - 6) + 1,
                          ql - ((qs + qm - 1) * 3 + qf - 4) + 1)
                # Report the query ascending: swap both ends and flip the CIGAR.
                qs, qe, rs, r_e = qe, qs, r_e, rs
                cigar = list(reversed(cigar))

            cd = [c[0] for c in cigar if c[1] != 'M']
            if part[14].startswith('ZR:'):
                score = int(part[14][5:])
            else:
                score = int(re.findall(r'ZR:i:(\d+)', line)[0])
            blastab.append([
                qn, rn, iden, cl,
                int(variation - sum(cd)),
                len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
            ])
        if not blastab:
            # The output always carries '@' header lines, so a non-empty
            # file may still hold no hit; an empty frame has no columns 0/1
            # to convert.
            return None
        blastab = pd.DataFrame(blastab)
        blastab[[0, 1]] = blastab[[0, 1]].astype(str)
        return blastab

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')

    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            _, frame, s = min([(len(s[:-1].split('X')), frame, s)
                               for frame, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, frame + 1, s))

    diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
        diamond=diamond, qryAA=qryAA)
    Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE,
          universal_newlines=True).communicate()

    refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for frame, s in enumerate(ss):
            # A sentinel 'X' guarantees a final match; it is stripped again.
            cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
            cdss[-1] = cdss[-1][:-1]
            cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
            for ci, cs in zip(cdsi, cdss):
                if len(cs):
                    toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                        n, frame + 1, ci, cs))

    blastab = []
    for chunk in xrange(n_chunk):
        with open(refAA, 'w') as fout:
            for line in toWrite[chunk::n_chunk]:
                fout.write(line)
        diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
            diamond=diamond,
            refAA=refAA,
            qryAA=qryAA,
            aaMatch=aaMatch,
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio * 100.)
        Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE,
              universal_newlines=True).communicate()
        if os.path.getsize(aaMatch) > 0:
            with open(aaMatch) as fin:
                tab = parseDiamond(fin, self.refSeq, self.qrySeq, self.min_id,
                                   self.min_cov, self.min_ratio)
            os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)

    # pd.concat raises on an empty list; report "no hits" as an empty table.
    if blastab:
        blastab = pd.concat(blastab)
    else:
        blastab = pd.DataFrame(columns=list(range(15)))
    logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab