def runDiamond(self, ref, qry, nhits=10, frames='7') :
    """Search the translated query against the translated reference with diamond.

    The query is translated with ``frame='F'``; for each sequence the
    translation that splits into the fewest 'X'-delimited segments is written
    to ``<dirPath>/qryAA`` ('X' presumably marks a stop codon in ``transeq``
    output -- confirm).  The reference is translated in ``frames``, every
    translation is cut into pieces by the regex below, and the pieces are
    dealt round-robin into five files ``<dirPath>/refAA.0`` .. ``refAA.4``.
    ``diamond blastp`` runs once per file, writing ``<dirPath>/aaMatch.<i>``,
    and the five outputs are handed to ``parseDiamond`` through ``self.pool``.

    Parameters
    ----------
    ref, qry : passed to ``readFastq`` only when ``self.refSeq`` /
        ``self.qrySeq`` are still empty.
    nhits : value given to diamond's ``-k`` option.
    frames : frame selector forwarded to ``transeq`` for the reference.

    Returns
    -------
    The ``np.vstack`` of the arrays loaded from the files whose names
    ``parseDiamond`` returned (those files are deleted after loading).
    """
    logger('Run diamond starts')

    ref_prefix = os.path.join(self.dirPath, 'refAA')
    qry_path = os.path.join(self.dirPath, 'qryAA')
    match_prefix = os.path.join(self.dirPath, 'aaMatch')

    # Sequences are only read when the instance has none cached yet.
    if not self.qrySeq :
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq :
        self.refSeq, self.refQual = readFastq(ref)

    qry_proteins = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qry_path, 'w') as fout :
        for name, translations in sorted(qry_proteins.items()) :
            # Fewest 'X'-delimited segments wins; ties go to the lowest index.
            _, best, protein = min(
                (len(aa[:-1].split('X')), idx, aa) for idx, aa in enumerate(translations))
            fout.write('>{0}:{1}\n{2}\n'.format(name, best+1, protein))

    makedb_cmd = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
        diamond=diamond, qryAA=qry_path)
    Popen(makedb_cmd.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()

    ref_proteins = transeq(self.refSeq, frames, transl_table=self.table_id)
    records = []
    for name, translations in sorted(ref_proteins.items()) :
        for idx, protein in enumerate(translations) :
            # A sentinel 'X' is appended so the last piece is matched too,
            # then stripped again from that last piece.
            pieces = re.findall('.{1000,}?X|.{1,1000}$', protein + 'X')
            pieces[-1] = pieces[-1][:-1]
            # Offset of every piece inside its translation.
            offsets = np.cumsum([0] + [len(piece) for piece in pieces[:-1]])
            records.extend(
                '>{0}:{1}:{2}\n{3}\n'.format(name, idx+1, offset, piece)
                for offset, piece in zip(offsets, pieces) if piece)

    for chunk in xrange(5) :
        chunk_ref = '{0}.{1}'.format(ref_prefix, chunk)
        with open(chunk_ref, 'w') as fout :
            fout.writelines(records[chunk::5])
        diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
            diamond=diamond,
            refAA=chunk_ref,
            qryAA=qry_path,
            aaMatch='{0}.{1}'.format(match_prefix, chunk),
            n_thread=self.n_thread,
            min_id=self.min_id*100.,
            nhits=nhits,
            min_ratio=self.min_ratio*100.)
        Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

    jobs = [
        ['{0}.{1}'.format(match_prefix, chunk), self.refSeq, self.qrySeq, self.min_id, self.min_cov, self.min_ratio]
        for chunk in xrange(5)
    ]
    tables = []
    for saved in self.pool.imap_unordered(parseDiamond, jobs) :
        if saved is None :
            continue
        tables.append(np.load(saved, allow_pickle=True))
        os.unlink(saved)
    blastab = np.vstack(tables)
    logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def get_allele_info(allele_file):
    """Return per-locus allele statistics for a FASTA file, cached on disk.

    If ``<allele_file>.stat`` exists it is loaded as JSON and returned
    without reading the FASTA at all.  Otherwise the statistics are computed,
    written to that file, and returned.

    Each allele name is split at its last underscore into ``locus`` and
    ``allele_id``; the value stored under ``result[locus][allele_id]`` is
    ``int(allele_id) * 1000000 + len(sequence) * 10 + pseudo`` where
    ``pseudo`` is:

    * 2 -- length not a multiple of three (frameshift)
    * 3 -- an 'X' before the last residue of the translation (premature)
    * 4 -- sequence does not start with ATG, GTG or TTG (no start)
    * 5 -- translation does not end with 'X' (no stop)
    * 6 -- none of the above (intact)

    Args:
        allele_file: Path of the allele FASTA file.

    Returns:
        dict mapping locus -> {allele_id -> encoded integer}.

    Raises:
        ValueError: if an allele name has no underscore or a non-integer
            suffix (from the tuple unpacking / ``int`` call).
    """
    stat_file = allele_file + '.stat'
    if os.path.isfile(stat_file):
        # Context managers make sure the handles are closed (the original
        # left both the read and the write handle to the garbage collector).
        with open(stat_file) as fin:
            return json.load(fin)

    alleles = readFasta(allele_file)
    allele_aa = transeq(alleles)
    allele_stat = {}
    for n, s in alleles.items():
        locus, allele_id = n.rsplit('_', 1)
        if locus not in allele_stat:
            allele_stat[locus] = {}
        if len(s) % 3 > 0:
            pseudo = 2  # frameshift
        else:
            # Translations are looked up under '<name>_1'; a missing entry
            # falls back to 'A', which has neither an internal nor a final 'X'.
            aa = allele_aa.get(n + '_1', 'A')
            if aa[:-1].find('X') >= 0:
                pseudo = 3  # premature
            elif s[:3] not in ('ATG', 'GTG', 'TTG'):
                pseudo = 4  # no start
            elif aa[-1] != 'X':
                pseudo = 5  # no stop
            else:
                pseudo = 6  # intact
        allele_stat[locus][allele_id] = int(allele_id) * 1000000 + len(s) * 10 + pseudo

    with open(stat_file, 'w') as fout:
        json.dump(allele_stat, fout)
    return allele_stat
def write_refsets(self, reference):
    """Translate the reference sequences and write those without an internal 'X'.

    The output file is ``<unique_key>.refset.aa`` where ``unique_key`` comes
    from the module-level ``parameters``.  Only the first element of every
    value returned by ``self.readSequence`` is translated (presumably the
    nucleotide sequence -- confirm against ``readSequence``).

    Args:
        reference: Passed unchanged to ``self.readSequence``.

    Returns:
        The path of the amino-acid FASTA file that was written.
    """
    ref_aa = '{0}.refset.aa'.format(parameters['unique_key'])
    refseq = self.readSequence(reference)
    # .items() instead of the Python-2-only .iteritems(): same iteration on
    # Python 2, and no AttributeError on Python 3.
    refamino = transeq({n: s[0] for n, s in refseq.items()}, 1)
    with open(ref_aa, 'w') as fout:
        for n, s in refamino.items():
            # The last residue is excluded from the check, so a trailing 'X'
            # does not disqualify a translation.
            if 'X' not in s[:-1]:
                fout.write('>{0}\n{1}\n'.format(n, s))
    return ref_aa
def MLSTdb():
    """Build nucleotide and amino-acid reference sets from allele FASTA files.

    Command-line arguments of the form ``key=value`` overwrite existing
    entries of the module-level ``parameters``; every other argument is
    appended to ``parameters['fasta']``.

    Allele names are split at their last underscore into locus and allele id.
    For each locus the first allele is written to ``<prefix>.refset`` and
    formatted as a nucleotide database, the remaining alleles are aligned to
    it with blastn, and the alleles whose alignment passes the end-to-end
    test below are clustered with ``mmseq_cluster``; the cluster output is
    appended to ``<prefix>.refset.fna``.  Finally the collected references
    are translated and those without an 'X' before the last residue are
    written to ``<prefix>.refset.faa``.

    ``<prefix>.refset`` and ``<prefix>.alleles`` are scratch files that are
    overwritten for every locus.

    Returns:
        Tuple ``(<prefix>.refset.fna, <prefix>.refset.faa)``.
    """
    for arg in sys.argv[1:]:
        if '=' in arg:
            k, v = arg.split('=', 1)
            if k in parameters:
                parameters[k] = v
        else:
            parameters['fasta'].append(arg)

    prefix = parameters['prefix']
    refset_fna = '{0}.refset.fna'.format(prefix)
    refset_faa = '{0}.refset.faa'.format(prefix)
    refset_file = '{0}.refset'.format(prefix)
    allele_path = '{0}.alleles'.format(prefix)

    alleles = readFastaToList(parameters['fasta'])
    loci = {allele_id.rsplit('_', 1)[0]: [] for allele_id, seq in alleles}
    for allele_id, seq in alleles:
        locus, allele_no = allele_id.rsplit('_', 1)
        loci[locus].append([allele_no, seq])
    del alleles

    with open(refset_fna, 'w') as refout:
        # .items() instead of the Python-2-only .iteritems().
        for locus, locus_alleles in loci.items():
            with open(refset_file, 'w') as fout:
                allele_no, seq = locus_alleles[0]
                fout.write('>{0}\n{1}\n'.format(allele_no, seq))
            with open(allele_path, 'w') as fout:
                for allele_no, seq in locus_alleles[1:]:
                    fout.write('>{0}\n{1}\n'.format(allele_no, seq))

            format_cmd = '{formatdb} -dbtype nucl -in {prefix}.refset'.format(
                **parameters)
            Popen(format_cmd.split(), stderr=PIPE, stdout=PIPE).communicate()

            blast_cmd = '{blast} -db {prefix}.refset -query {prefix}.alleles -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue score qlen slen" -num_threads 6 -task blastn -evalue 1e-3 -dbsize 5000000 -reward 2 -penalty -2 -gapopen 6 -gapextend 2'.format(
                **parameters)
            blast = Popen(blast_cmd, stderr=PIPE, stdout=PIPE, shell=True)
            # communicate() drains both pipes and reaps the process; reading
            # only stdout while stderr is piped can block once the stderr
            # buffer fills, and the original never waited on the child.
            blast_out, _ = blast.communicate()

            # The reference allele itself is always kept.
            ids = {locus_alleles[0][0]: 1}
            for line in blast_out.splitlines():
                # Column order follows the -outfmt list above:
                # 6 qstart, 7 qend, 8 sstart, 9 send, 12 qlen, 13 slen.
                hit = np.array(line.strip().split(), dtype=float)
                # Keep hits that begin at the same position within the first
                # ten bases of both sequences and leave the same unaligned
                # tail (< 10 bases) on both.
                if hit[6] - 1 < 10 and hit[6] == hit[8] and hit[12] - hit[7] < 10 and hit[13] - hit[9] == hit[12] - hit[7]:
                    ids[str(int(hit[0]))] = 1

            with open(allele_path, 'w') as fout:
                for allele_no, seq in locus_alleles:
                    if allele_no in ids:
                        fout.write('>{0}_{1}\n{2}\n'.format(locus, allele_no, seq))

            outfile = mmseq_cluster(prefix, prefix + '.alleles', parameters['id'])
            with open(outfile) as fin:
                for line in fin:
                    refout.write(line)

    refseq = readFastaToList(refset_fna)
    ref_aa = transeq(dict(refseq))
    with open(refset_faa, 'w') as fout:
        for n, s in ref_aa.items():
            if s[:-1].find('X') < 0:
                fout.write('>{0}\n{1}\n'.format(n, s))
    return refset_fna, refset_faa
def write_query(self, query):
    """Write the query as nucleotide FASTA and as translated FASTA.

    File names are ``<unique_key>.query.na`` and ``<unique_key>.query.aa``,
    with ``unique_key`` taken from the module-level ``parameters``.  Only the
    first element of each value returned by ``self.readSequence`` is written
    and translated (presumably the nucleotide sequence -- confirm against
    ``readSequence``).

    Args:
        query: Passed unchanged to ``self.readSequence``.

    Returns:
        Tuple ``(qryseq, fna, faa)``: the object returned by
        ``readSequence`` and the two file paths.
    """
    unique_key = parameters['unique_key']
    fna = '{0}.query.na'.format(unique_key)
    faa = '{0}.query.aa'.format(unique_key)

    qryseq = self.readSequence(query)
    # .items() instead of the Python-2-only .iteritems(): same iteration on
    # Python 2, and no AttributeError on Python 3.
    qryamino = transeq({n: s[0] for n, s in qryseq.items()}, frame=7)

    with open(fna, 'w') as fout:
        for n, s in qryseq.items():
            fout.write('>{0}\n{1}\n'.format(n, s[0]))
    with open(faa, 'w') as fout:
        for n, s in qryamino.items():
            fout.write('>{0}\n{1}\n'.format(n, s))
    return qryseq, fna, faa
def checkCDS(n, s):
    """Tell whether sequence *s* (named *n*) is acceptable as a CDS.

    Checks run in this order and the first failure is logged and returns
    False:

    1. length below ``params['min_cds']``;
    2. (if ``params['incompleteCDS']`` is truthy, accept here);
    3. length not a multiple of three;
    4. translation does not start with 'M';
    5. translation does not end with 'X';
    6. translation contains an 'X' before its last residue.

    The translation is frame 1 of the upper-cased sequence with
    ``transl_table='starts'``.

    Returns:
        True when the sequence passes, False otherwise.
    """
    def reject(template):
        # Log the reason with the sequence name filled in, then refuse.
        logger(template.format(n))
        return False

    if len(s) < params['min_cds']:
        return reject('{0} is too short')
    if params['incompleteCDS']:
        return True
    if len(s) % 3 != 0:
        return reject('{0} is discarded due to frameshifts')

    protein = transeq({'n': s.upper()}, frame=1, transl_table='starts')['n'][0]
    if protein[0] != 'M':
        return reject('{0} is discarded due to lack of start codon')
    if protein[-1] != 'X':
        return reject('{0} is discarded due to lack of stop codon')
    if 'X' in protein[:-1]:
        return reject('{0} is discarded due to internal stop codons')
    return True
def get_allele_info(alleles):
    """Classify every allele and return ``[length, pseudo]`` per locus and id.

    Each allele name is split at its last underscore into ``locus`` and
    ``allele_id``.  ``pseudo`` is:

    * 2 -- length not a multiple of three (frameshift)
    * 3 -- an 'X' before the last residue of the translation (premature)
    * 4 -- sequence does not start with ATG, GTG or TTG (no start)
    * 5 -- translation does not end with 'X' (no stop)
    * 6 -- none of the above (intact)

    Args:
        alleles: Mapping from allele name to nucleotide sequence.

    Returns:
        dict mapping locus -> {allele_id -> [len(sequence), pseudo]}.

    Raises:
        ValueError: if an allele name contains no underscore (from the
            tuple unpacking of ``rsplit``).
    """
    allele_aa = transeq(alleles)
    allele_stat = {}
    # .items() instead of the Python-2-only .iteritems(): same iteration on
    # Python 2, and no AttributeError on Python 3.
    for n, s in alleles.items():
        locus, allele_id = n.rsplit('_', 1)
        if len(s) % 3 > 0:
            pseudo = 2  # frameshift
        else:
            # Translations are looked up under '<name>_1'; a missing entry
            # falls back to 'A', which has neither an internal nor a final 'X'.
            aa = allele_aa.get(n + '_1', 'A')
            if aa[:-1].find('X') >= 0:
                pseudo = 3  # premature
            elif s[:3] not in ('ATG', 'GTG', 'TTG'):
                pseudo = 4  # no start
            elif aa[-1] != 'X':
                pseudo = 5  # no stop
            else:
                pseudo = 6  # intact
        allele_stat.setdefault(locus, {})[allele_id] = [len(s), pseudo]
    return allele_stat
def runUBlast(self, ref, qry, nhits=6, frames='7'):
    """Search the translated query against the translated reference with usearch.

    The query is translated with ``frame='F'``; for each sequence the
    translation that splits into the fewest 'X'-delimited segments is written
    to ``<dirPath>/qryAA``.  The reference is translated in ``frames`` and its
    records are dealt round-robin into five chunks.  Each chunk is written to
    ``<dirPath>/refAA`` in turn, ``usearch -ublast`` is run on it, and a
    non-empty ``<dirPath>/aaMatch`` is parsed by ``parseUBlast``.

    Args:
        ref, qry: Passed to ``readFastq`` only when ``self.refSeq`` /
            ``self.qrySeq`` are still empty.
        nhits: Value given to usearch's ``-maxhits`` option.
        frames: Frame selector forwarded to ``transeq`` for the reference.

    Returns:
        ``pd.concat`` of the parsed tables of all chunks.

    Raises:
        ValueError: from ``pd.concat`` when no chunk produced any output.
    """
    logger('Run uBLAST starts')

    def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        """Turn a usearch ``-userout`` table into nucleotide coordinates.

        Column numbers follow the ``-userfields`` order used below:
        0 query, 1 target, 2 id, 3 alnlen, 4 mism, 5 opens, 6 qlo, 7 qhi,
        8 tlo, 9 thi, 10 evalue, 11 raw, 12 ql, 13 tl, 14 qrow, 15 trow,
        16 qstrand.  Columns 12/13 are overwritten with the lengths looked up
        in ``qryseq``/``refseq`` and 14 with the tripled ``getCIGAR`` output;
        15 and 16 are dropped from the result.
        """
        blastab = pd.read_csv(fin, sep='\t', header=None)
        blastab[2] /= 100.
        blastab = blastab[blastab[2] >= min_id]
        # Amino-acid counts become nucleotide counts.
        blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3
        # Names were written as '<name>:<frame>'.  ``n`` is given by keyword
        # because pandas 2 no longer accepts it positionally.
        qf = blastab[0].str.rsplit(':', n=1, expand=True)
        rf = blastab[1].str.rsplit(':', n=1, expand=True)
        if np.all(qf[0].str.isdigit()):
            qf[0] = qf[0].astype(int)
        if np.all(rf[0].str.isdigit()):
            rf[0] = rf[0].astype(int)
        blastab[0], qf = qf[0], qf[1].astype(int)
        blastab[1], rf = rf[0], rf[1].astype(int)
        blastab[6], blastab[7] = blastab[6] * 3 + qf - 3, blastab[7] * 3 + qf - 1
        blastab[14] = [[[3 * vv[0], vv[1]] for vv in v]
                       for v in map(getCIGAR, zip(blastab[15], blastab[14]))]
        blastab[12], blastab[13] = (
            blastab[0].apply(lambda x: len(qryseq[str(x)])),
            blastab[1].apply(lambda x: len(refseq[str(x)])))
        # Frames 1-3 are converted directly; frames above 3 are measured from
        # the other end of the reference (column 13 holds its length).
        rf3 = (rf <= 3)
        blastab.loc[rf3, 8], blastab.loc[rf3, 9] = (
            blastab.loc[rf3, 8] * 3 + rf[rf3] - 3,
            blastab.loc[rf3, 9] * 3 + rf[rf3] - 1)
        blastab.loc[~rf3, 8], blastab.loc[~rf3, 9] = (
            blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 - 3) + 1,
            blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 9] * 3 + rf[~rf3] - 3 - 1) + 1)
        # d is how far an alignment end overshoots a sequence boundary
        # (never negative); the ends and the last CIGAR element are trimmed
        # by that amount.
        d = np.max([
            blastab[7] - blastab[12],
            blastab[9] - blastab[13],
            1 - blastab[9],
            np.zeros(blastab.shape[0], dtype=int)
        ], axis=0)
        blastab[7] -= d

        def ending(x, y):
            x[-1][0] -= y

        np.vectorize(ending)(blastab[14], d)
        d[~rf3] *= -1
        blastab[9] -= d
        blastab = blastab[
            (blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12]) &
            (blastab[7] - blastab[6] + 1 >= min_cov)]
        return blastab.drop(columns=[15, 16])

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')

    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    qryAASeq = transeq(self.qrySeq, frame='F')
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            # Fewest 'X'-delimited segments wins; ties go to the lowest index.
            _, best, s = min([(len(aa[:-1].split('X')), idx, aa)
                              for idx, aa in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, best + 1, s))

    refAASeq = transeq(self.refSeq, frames)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for idx, s in enumerate(ss):
            toWrite.append('>{0}:{1}\n{2}\n'.format(n, idx + 1, s))

    # The stride must equal the number of chunks.  The original sliced with
    # stride 4 while looping over 5 chunks, so chunk 4 repeated records that
    # were already searched in chunk 0 and produced duplicate alignments.
    n_chunk = 5
    blastab = []
    for chunk in xrange(n_chunk):
        with open(refAA, 'w') as fout:
            for line in toWrite[chunk::n_chunk]:
                fout.write(line)
        ublast_cmd = '{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} -mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} -userout {aaMatch} -ka_dbsize 5000000 -userfields query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand'.format(
            usearch=usearch,
            refAA=refAA,
            qryAA=qryAA,
            aaMatch=aaMatch,
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio)
        Popen(ublast_cmd.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()
        if os.path.getsize(aaMatch) > 0:
            # Close the result file once parsed instead of leaking the handle.
            with open(aaMatch) as fin:
                blastab.append(
                    parseUBlast(fin, self.refSeq, self.qrySeq, self.min_id, self.min_cov, self.min_ratio))
    blastab = pd.concat(blastab)
    logger('Run uBLAST finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def getClust(prefix, genes, params):
    """Cluster genes with mmseqs linclust and write exemplars plus a gene->exemplar table.

    Up to three rounds are run in a temporary ``NS_*`` directory created in
    the current directory (always removed afterwards).  Each round builds an
    mmseqs database from the current gene file, runs ``linclust`` and
    ``createtsv``, keeps the first FASTA record seen for every cluster, and
    feeds those kept records into the next round.  The rounds stop early when
    the number of kept clusters no longer shrinks.

    Args:
        prefix: Prefix of the two output files.
        genes: Path of the gene FASTA file.
        params: Mapping read for ``translate``, ``identity``, ``coverage``
            and ``n_thread``.  With a truthy ``translate`` the genes are
            translated (frame '1', ``transl_table='starts'``) before
            clustering and the exemplars are written back as the original
            sequences.

    Returns:
        Tuple ``(<prefix>.clust.exemplar, <prefix>.clust.tab)``.
    """
    groups = {}
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    try:
        translate = params['translate']
        if translate:
            na_seqs = readFasta(genes)
            aa_seqs = transeq(na_seqs, frame='1', transl_table='starts')
            geneFile = os.path.join(dirPath, 'seq.aa')
            with open(geneFile, 'w') as fout:
                for name, translations in aa_seqs:
                    fout.write('>{0}\n{1}\n'.format(name, translations[0]))
        else:
            geneFile = genes

        seqDb, tmpDb, lcDb, tabFile, refFile = [
            os.path.join(dirPath, leaf)
            for leaf in ('seq.db', 'tmp', 'seq.lc', 'clust.tab', 'seq.ref')
        ]

        nRef = 999999999999999
        for _round in xrange(3):
            # Start every round from a clean scratch directory and databases.
            if os.path.isdir(tmpDb):
                shutil.rmtree(tmpDb)
            os.makedirs(tmpDb)
            for stale_db in (seqDb, lcDb):
                if os.path.isfile(stale_db):
                    for leftover in glob.glob(stale_db + '*'):
                        os.unlink(leftover)

            createdb_cmd = '{0} createdb {2} {1} -v 0'.format(
                externals['mmseqs'], seqDb, geneFile)
            subprocess.Popen(createdb_cmd.split()).communicate()

            linclust_cmd = '{0} linclust {1} {2} {3} --min-seq-id {4} -c {5} --threads {6} -v 0'.format(
                externals['mmseqs'], seqDb, lcDb, tmpDb,
                params['identity'], params['coverage'], params['n_thread'])
            subprocess.Popen(linclust_cmd.split(), stdout=subprocess.PIPE).communicate()

            createtsv_cmd = '{0} createtsv {1} {1} {2} {3}'.format(
                externals['mmseqs'], seqDb, lcDb, tabFile)
            subprocess.Popen(createtsv_cmd.split(), stdout=subprocess.PIPE).communicate()

            # Second column is mapped to first column of the cluster table.
            with open(tabFile) as fin:
                for row in fin:
                    fields = row.strip().split()
                    groups[fields[1]] = fields[0]

            # Keep the records of the first sequence met for each cluster.
            # The None key makes sequences missing from the table be skipped.
            kept_lines = []
            used_grps = {None: 1}
            with open(geneFile) as fin:
                keep = False
                for row in fin:
                    if row.startswith('>'):
                        name = row[1:].strip().split()[0]
                        grp = groups.get(name, None)
                        keep = grp not in used_grps
                        if keep:
                            used_grps[grp] = name
                    if keep:
                        kept_lines.append(row)

            # Point every gene at the sequence kept for its cluster.
            for gene, grp in groups.items():
                if grp in used_grps:
                    groups[gene] = used_grps[grp]

            with open(refFile, 'w') as fout:
                fout.writelines(kept_lines)

            if nRef <= len(used_grps):
                break
            nRef = len(used_grps)
            geneFile = refFile

        exemplar_file = '{0}.clust.exemplar'.format(prefix)
        if translate:
            rSeq = readFasta(refFile)
            na_seqs = dict(na_seqs)
            with open(exemplar_file, 'w') as fout:
                for name, _unused in rSeq:
                    fout.write('>{0}\n{1}\n'.format(name, na_seqs[name]))
        else:
            shutil.copy2(refFile, exemplar_file)
    finally:
        shutil.rmtree(dirPath)

    with open('{0}.clust.tab'.format(prefix), 'w') as fout:
        for gene, grp in sorted(groups.items()):
            # Follow the chain of representatives until one maps to itself.
            current = gene
            while current != grp:
                current, grp = grp, groups[grp]
            groups[gene] = grp
            fout.write('{0}\t{1}\n'.format(gene, grp))
    return '{0}.clust.exemplar'.format(prefix), '{0}.clust.tab'.format(prefix)
def write_output(prefix, prediction, genomes, clust_ref, old_prediction):
    """Write predicted alleles and a GFF annotation from a tabular prediction file.

    Two files are produced: ``<prefix>.allele.fna`` (allele sequences) and
    ``<prefix>.EToKi.gff`` (one line per retained prediction).

    Args:
        prefix: Prefix of both output files and of the GFF ``ID`` attribute.
        prediction: Tab-separated, header-less table read with
            ``pd.read_csv``.  Columns are used by position only; judging from
            the arithmetic below, 7/8 look like start/end on the reference
            gene, 9/10 start/end on the genome sequence, 12/13 the two
            lengths, 5 the key into ``genomes`` and 4 the grouping key --
            TODO confirm against whatever writes this table.
        genomes: Indexed as ``genomes[pred[5]][1]`` to obtain a sequence.
        clust_ref: Maps ``part[0]`` to the sequence written as allele 1.
        old_prediction: Maps ``pred[5]`` to a list of records read at
            indices 1, 2 (compared with ``start``/``stop``) and 3 (compared
            with the strand); the forward-only cursor in ``op`` presumably
            relies on that list being sorted -- verify against the caller.

    Returns:
        None.
    """
    predictions, alleles = {}, {}
    # NOTE(review): this handle is closed only by the close() call at the very
    # end, so an exception anywhere in between leaves it open.
    allele_file = open('{0}.allele.fna'.format(prefix), 'w')
    prediction = pd.read_csv(prediction, sep='\t', header=None).values
    for part in prediction:
        # The first time a gene is seen, its reference sequence becomes allele 1.
        if part[0] not in alleles:
            alleles[part[0]] = {clust_ref[part[0]]: 1}
            allele_file.write('>{0}_{1}\n{2}\n'.format(part[0], 1,
                                                       clust_ref[part[0]]))
        # l / r: room available for extending the match on either side;
        # d: +1 when column 9 < column 10, otherwise -1.
        if part[9] < part[10]:
            l, r, d = min(part[7] - 1, part[9] - 1), min(
                part[12] - part[8], part[13] - part[10]), 1
        else:
            l, r, d = min(part[7] - 1, part[13] - part[9]), min(
                part[12] - part[8], part[10] - 1), -1
        # Left end: extend to position 1 when at most 6 away, otherwise move
        # inwards until (part[7] - 1) is a multiple of three.
        if l <= 6 and part[7] - l == 1:
            part[7], part[9] = part[7] - l, part[9] - l * d
        else:
            ll = (part[7] - 1) % 3
            if ll > 0:
                part[7], part[9] = part[7] + 3 - ll, part[9] + (3 - ll) * d
        # Right end: same idea, measured against part[12].
        if r <= 6 and part[8] + r == part[12]:
            part[8], part[10] = part[8] + r, part[10] + r * d
        else:
            rr = (part[12] - part[8]) % 3
            if rr > 0:
                # NOTE(review): column 8 moves by (3 - rr) but column 10 by
                # (3 + rr); the left-hand branch above moves both columns by
                # the same (3 - ll).  Looks asymmetric -- confirm whether
                # (3 - rr) was intended.
                part[8], part[10] = part[8] - 3 + rr, part[10] - (3 + rr) * d
        # Order columns 9/10 ascending and record the strand in column 11.
        if part[9] < part[10]:
            part[9:12] = part[9], part[10], '+'
        else:
            part[9:12] = part[10], part[9], '-'

        if part[4] not in predictions:
            predictions[part[4]] = []
        elif predictions[part[4]][-1][2] == part[2]:
            # Same column-2 value as the previous record of this group: merge
            # the two when they are on the same sequence and less than 500
            # apart in both coordinate systems, otherwise flag both with -1.
            prev = predictions[part[4]][-1]
            if prev[5] == part[5] and part[7] - prev[8] < 500:
                if part[11] == '+' and part[9] - prev[10] < 500:
                    prev[8], prev[10] = part[8], part[10]
                    continue
                elif part[11] == '-' and prev[9] - part[10] < 500:
                    prev[8], prev[9] = part[8], part[9]
                    continue
            predictions[part[4]][-1][1], part[1] = -1, -1
        predictions[part[4]].append(part)

    # op = [current sequence name, cursor into its old records, old records].
    op = ['', 0, []]
    with open('{0}.EToKi.gff'.format(prefix), 'w') as fout:
        for gid, (g, predict) in enumerate(predictions.items()):
            predict.sort(key=itemgetter(5, 9, 10))
            for pid, pred in enumerate(predict):
                if pred[1] == -1 or (pred[10] - pred[9] + 1) <= 0.8 * pred[12]:
                    # Flagged above, or covering at most 80% of pred[12].
                    # NOTE(review): s and e are not assigned on this branch,
                    # yet the GFF line below formats them -- they would carry
                    # the previous prediction's values (or be undefined for
                    # the very first one).  Confirm intent.
                    cds, allele_id = 'fragment:{0:.2f}%'.format(
                        (pred[10] - pred[9] + 1) * 100 / pred[12]), 'uncertain'
                    start, stop = pred[9:11]
                else:
                    s, e = pred[9:11]
                    # Take the matched region plus flanks in multiples of
                    # three (up to 60 on one side and 600 on the other,
                    # swapped for the '-' strand); lp / rp are the flank
                    # lengths left / right of the match in reading direction.
                    if pred[11] == '+':
                        s2, e2 = s - min(int(3 * ((s - 1) / 3)), 60), e + min(
                            3 * int((pred[13] - e) / 3), 600)
                        seq = genomes[pred[5]][1][(s2 - 1):e2]
                        lp, rp = s - s2, e2 - e
                    else:
                        s2, e2 = s - min(int(3 * ((s - 1) / 3)), 600), e + min(
                            3 * int((pred[13] - e) / 3), 60)
                        seq = rc(genomes[pred[5]][1][(s2 - 1):e2])
                        rp, lp = s - s2, e2 - e
                    seq2 = seq[(lp):(len(seq) - rp)]
                    # New sequences get the next number; it is a plain integer
                    # only for a full-length match (7 == 1 and 8 == 12) with
                    # pred[3] == pred[0], otherwise it is prefixed 'LowQ'.
                    if seq2 not in alleles[pred[0]]:
                        if pred[3] == pred[0] and pred[7] == 1 and pred[
                                8] == pred[12]:
                            alleles[pred[0]][seq2] = len(alleles[pred[0]]) + 1
                        else:
                            alleles[pred[0]][seq2] = 'LowQ{0}'.format(
                                len(alleles[pred[0]]) + 1)
                        allele_id = str(alleles[pred[0]][seq2])
                        allele_file.write('>{0}_{1}\n{2}\n'.format(
                            pred[0], allele_id, seq2))
                    else:
                        allele_id = str(alleles[pred[0]][seq2])

                    # Try frame 0 and, when the length is not a multiple of
                    # three, the frame given by the remainder.
                    frames = sorted(set([0, len(seq) % 3]))
                    for frame, aa_seq in zip(
                            frames,
                            transeq({'n': seq},
                                    transl_table='starts',
                                    frame=','.join(
                                        [str(f + 1) for f in frames]))['n']):
                        cds = 'CDS'
                        # Prefer an 'M' within 30 residues after the match
                        # start, else the closest one before it.
                        s0, s1 = aa_seq.find('M', int(lp / 3),
                                             int(lp / 3 + 30)), aa_seq.rfind(
                                                 'M', 0, int(lp / 3))
                        start = s0 if s0 >= 0 else s1
                        if start < 0:
                            cds, start = 'nostart', int(lp / 3)
                        stop = aa_seq.find('X', start)
                        # An 'X' inside that same window: retry from the next
                        # 'M' after it, if there is one.
                        if 0 <= stop < lp / 3 + 30:
                            s0 = aa_seq.find('M', stop, int(lp / 3 + 30))
                            if s0 >= 0:
                                start = s0
                                stop = aa_seq.find('X', start)
                        if stop < 0:
                            cds = 'nostop'
                        elif (stop - start + 1) * 3 <= 0.8 * pred[12]:
                            cds = 'premature stop:{0:.2f}%'.format(
                                (stop - start + 1) * 300 / pred[12])
                        if cds == 'CDS':
                            # Convert residue indices back to coordinates on
                            # the genome sequence.
                            if pred[11] == '+':
                                start, stop = s2 + start * 3 + frame, s2 + stop * 3 + 2 + frame
                            else:
                                start, stop = e2 - stop * 3 - 2 - frame, e2 - start * 3 - frame
                            break
                        else:
                            # Fall back to the matched region itself.
                            start, stop = s, e
                            if frame > 0:
                                cds = 'frameshift'

                # Restart the cursor when moving on to another sequence.
                if pred[5] != op[0]:
                    op = [pred[5], 0, old_prediction.get(pred[5], [])]
                old_tag = []
                for k in xrange(op[1], len(op[2])):
                    opd = op[2][k]
                    if opd[2] < start:
                        op[1] = k + 1
                    elif opd[1] > stop:
                        break
                    elif opd[3] != pred[11]:
                        continue
                    # Report an old record when it overlaps by >= 300, or by
                    # >= 60% of either interval, and one of its ends is a
                    # multiple of three away from the matching new end.
                    ovl = min(opd[2], stop) - max(opd[1], start) + 1
                    if ovl >= 300 or ovl >= 0.6 * (
                            opd[2] - opd[1] + 1) or ovl >= 0.6 * (stop - start + 1):
                        frame = min((opd[1] - start) % 3, (opd[2] - stop) % 3)
                        if frame == 0:
                            old_tag.append('{0}:{1}-{2}'.format(*opd))

                fout.write(
                    '{0}\t{1}\tEToKi-ortho\t{2}\t{3}\t.\t{4}\t.\tID={5};{12}inference=ortholog group:{6},allele ID:{7},matched region:{8}-{9}{10}{11}\n'
                    .format(
                        pred[5],
                        'CDS' if cds == 'CDS' else 'pseudogene',
                        start,
                        stop,
                        pred[11],
                        '{0}_{1}_{2}'.format(prefix, gid, pid),
                        pred[0],
                        allele_id,
                        s,
                        e,
                        '' if pred[0] == pred[3] else
                        ',structure variant group:' + pred[3],
                        '' if cds == 'CDS' else ';pseudogene=' + cds,
                        '' if len(old_tag) == 0 else 'locus_tag={0};'.format(
                            ','.join(old_tag)),
                    ))
    allele_file.close()
    return
def runDiamond(self, ref, qry, nhits=10, frames='7'):
    """Search the translated query against the translated reference with diamond.

    The query is translated with ``frame='F'``; for each sequence the
    translation that splits into the fewest 'X'-delimited segments is written
    to ``<dirPath>/qryAA``.  The reference is translated in ``frames``, every
    translation is cut into pieces by the regex below, and the pieces are
    dealt round-robin into five chunks.  Each chunk is written to
    ``<dirPath>/refAA`` in turn, ``diamond blastp`` is run on it, and a
    non-empty ``<dirPath>/aaMatch`` is parsed by ``parseDiamond`` and then
    deleted.

    Args:
        ref, qry: Passed to ``readFastq`` only when ``self.refSeq`` /
            ``self.qrySeq`` are still empty.
        nhits: Value given to diamond's ``-k`` option.
        frames: Frame selector forwarded to ``transeq`` for the reference.

    Returns:
        ``pd.concat`` of the parsed tables of all chunks.

    Raises:
        ValueError: from ``pd.concat`` when no chunk yielded any alignment.
    """
    logger('Run diamond starts')

    def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        """Parse diamond ``--outfmt 101`` output into nucleotide coordinates.

        Lines starting with '@' and records whose third field is '*' are
        skipped (the layout looks like SAM -- confirm against the diamond
        manual).  Returns a DataFrame with one row per retained alignment,
        or None when no alignment passes the filters.
        """
        blastab = []
        for line in fin:
            if line.startswith('@'):
                continue
            part = line.strip().split('\t')
            if part[2] == '*':
                continue
            # Names were written as '<name>:<frame>' for the query and
            # '<name>:<frame>:<piece offset>' for the reference.
            qn, qf = part[0].rsplit(':', 1)
            rn, rf, rx = part[2].rsplit(':', 2)
            rs = int(part[3]) + int(rx)
            ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
            qm = len(part[9])
            if qm * 3 < min_cov:
                continue
            cov_ratio = qm * 3. / ql
            if cov_ratio < min_ratio:
                continue
            # CIGAR lengths are tripled (amino acids -> nucleotides).
            cigar = [[int(n) * 3, t]
                     for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
            cl = np.sum([c[0] for c in cigar])
            if part[12].startswith('NM:'):
                variation = float(part[12][5:]) * 3
            else:
                variation = float(re.findall(r'NM:i:(\d+)', line)[0]) * 3
            iden = 1 - round(variation / cl, 3)
            if iden < min_id:
                continue
            qf, rf = int(qf), int(rf)
            if part[18].startswith('ZS:'):
                qs = int(part[18][5:])
            else:
                qs = int(re.findall(r'ZS:i:(\d+)', line)[0])
            rm = int(
                np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
            # Frames 1-3 are converted directly; frames above 3 are measured
            # from the other end of the sequence.
            if rf <= 3:
                rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
            else:
                rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - (
                    (rs + rm - 1) * 3 + rf - 4) + 1
            if qf <= 3:
                qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
            else:
                qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - (
                    (qs + qm - 1) * 3 + qf - 4) + 1
                # Swap both coordinate pairs and reverse the CIGAR so the
                # query coordinates end up ascending.
                qs, qe, rs, r_e = qe, qs, r_e, rs
                cigar = list(reversed(cigar))
            cd = [c[0] for c in cigar if c[1] != 'M']
            if part[14].startswith('ZR:'):
                score = int(part[14][5:])
            else:
                score = int(re.findall(r'ZR:i:(\d+)', line)[0])
            blastab.append([
                qn, rn, iden, cl,
                int(variation - sum(cd)),
                len(cd), qs, qe, rs, r_e, 0.0, score, ql, rl, cigar
            ])
        if not blastab:
            # An empty DataFrame has no columns 0/1, so the cast below would
            # raise KeyError; the caller already skips a None result.
            return None
        blastab = pd.DataFrame(blastab)
        blastab[[0, 1]] = blastab[[0, 1]].astype(str)
        return blastab

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')

    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)

    qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            # Fewest 'X'-delimited segments wins; ties go to the lowest index.
            _, best, s = min([(len(aa[:-1].split('X')), idx, aa)
                              for idx, aa in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, best + 1, s))

    diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(
        diamond=diamond, qryAA=qryAA)
    Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()

    refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for idx, s in enumerate(ss):
            # A sentinel 'X' is appended so the last piece is matched too,
            # then stripped again from that last piece.
            cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
            cdss[-1] = cdss[-1][:-1]
            # Offset of every piece inside its translation.
            cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
            for ci, cs in zip(cdsi, cdss):
                if len(cs):
                    toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(
                        n, idx + 1, ci, cs))

    blastab = []
    for chunk in xrange(5):
        with open(refAA, 'w') as fout:
            for line in toWrite[chunk::5]:
                fout.write(line)
        diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
            diamond=diamond,
            refAA=refAA,
            qryAA=qryAA,
            aaMatch=aaMatch,
            n_thread=self.n_thread,
            min_id=self.min_id * 100.,
            nhits=nhits,
            min_ratio=self.min_ratio * 100.)
        Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
        if os.path.getsize(aaMatch) > 0:
            # Close the handle before the file is removed (the original
            # unlinked it while the leaked handle was still open).
            with open(aaMatch) as fin:
                tab = parseDiamond(fin, self.refSeq, self.qrySeq, self.min_id, self.min_cov, self.min_ratio)
            os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)
    blastab = pd.concat(blastab)
    logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab