def lookForORF(self, seq, rec) :
    coordinates, edges = rec['coordinates'], rec['flanking'][:]
    seq = self.get_seq(seq, *coordinates)
    #if (len(seq) - sum(edges)) % 3 == 0 :
    startCodon, stopCodon = 0, 0
    for s in xrange(edges[0] % 3, len(seq), 3) :
        c = seq[s:s+3]
        if c in {'ATG', 'TTG', 'GTG'} :
            startCodon, edges[0] = 1, 0
            e0 = s + 3
            new_s = coordinates[1] + s if coordinates[3] == '+' else coordinates[2] - s
            break
    if not startCodon :
        return 6
    for e in xrange(e0, len(seq), 3) :
        c = seq[e:e+3]
        if c in {'TAG', 'TAA', 'TGA'} :
            stopCodon, edges[1] = 1, 0
            new_e = coordinates[1] + e + 2 if coordinates[3] == '+' else coordinates[2] - e - 2
            break
    if startCodon and stopCodon and abs(e-s) + 1 >= 0.6 * (abs(coordinates[2]-coordinates[1]) + 1 - sum(rec['flanking'])) :
        c2 = (new_s, new_e) if coordinates[3] == '+' else (new_e, new_s)
        if c2[0] != coordinates[1] or c2[1] != coordinates[2] :
            rec['CIGAR'] = rec['CIGAR'].rsplit(':', 1)[0] + ':EXEMPT'
            coordinates[1:3] = c2
            rec['flanking'] = edges
        return 0
    return 6
def get_similar(bsn, ortho_pairs):
    key = tuple(sorted([bsn[0][0], bsn[0][1]]))
    if key in ortho_pairs:
        return
    matched_aa = {}
    len_aa = int(int(bsn[0][12]) / 3)
    for part in bsn:
        s_i, e_i, s_j, e_j = [int(x) for x in part[6:10]]
        for s, t in re.findall(r'(\d+)([A-Z])', part[14]):
            frame_i, frame_j = s_i % 3, s_j % 3
            s = int(s)
            if t == 'M':
                if frame_i == frame_j or params['incompleteCDS']:
                    matched_aa.update({(s_i + x): 1 for x in xrange((3 - (frame_i - 1)) % 3, s)})
                s_i += s
                s_j += s
                if len(matched_aa) * 3 >= min(params['match_len2'], params['match_len']) or \
                   len(matched_aa) >= (min(params['match_prop'], params['match_prop2']) - 0.1) * len_aa:
                    ortho_pairs[key] = 1
                    return
            elif t == 'I':
                s_i += s
            else:
                s_j += s
def do_polish_with_SNPs(self, reference, snp_file):
    sequence = readFasta(reference)
    snps = {n: [] for n in sequence}
    if snp_file != '':
        with open(snp_file) as fin:
            for line in fin:
                part = line.strip().split()
                snps[part[0]].append([int(part[1]), part[-1]])
    self.snps = snps
    for n, s in sequence.items():
        sequence[n] = list(s)
    for cont, sites in snps.items():
        # apply edits from the end of each contig backwards so that earlier
        # coordinates stay valid after insertions and deletions
        for site, base in reversed(sites):
            if base.startswith('+'):
                sequence[cont][site - 1:site - 1] = base[1:]
            elif base.startswith('-'):
                sequence[cont][site - 1:(site + len(base) - 2)] = []
            else:
                sequence[cont][site - 1] = base
    with open('{0}.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            s = ''.join(s)
            fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[site:(site + 100)] for site in xrange(0, len(s), 100)])))
    return '{0}.fasta'.format(prefix)
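# Minimal sketch (hypothetical helper with toy data, not used by the pipeline)
# of the SNP-application semantics above: sites are 1-based, '+XX' inserts XX
# before the site, '-XX' deletes len('XX') bases starting at the site, and any
# other value substitutes a single base.
def _demo_apply_snps():
    seq = list('ACGTACGT')
    for site, base in reversed([(2, 'T'), (4, '+GG'), (6, '-AC')]):
        if base.startswith('+'):
            seq[site - 1:site - 1] = base[1:]            # insertion
        elif base.startswith('-'):
            seq[site - 1:site + len(base) - 2] = []      # deletion of 2 bases
        else:
            seq[site - 1] = base                         # substitution
    assert ''.join(seq) == 'ATGGGTAT'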
def write_down(filename, regions, repeats, mutations, reference, query, tag) :
    with uopen(filename, 'w') as fout:
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))
        fout.write('\n'.join(['{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t{5}'.format(
            r[1], r[2], r[3], r[0], r[10],
            '/inference="Aligned%20with%20{0}:{1}-{2}"'.format(*r[7:10])) for r in regions]) + '\n')
        fout.write('\n'.join(['{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="{3}"'.format(
            r[0], r[1], r[2],
            'Repetitive%20region' if r[3] == 0 else 'Uncertain%20base%20calling%20or%20ambiguous%20alignment') for r in repeats]) + '\n')
        for contig, variation in sorted(mutations.items()):
            for site, alters in sorted(variation.items()) :
                for alter, source in alters.items() :
                    if source[6][0] == '-' :
                        difference = '+{0}'.format(source[7])
                        origin = '.'
                    elif source[7][0] == '-' :
                        difference = '-{0}'.format(source[6])
                        origin = source[6]
                    else :
                        difference = source[7]
                        origin = source[6]
                    compare = ''
                    for id in xrange(0, len(source), 9) :
                        compare += '{0}:{1}-{2}:{3};'.format(source[id+0], abs(source[id+4]), abs(source[id+5]), source[id+1])
                    fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t{3}\n'.format(
                        contig, source[2], source[3],
                        '/replace="{0}";/compare="{1}";/origin="{2}"'.format(difference, compare[:-1], origin)))
def inter_loci_overlap(self, alleles, parameters) :
    regions = [reg for region in alleles.values() for reg in region]
    # sort by contig name and start point
    regions.sort(key=itemgetter(2, 3))
    for id, regi in enumerate(regions) :
        if regi[0] == '' :
            continue
        todel, deleted = [], 0
        for jd in xrange(id+1, len(regions)) :
            regj = regions[jd]
            if regj[0] == '' or regi[0] == regj[0]:
                continue
            if regi[2] != regj[2] or regj[3] > regi[4] :
                break
            overlap = min(regi[4], regj[4]) - regj[3] + 1
            if (regi[-1] != '' and float(overlap) >= parameters['merging_prop'] * (regi[4]-regi[3]+1)) or \
               (regj[-1] != '' and float(overlap) >= parameters['merging_prop'] * (regj[4]-regj[3]+1)) :
                delta = regi[1] - regj[1]
                if delta > 0.05 :
                    todel.append(jd)
                elif delta <= -0.05 :
                    deleted = 1
                    break
        if deleted == 0 :
            for jd in todel:
                regions[jd][0] = ''
        else :
            regi[0] = ''
    return [{'locus': reg[0], 'identity': reg[1], 'CIGAR': reg[9],
             'coordinates': [reg[2], int(reg[3]), int(reg[4]), reg[5]],
             'flanking': reg[6:8], 'status': reg[8],
             'accepted': (0 if reg[8] == '' else 128)}
            for reg in regions if reg[0] != '' and reg[-1] != '']
def reScore(self, ref, qry, blastab, mode, perBatch=10000):
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)
    for k, v in self.qrySeq.items():
        self.qrySeq[k] = nucEncoder[np.array(list(v)).view(asc2int)]
    for k, v in self.refSeq.items():
        self.refSeq[k] = nucEncoder[np.array(list(v)).view(asc2int)]
    nTab = len(blastab)
    for bId in xrange(0, blastab.shape[0], perBatch):
        logger('Update scores: {0} / {1}'.format(bId, nTab))
        tabs = blastab[bId:bId + perBatch]
        scores = np.array(list(map(cigar2score, (
            [t[14],
             self.refSeq[str(t[1])][t[8]-1:t[9]] if t[8] < t[9] else 4 - self.refSeq[str(t[1])][t[9]-1:t[8]][::-1],
             self.qrySeq[str(t[0])][t[6]-1:t[7]],
             t[6], mode, 6, 1] for t in tabs))))
        tabs.T[2], tabs.T[11] = scores.T
    return blastab
def main(mgs):
    for mg in mgs:
        res = []
        with uopen(mg) as fin:
            for line in fin:
                logp = re.findall(r'logp:\t([-eE\d\.]+)', line)
                if len(logp):
                    logp = float(logp[0])
                    res.append([logp])
                else:
                    genotype = re.findall(
                        r'Genotype (\d+):\tMean proportion:\t([eE\d\.]+)\tCI95%:\t(\[ [eE\d\.]+ - [eE\d\.]+ \])', line)
                    if len(genotype):
                        res[-1].append([genotype[0][1], genotype[0][2], '', '', ''])
                    elif len(res) and len(res[-1]) > 1 and res[-1][-1][-1] == '':
                        part = line.strip().split('\t')
                        res[-1][-1][2:] = [part[0], part[1], part[3] + ' ' + part[5]]
        try:
            res = max(res)
            res[1:] = sorted(res[1:], key=lambda x: -float(x[0]))
            for i in xrange(1, len(res)):
                r = res[i]
                print('{0}\t{1}\t{2}'.format(mg, i, '\t'.join(r)))
        except:
            pass
def assignment3(dists, res, encode, presence):
    n_loci = mat.shape[1] - 1
    for id in xrange(len(dists)):
        idx, ref, _, s, ql = dists[id]
        gl = presence[encode[res[ref, 1:]], np.arange(1, res.shape[1])]
        gl[gl > ql] = ql
        d = (n_loci * (gl - s).astype(float) / gl + 0.5).astype(int)
        jd = np.argmax((d - np.arange(n_loci)) <= 0) + 1
        res[idx, jd:] = res[ref, jd:]
        gl = presence[encode[res[idx, jd:]], np.arange(jd, presence.shape[1])]
        presence[encode[res[idx, jd:]][gl < ql], np.arange(jd, presence.shape[1])[gl < ql]] = ql
def assignment(dists, res):
    for id in xrange(len(dists)):
        idx, ref, d1, d2 = dists[id]
        for d in xrange(d1, n_loci + 1):
            if res[idx, d] != res[ref, d]:
                if d >= res[idx, 0]:
                    if d >= d2:
                        if res[idx, d] < res[ref, d]:
                            grps = [res[idx, d], res[ref, d]]
                        else:
                            grps = [res[ref, d], res[idx, d]]
                        res[:idx, d][res[:idx, d] == grps[1]] = grps[0]
                        res[idx, d] = grps[0]
                    else:
                        if res[idx, d] < res[ref, d]:
                            res[:idx, d][res[:idx, d] == res[ref, d]] = res[idx, d]
                        else:
                            res[idx, d:] = res[ref, d:]
                            break
                else:
                    break
        if res[idx, 0] > d1:
            res[idx, 0] = d1
    return
def tab2overlaps(tabs, ovl_l, ovl_p, nTab, overlaps):
    ovlId = 0
    for i1 in xrange(overlaps[-1, 0], nTab):
        t1 = tabs[i1]
        ovl_l2 = min(ovl_l, ovl_p * (t1[3] - t1[2] + 1))
        if i1 > overlaps[-1, 0]:
            i2r = xrange(i1 + 1, nTab)
        else:
            i2r = xrange(overlaps[-1, 1], nTab)
        for i2 in i2r:
            t2 = tabs[i2]
            if t1[0] != t2[0] or t2[2] > t1[3]:
                break
            ovl = min(t1[3], t2[3]) - t2[2] + 1
            if ovl >= ovl_l2 or ovl >= ovl_p * (t2[3] - t2[2] + 1):
                overlaps[ovlId, :] = [t1[1], t2[1], ovl]
                ovlId += 1
                if ovlId == 1000000:
                    # buffer full: record the resume point in the sentinel row
                    overlaps[-1, :2] = [i1, i2]
                    break
        if ovlId == 1000000:
            break
    if ovlId < 1000000:
        overlaps[-1, :] = -1
    return overlaps
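# Minimal sketch (hypothetical toy arrays, illustration only) of the resume
# protocol used above: the last row of `overlaps` acts as a cursor. The caller
# seeds it with the first (i1, i2) pair to scan; when the 1,000,000-row buffer
# fills, the pair where scanning stopped is stored there, and a row of -1
# signals that all overlapping pairs have been emitted.
def _demo_tab2overlaps():
    import numpy as np
    tabs = np.array([[0, 0, 1, 100],      # columns: contig id, tab id, start, end
                     [0, 1, 50, 150],
                     [0, 2, 120, 200]])
    overlaps = np.zeros([1000001, 3], dtype=int)
    overlaps[-1, :2] = [0, 1]             # start from the first candidate pair
    out = tab2overlaps(tabs, 30, 0.5, len(tabs), overlaps)
    assert out[0].tolist() == [0, 1, 51]  # tabs 0 and 1 overlap by 51 bases
    assert (out[-1] == -1).all()          # under 1e6 hits: the scan completed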
def runDiamond(self, ref, qry, nhits=10, frames='7') :
    logger('Run diamond starts')
    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')
    if not self.qrySeq :
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq :
        self.refSeq, self.refQual = readFastq(ref)
    qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qryAA, 'w') as fout :
        for n, ss in sorted(qryAASeq.items()) :
            # keep the frame with the fewest internal stops
            _, id, s = min([(len(s[:-1].split('X')), id, s) for id, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, id+1, s))
    diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(diamond=diamond, qryAA=qryAA)
    p = Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()
    refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
    toWrite = []
    for n, ss in sorted(refAASeq.items()) :
        for id, s in enumerate(ss) :
            cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
            cdss[-1] = cdss[-1][:-1]
            cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
            for ci, cs in zip(cdsi, cdss) :
                if len(cs) :
                    toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(n, id+1, ci, cs))
    for id in xrange(5) :
        with open('{0}.{1}'.format(refAA, id), 'w') as fout :
            for line in toWrite[id::5] :
                fout.write(line)
        diamond_cmd = '{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} --query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} --evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101'.format(
            diamond=diamond, refAA='{0}.{1}'.format(refAA, id), qryAA=qryAA,
            aaMatch='{0}.{1}'.format(aaMatch, id), n_thread=self.n_thread,
            min_id=self.min_id*100., nhits=nhits, min_ratio=self.min_ratio*100.)
        Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
    blastab = []
    for r in self.pool.imap_unordered(parseDiamond, [
            ['{0}.{1}'.format(aaMatch, id), self.refSeq, self.qrySeq,
             self.min_id, self.min_cov, self.min_ratio] for id in xrange(5)]) :
        if r is not None :
            blastab.append(np.load(r, allow_pickle=True))
            os.unlink(r)
    blastab = np.vstack(blastab)
    logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def global_difference2(g):
    _, idx, cnt = np.unique(g.T[1], return_counts=True, return_index=True)
    idx = idx[cnt == 1]
    names, seqs = g[idx, 1], np.vstack(g[idx, 4])
    # mask anything that is not an unambiguous base
    seqs[np.in1d(seqs, [65, 67, 71, 84], invert=True).reshape(seqs.shape)] = 45
    diff = compare_seq(seqs, np.zeros(shape=[seqs.shape[0], seqs.shape[0], 2], dtype=np.uint8))
    res = {}
    for i, n1 in enumerate(names):
        for j in xrange(i + 1, len(names)):
            n2 = names[j]
            if diff[i, j, 1] >= min(params['match_len2'], seqs.shape[1] * params['match_prop2']):
                res[(n1, n2)] = diff[i, j, :]
    return res
def ovlFilter(self, blastab, params):
    coverage, delta = params[1:]
    logger('Run filtering. Start with {0} hits.'.format(len(blastab)))
    blastab[blastab.T[8] > blastab.T[9], 8:10] *= -1
    blastab = pd.DataFrame(blastab).sort_values(by=[1, 0, 8, 6]).values
    for i, t1 in enumerate(blastab):
        if t1[2] < 0:
            continue
        toDel = []
        for j in xrange(i + 1, blastab.shape[0]):
            t2 = blastab[j]
            if t2[2] < 0:
                continue
            if np.any(t1[:2] != t2[:2]) or t1[9] < t2[8]:
                break
            c = min(t1[9], t2[9]) - t2[8] + 1
            if c >= coverage * (t1[9] - t1[8] + 1) and t2[11] - t1[11] >= delta:
                t1[2] = -1.
                break
            elif c >= coverage * (t2[9] - t2[8] + 1) and t1[11] - t2[11] >= delta:
                toDel.append(j)
            elif c >= (t1[9] - t1[8] + 1) and c < coverage * (t2[9] - t2[8] + 1):
                c2 = min(t1[7], t2[7]) - max(t2[6], t1[6]) + 1
                if c2 >= (t1[7] - t1[6] + 1) and c2 < coverage * (t2[7] - t2[6] + 1):
                    t1[2] = -1.
                    break
            elif c >= (t2[9] - t2[8] + 1) and c < coverage * (t1[9] - t1[8] + 1):
                c2 = min(t1[7], t2[7]) - max(t2[6], t1[6]) + 1
                if c2 >= (t2[7] - t2[6] + 1) and c2 < coverage * (t1[7] - t1[6] + 1):
                    toDel.append(j)
        if t1[2] >= 0:
            for j in toDel:
                blastab[j][2] = -1.
    blastab = blastab[blastab.T[2] >= 0]
    blastab[blastab.T[8] < 0, 8:10] *= -1
    logger('Done filtering. End with {0} hits.'.format(blastab.shape[0]))
    return blastab
def global_difference(bsn_file, orthoGroup, counts=3000):
    groups = np.load(bsn_file)
    genes = []
    for gene, g in groups.items():
        _, idx, cnt = np.unique(g.T[1], return_counts=True, return_index=True)
        score = (np.sum(cnt == 1) - 1) * (2**41) - np.sum(g[idx[cnt == 1], 2], dtype=int)
        if score > 0:
            genes.append([score, gene])
    genes = sorted(genes, reverse=True)
    og = np.array(list(orthoGroup.keys()))
    grp_order, all_useds = [], set([])
    for score, gene in genes:
        tag = groups[gene][0][0]
        if tag not in all_useds:
            grp_order.append(gene)
            used = og[og.T[0] == tag, 1]
            all_useds |= set(used.tolist())
    genes = grp_order[:counts]
    global_differences = {}
    for iter in xrange(0, len(genes), 100):
        logger('finding ANIs between genomes. {0}/{1}'.format(iter, len(genes)))
        diffs = pool.map(global_difference2, [groups[i] for i in genes[iter:iter + 100]])
        for diff in diffs:
            for pair, (mut, aln) in diff.items():
                if pair not in global_differences:
                    global_differences[pair] = []
                if aln:
                    global_differences[pair].append(max(float(mut), .5) / aln)
    for pair, info in global_differences.items():
        diff = np.log(info)
        mean_diff = max(np.mean(diff), -4.605)
        sigma = min(max(np.sqrt(np.mean((diff - mean_diff)**2)) * 3, 0.693), 1.386)
        global_differences[pair] = (np.exp(mean_diff), np.exp(sigma))
    return pd.DataFrame(list(global_differences.items())).values
def __readAssembly(self, assembly):
    seq = {}
    with uopen(assembly) as fin:
        header = fin.read(1)
    with uopen(assembly) as fin:
        if header == '@':
            # fastq input: record length, depth, sequence and per-base quality
            for id, line in enumerate(fin):
                if id % 4 == 0:
                    part = line[1:].strip().split()
                    name = part[0]
                    seq[name] = [0, float(part[2]) if len(part) > 2 else 0., None, None]
                elif id % 4 == 1:
                    seq[name][2] = line.strip()
                    seq[name][0] = len(seq[name][2])
                elif id % 4 == 3:
                    seq[name][3] = np.array(list(line.strip()))
            fasfile = assembly.rsplit('.', 1)[0] + '.fasta'
            logger('Write fasta sequences into {0}'.format(fasfile))
            with open(fasfile, 'w') as fout:
                for n, s in sorted(seq.items()):
                    fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[2][site:(site + 100)] for site in xrange(0, len(s[2]), 100)])))
        else:
            fasfile = assembly
            for id, line in enumerate(fin):
                if line.startswith('>'):
                    name = line[1:].strip().split()[0]
                    seq[name] = [0, 0., []]
                else:
                    seq[name][2].extend(line.strip().split())
            for n, s in seq.items():
                s[2] = ''.join(s[2])
                s[0] = len(s[2])
    return seq, fasfile
def runUBlast(self, ref, qry, nhits=6, frames='7'):
    logger('Run uBLAST starts')

    def parseUBlast(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        blastab = pd.read_csv(fin, sep='\t', header=None)
        blastab[2] /= 100.
        blastab = blastab[blastab[2] >= min_id]
        blastab[3], blastab[4] = blastab[3] * 3, blastab[4] * 3
        qf = blastab[0].str.rsplit(':', 1, expand=True)
        rf = blastab[1].str.rsplit(':', 1, expand=True)
        if np.all(qf[0].str.isdigit()):
            qf[0] = qf[0].astype(int)
        if np.all(rf[0].str.isdigit()):
            rf[0] = rf[0].astype(int)
        blastab[0], qf = qf[0], qf[1].astype(int)
        blastab[1], rf = rf[0], rf[1].astype(int)
        # convert amino-acid coordinates back to nucleotide coordinates
        blastab[6], blastab[7] = blastab[6] * 3 + qf - 3, blastab[7] * 3 + qf - 1
        blastab[14] = [[[3 * vv[0], vv[1]] for vv in v]
                       for v in map(getCIGAR, zip(blastab[15], blastab[14]))]
        blastab[12] = blastab[0].apply(lambda x: len(qryseq[str(x)]))
        blastab[13] = blastab[1].apply(lambda x: len(refseq[str(x)]))
        rf3 = (rf <= 3)
        blastab.loc[rf3, 8] = blastab.loc[rf3, 8] * 3 + rf[rf3] - 3
        blastab.loc[rf3, 9] = blastab.loc[rf3, 9] * 3 + rf[rf3] - 1
        blastab.loc[~rf3, 8] = blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 8] * 3 + rf[~rf3] - 3 - 3) + 1
        blastab.loc[~rf3, 9] = blastab.loc[~rf3, 13] - (blastab.loc[~rf3, 9] * 3 + rf[~rf3] - 3 - 1) + 1
        d = np.max([blastab[7] - blastab[12], blastab[9] - blastab[13],
                    1 - blastab[9], np.zeros(blastab.shape[0], dtype=int)], axis=0)
        blastab[7] -= d

        def ending(x, y):
            x[-1][0] -= y
        np.vectorize(ending)(blastab[14], d)
        d[~rf3] *= -1
        blastab[9] -= d
        blastab = blastab[(blastab[7] - blastab[6] + 1 >= min_ratio * blastab[12]) &
                          (blastab[7] - blastab[6] + 1 >= min_cov)]
        return blastab.drop(columns=[15, 16])

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)
    qryAASeq = transeq(self.qrySeq, frame='F')
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            _, id, s = min([(len(s[:-1].split('X')), id, s) for id, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, id + 1, s))
    refAASeq = transeq(self.refSeq, frames)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for id, s in enumerate(ss):
            toWrite.append('>{0}:{1}\n{2}\n'.format(n, id + 1, s))
    blastab = []
    for id in xrange(5):
        with open(refAA, 'w') as fout:
            for line in toWrite[id::5]:
                fout.write(line)
        ublast_cmd = ('{usearch} -self -threads {n_thread} -db {refAA} -ublast {qryAA} '
                      '-mid {min_id} -query_cov {min_ratio} -evalue 1 -accel 0.9 -maxhits {nhits} '
                      '-userout {aaMatch} -ka_dbsize 5000000 -userfields '
                      'query+target+id+alnlen+mism+opens+qlo+qhi+tlo+thi+evalue+raw+ql+tl+qrow+trow+qstrand').format(
            usearch=usearch, refAA=refAA, qryAA=qryAA, aaMatch=aaMatch,
            n_thread=self.n_thread, min_id=self.min_id * 100., nhits=nhits, min_ratio=self.min_ratio)
        p = Popen(ublast_cmd.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()
        if os.path.getsize(aaMatch) > 0:
            blastab.append(parseUBlast(open(aaMatch), self.refSeq, self.qrySeq,
                                       self.min_id, self.min_cov, self.min_ratio))
    blastab = pd.concat(blastab)
    logger('Run uBLAST finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def _linearMerge(data):
    matches, params = data
    grpCol = pd.Series(data=[[]] * matches.shape[0])
    matches = np.hstack([matches, grpCol.values[:, np.newaxis]])
    gapDist, lenDiff = params[1:]
    gene, geneLen = matches[0][0], matches[0][12]
    tailing = 20

    def resolve_edges(edges):
        grps = []
        for id, m1 in edges[0]:
            for jd, m2 in edges[1]:
                if (m1[1] == m2[1] and max(abs(m1[8]), abs(m1[9])) > min(abs(m2[8]), abs(m2[9]))) or \
                   abs(m1[2] - m2[2]) > 0.3 or m1[6] >= m2[6] or m1[7] >= m2[7] or m2[6] - m1[7] - 1 >= gapDist:
                    continue
                rLen = m2[7] - m1[6] + 1
                g1 = -m1[9] - 1 if m1[9] < 0 else m1[13] - m1[9]
                g2 = m1[8] - 1 if m1[8] > 0 else m1[13] + m1[8]
                qLen = m1[9] - m1[8] + 1 + m2[9] - m2[8] + 1 + g1 + g2
                if g1 + g2 >= gapDist or min(rLen, qLen) * lenDiff < max(rLen, qLen):
                    continue
                overlap = sorted([m1[7] - m2[6] + 1, -g1 - g2], reverse=True)
                rLen1, rLen2 = m1[7] - m1[6] + 1, m2[7] - m2[6] + 1
                if overlap[0] > 0:
                    score = m1[11] + m2[11] - overlap[0] * min(float(m1[11]) / rLen1, float(m2[11]) / rLen2)
                    ident = (m1[2] * rLen1 + m2[2] * rLen2 - overlap[0] * min(m1[2], m2[2])) / (rLen1 + rLen2 - overlap[0])
                else:
                    score = m1[11] + m2[11]
                    ident = (m1[2] * rLen1 + m2[2] * rLen2) / (rLen1 + rLen2)
                if overlap[1] < 0:
                    score += overlap[1] / 3.
                if score > m1[11] and score > m2[11]:
                    grps.append([score, ident, rLen, 1, id, jd])
        return grps

    groups = []
    prev, edges = matches[0][1], [[], []]
    nSave = len(matches)
    for id, m1 in enumerate(matches):
        rLen1 = m1[7] - m1[6] + 1
        groups.append([m1[11], m1[2], rLen1, 0, id])
        # any hit close to either end of a scaffold is a potential fragmented gene
        if m1[6] > tailing and ((m1[8] > 0 and m1[8] - 1 <= gapDist) or (m1[8] < 0 and m1[13] + m1[8] < gapDist)):
            edges[1].append([id, m1])
        if m1[7] <= m1[12] - tailing:
            if (m1[8] > 0 and m1[13] - m1[9] <= gapDist) or (m1[8] < 0 and -1 - m1[9] < gapDist):
                edges[0].append([id, m1])
        for jd in xrange(id + 1, nSave):
            m2 = matches[jd]
            # limit the gap between two consecutive hits in the same scaffold to gapDist
            if m1[1] != m2[1] or (m1[8] < 0 and m2[8] > 0) or m2[8] - m1[9] - 1 >= gapDist:
                break
            rLen, qLen = m2[7] - m1[6] + 1, m2[9] - m1[8] + 1
            if abs(m1[2] - m2[2]) > 0.3 or m1[9] >= m2[9] or m1[6] >= m2[6] or m1[7] >= m2[7] or \
               m2[6] - m1[7] - 1 >= gapDist or min(rLen, qLen) * lenDiff < max(rLen, qLen):
                continue
            rLen2 = m2[7] - m2[6] + 1
            overlap = sorted([m1[7] - m2[6] + 1, m1[9] - m2[8] + 1], reverse=True)
            if overlap[0] > 0:
                score = m1[11] + m2[11] - overlap[0] * min(float(m1[11]) / rLen1, float(m2[11]) / rLen2)
                ident = (m1[2] * rLen1 + m2[2] * rLen2 - overlap[0] * min(m1[2], m2[2])) / (rLen1 + rLen2 - overlap[0])
            else:
                score = m1[11] + m2[11]
                ident = (m1[2] * rLen1 + m2[2] * rLen2) / (rLen1 + rLen2)
            if overlap[1] < 0:
                score += overlap[1] / 3.
            if score > m1[11] and score > m2[11]:
                groups.append([score, ident, rLen, 0, id, jd])
    if len(edges[0]) and len(edges[1]):
        groups.extend(resolve_edges(edges))
    if len(groups) > len(matches):
        groups.sort(reverse=True)
        usedMatches, usedGroups = {}, []
        for grp in groups:
            if (grp[4], 4) in usedMatches or (grp[-1], 5) in usedMatches:
                continue
            if grp[3] > 0:
                if (grp[4], 5) in usedMatches or (grp[-1], 4) in usedMatches:
                    continue
            if grp[4] != grp[-1]:
                lMat, rMat = matches[grp[4]], matches[grp[-1]]
                il, im = sorted([grp[4], grp[-1]])
                skp = 0
                for i in xrange(il + 1, im):
                    if matches[i][1] in {lMat[1], rMat[1]}:
                        if (i, 4) in usedMatches or (i, 5) in usedMatches:
                            skp = 1
                            break
                if skp:
                    continue
                for i in xrange(il + 1, im):
                    if matches[i][1] in {lMat[1], rMat[1]}:
                        usedMatches[(i, 4)] = usedMatches[(i, 5)] = 0
            usedGroups.append(grp)
            usedMatches[(grp[4], 4)] = usedMatches[(grp[-1], 5)] = 1
            if grp[3] > 0:
                usedMatches[(grp[4], 5)] = usedMatches[(grp[-1], 4)] = 1
        usedGroups.sort(key=itemgetter(4), reverse=True)
        for gId in xrange(len(usedGroups) - 1):
            g1, g2 = usedGroups[gId:gId + 2]
            if g1[4] == g2[-1]:
                m = matches[g1[4]]
                score = g1[0] + g2[0] - m[11]
                length = g1[2] + g2[2] - (m[7] - m[6] + 1)
                iden = (g1[1] * g1[2] + g2[1] * g2[2] - min(g1[1], g2[1]) * (m[7] - m[6] + 1)) / length
                usedGroups[gId + 1] = [score, iden, length, 0, g2[4]] + g1[4:]
                g1[1] = -1
    else:
        usedGroups = groups
        usedMatches = {(k, k): 1 for k in np.arange(matches.shape[0])}
    for g in usedGroups:
        if g[1] >= 0:
            ids = [matches[i][15] for i in g[4:]]
            for i in g[4:]:
                matches[i, -1] = g[:3] + ids
    ids = {k[0] for k, v in usedMatches.items() if v == 1}
    matches = matches[np.array(list(ids))]
    return matches
def hierCC(args):
    params = get_args(args)
    ot = time.time()
    profile_file, cluster_file, old_cluster = params.profile, params.output + '.npz', params.incremental
    global mat, n_loci
    mat = pd.read_csv(profile_file, sep='\t', header=None, dtype=str).values
    allele_columns = np.array([i == 0 or (not h.startswith('#')) for i, h in enumerate(mat[0])])
    mat = mat[1:, allele_columns].astype(int)
    n_loci = mat.shape[1] - 1
    logger('{0}: Loaded in allelic profiles with dimension: {1} and {2}. The first column is assumed to be type id.'.format(
        time.time() - ot, *mat.shape))
    if not params.immutable:
        absence = np.sum(mat <= 0, 1)
        mat = mat[np.argsort(absence, kind='mergesort')]
    if os.path.isfile(old_cluster):
        od = np.load(old_cluster, allow_pickle=True)
        cls = od['hierCC']
        typed = {c[0]: id for id, c in enumerate(cls) if c[0] > 0}
        if len(typed) > 0:
            logger('{0}: Loaded in {1} old hierCC assignments.'.format(time.time() - ot, len(typed)))
            mat_idx = np.array([t in typed for t in mat.T[0]])
            mat[:] = np.vstack([mat[mat_idx], mat[(mat_idx) == False]])
    else:
        typed = {}
    logger('{0}: Start hierCC assignments'.format(time.time() - ot))
    pool = Pool(10)
    res = np.repeat(mat.T[0], mat.shape[1]).reshape(mat.shape)
    res[1:, 0] = n_loci + 1
    for index in xrange(0, mat.shape[0], 100):
        to_run = []
        for idx in np.arange(index, index + 100):
            if idx < mat.shape[0]:
                if mat[idx, 0] in typed:
                    res[idx, :] = cls[typed[mat[idx, 0]], :]
                else:
                    to_run.append(idx)
        if len(to_run) == 0:
            continue
        if not params.immutable:
            dists = np.vstack(pool.map(get_distance, to_run))
            assignment(dists, res)
        else:
            dists = np.vstack(pool.map(get_distance2, to_run))
            assignment2(dists, res)
        logger('{0}: Assigned {1} of {2} types into hierCC.'.format(time.time() - ot, index, mat.shape[0]))
    res.T[0] = mat.T[0]
    np.savez_compressed(cluster_file, hierCC=res)
    if not params.delta:
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(['d' + str(id) for id in np.arange(n_loci)])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(rr) for rr in r]) + '\n')
    else:
        deltas = [int(d) for d in params.delta.split(',')]
        with uopen(params.output + '.hierCC.gz', 'w') as fout:
            fout.write('#ST_id\t{0}\n'.format('\t'.join(['d' + str(id) for id in deltas])))
            for r in res[np.argsort(res.T[0])]:
                fout.write('\t'.join([str(r[id + 1]) for id in deltas]) + '\n')
    del res
    logger('NUMPY clustering result (for incremental hierCC): {0}.npz'.format(params.output))
    logger('TEXT clustering result (for visual inspection): {0}.hierCC.gz'.format(params.output))
def assignment2(dists, res):
    for id in xrange(len(dists)):
        idx, ref, jd = dists[id]
        res[idx, jd:] = res[ref, jd:]
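# Minimal sketch (hypothetical toy values, not pipeline output) of the `res`
# matrix that the assignment functions above manipulate: row i holds the
# cluster of type i at each allelic-distance threshold, and copying
# res[ref, jd:] into res[idx, jd:] records that idx joins ref's clusters at
# every level from column jd upwards.
def _demo_res_matrix():
    import numpy as np
    res = np.array([[1, 1, 1, 1],   # columns: type id, then clusters at d = 0, 1, 2
                    [2, 2, 2, 2]])  # type 2 starts in its own cluster everywhere
    idx, ref, jd = 1, 0, 2          # the two profiles differ at one locus
    res[idx, jd:] = res[ref, jd:]   # so they merge at thresholds from column 2 up
    assert res[1].tolist() == [2, 2, 1, 1]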
def runDiamond(self, ref, qry, nhits=10, frames='7'):
    logger('Run diamond starts')

    def parseDiamond(fin, refseq, qryseq, min_id, min_cov, min_ratio):
        blastab = []
        for line in fin:
            if line.startswith('@'):
                continue
            part = line.strip().split('\t')
            if part[2] == '*':
                continue
            qn, qf = part[0].rsplit(':', 1)
            rn, rf, rx = part[2].rsplit(':', 2)
            rs = int(part[3]) + int(rx)
            ql, rl = len(qryseq[str(qn)]), len(refseq[str(rn)])
            qm = len(part[9])
            if qm * 3 < min_cov:
                continue
            cov_ratio = qm * 3. / ql
            if cov_ratio < min_ratio:
                continue
            cigar = [[int(n) * 3, t] for n, t in re.findall(r'(\d+)([A-Z])', part[5])]
            cl = np.sum([c[0] for c in cigar])
            variation = float(part[12][5:]) * 3 if part[12].startswith('NM:') \
                else float(re.findall(r'NM:i:(\d+)', line)[0]) * 3
            iden = 1 - round(variation / cl, 3)
            if iden < min_id:
                continue
            qf, rf = int(qf), int(rf)
            qs = int(part[18][5:]) if part[18].startswith('ZS:') else int(re.findall(r'ZS:i:(\d+)', line)[0])
            rm = int(np.sum([c[0] for c in cigar if c[1] in {'M', 'D'}]) / 3)
            if rf <= 3:
                rs, r_e = rs * 3 + rf - 3, (rs + rm - 1) * 3 + rf - 1
            else:
                rs, r_e = rl - (rs * 3 + rf - 6) + 1, rl - ((rs + rm - 1) * 3 + rf - 4) + 1
            if qf <= 3:
                qs, qe = qs * 3 + qf - 3, (qs + qm - 1) * 3 + qf - 1
            else:
                qs, qe = ql - (qs * 3 + qf - 6) + 1, ql - ((qs + qm - 1) * 3 + qf - 4) + 1
                qs, qe, rs, r_e = qe, qs, r_e, rs
                cigar = list(reversed(cigar))
            cd = [c[0] for c in cigar if c[1] != 'M']
            score = int(part[14][5:]) if part[14].startswith('ZR:') else int(re.findall(r'ZR:i:(\d+)', line)[0])
            blastab.append([qn, rn, iden, cl, int(variation - sum(cd)), len(cd),
                            qs, qe, rs, r_e, 0.0, score, ql, rl, cigar])
        blastab = pd.DataFrame(blastab)
        blastab[[0, 1]] = blastab[[0, 1]].astype(str)
        return blastab

    refAA = os.path.join(self.dirPath, 'refAA')
    qryAA = os.path.join(self.dirPath, 'qryAA')
    aaMatch = os.path.join(self.dirPath, 'aaMatch')
    if not self.qrySeq:
        self.qrySeq, self.qryQual = readFastq(qry)
    if not self.refSeq:
        self.refSeq, self.refQual = readFastq(ref)
    qryAASeq = transeq(self.qrySeq, frame='F', transl_table=self.table_id)
    with open(qryAA, 'w') as fout:
        for n, ss in sorted(qryAASeq.items()):
            _, id, s = min([(len(s[:-1].split('X')), id, s) for id, s in enumerate(ss)])
            fout.write('>{0}:{1}\n{2}\n'.format(n, id + 1, s))
    diamond_fmt = '{diamond} makedb --db {qryAA} --in {qryAA}'.format(diamond=diamond, qryAA=qryAA)
    p = Popen(diamond_fmt.split(), stderr=PIPE, stdout=PIPE, universal_newlines=True).communicate()
    refAASeq = transeq(self.refSeq, frames, transl_table=self.table_id)
    toWrite = []
    for n, ss in sorted(refAASeq.items()):
        for id, s in enumerate(ss):
            cdss = re.findall('.{1000,}?X|.{1,1000}$', s + 'X')
            cdss[-1] = cdss[-1][:-1]
            cdsi = np.cumsum([0] + list(map(len, cdss[:-1])))
            for ci, cs in zip(cdsi, cdss):
                if len(cs):
                    toWrite.append('>{0}:{1}:{2}\n{3}\n'.format(n, id + 1, ci, cs))
    blastab = []
    for id in xrange(5):
        with open(refAA, 'w') as fout:
            for line in toWrite[id::5]:
                fout.write(line)
        diamond_cmd = ('{diamond} blastp --no-self-hits --threads {n_thread} --db {refAA} '
                       '--query {qryAA} --out {aaMatch} --id {min_id} --query-cover {min_ratio} '
                       '--evalue 1 -k {nhits} --dbsize 5000000 --outfmt 101').format(
            diamond=diamond, refAA=refAA, qryAA=qryAA, aaMatch=aaMatch,
            n_thread=self.n_thread, min_id=self.min_id * 100., nhits=nhits, min_ratio=self.min_ratio * 100.)
        p = Popen(diamond_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
        if os.path.getsize(aaMatch) > 0:
            tab = parseDiamond(open(aaMatch), self.refSeq, self.qrySeq, self.min_id, self.min_cov, self.min_ratio)
            os.unlink(aaMatch)
            if tab is not None:
                blastab.append(tab)
    blastab = pd.concat(blastab)
    logger('Run diamond finishes. Got {0} alignments'.format(blastab.shape[0]))
    return blastab
def loadBam(prefix, reference, bams, sequences, snps):
    sites = []
    p = subprocess.Popen('samtools mpileup -ABQ0 {0}'.format(' '.join(bams)).split(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        if sequences[part[0]][s] > 0 or s % 5 == 0:
            bases = ''.join(part[4::3])
            bases = re.sub(r'[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
            bases = re.split(r'(\d+)', bases)
            for i in range(1, len(bases), 2):
                bases[i + 1] = bases[i + 1][int(bases[i]):]
            types, cnts = np.unique(list(''.join(bases[::2])), return_counts=True)
            if np.sum(cnts) >= 3:
                if types.size > 1:
                    cnts.sort()
                    sites.append([cnts[-1], np.sum(cnts[:-1])])
                else:
                    sites.append([cnts[0], 0])
    sites = np.array(sites)
    ave_depth = np.max([np.median(np.sum(sites, 1)), 2.])
    sys.stdout.write('{3}: Average read depth: {0}; Sites between {1} and {2} will be used for hybrid estimation.\n'.format(
        ave_depth, ave_depth / 2., ave_depth * 3., prefix))
    sites = sites[(ave_depth / 2. <= np.sum(sites, 1)) & (np.sum(sites, 1) <= ave_depth * 3)]
    m = GaussianMixture(n_components=1, covariance_type='tied')
    m.fit(sites)
    best_model = [m.bic(sites), m]
    for n_components in xrange(2, 6):
        sys.stdout.write('# Testing {0} components.\n'.format(n_components))
        m = GaussianMixture(n_components=n_components, covariance_type='tied')
        for i in xrange(20):
            m.fit(sites)
            bic = m.bic(sites)
            if bic < best_model[0]:
                best_model = [bic, m]
            m = GaussianMixture(n_components=n_components, covariance_type='tied')
    m = best_model[1]
    mId = np.argmax(m.means_.T[1] / np.sum(m.means_, 1))
    sys.stdout.write('{3}: Found {0} GMM components. The most divergent group is {1} and accounts for {2} of total sites.\n'.format(
        m.n_components, m.means_[mId].tolist(), m.weights_[mId], prefix))
    mDiv = m.means_[mId][0] / np.sum(m.means_[mId])
    mDiv = 10 * np.log10([[mDiv, 1 - mDiv], [1 - mDiv, mDiv]])
    seq = {n: list(s) for n, s in readFasta(reference).items()}
    qual = {n: [0] * len(s) for n, s in seq.items()}
    lowQ, lowC, highQ = 0, 0, 0
    p = subprocess.Popen('samtools mpileup -ABQ0 {0}'.format(' '.join(bams)).split(),
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    for line in p.stdout:
        part = line.strip().split('\t')
        s = int(part[1]) - 1
        if s % 100000 == 0:
            sys.stdout.write('# {0}\n'.format(s))
        bases = ''.join(part[4::3])
        bases = re.sub(r'[\*\$\+-]', '', re.sub(r'\^.', '', bases.upper()))
        bases = re.split(r'(\d+)', bases)
        for i in range(1, len(bases), 2):
            bases[i + 1] = bases[i + 1][int(bases[i]):]
        types, cnts = np.unique(list(''.join(bases[::2])), return_counts=True)
        if types.size > 0:
            depth = np.sum(cnts)
            if cnts.size == 1:
                g, mId = [cnts[0], 0], 0
            elif cnts.size > 1:
                mId = np.argmax(cnts)
                g = [cnts[mId], depth - cnts[mId]]
            seq[part[0]][s] = types[mId]
            if depth >= 3 and depth / 3. <= ave_depth <= depth * 3.:
                q = min(40, max(1, int(round(np.sum(g * mDiv[0]) - np.sum(g * mDiv[1]), 0))))
                qual[part[0]][s] = q
                if q < 10:
                    lowQ += 1
                else:
                    highQ += 1
            else:
                lowC += 1
    qual = {n: ''.join([chr(ss + 33) for ss in s]) for n, s in qual.items()}
    with open(prefix + '.fastq', 'w') as fout:
        for n, s in seq.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(n, ''.join(s), qual[n]))
    sys.stdout.write('{0}: {1} good sites; {2} low covered sites; {3} low quality sites;\n'.format(
        prefix, highQ, lowC, lowQ))
    return
def getClust(prefix, genes, params):
    groups = {}
    dirPath = tempfile.mkdtemp(prefix='NS_', dir='.')
    try:
        if not params['translate']:
            geneFile = genes
        else:
            na_seqs = readFasta(genes)
            aa_seqs = transeq(na_seqs, frame='1', transl_table='starts')
            with open(os.path.join(dirPath, 'seq.aa'), 'w') as fout:
                for n, s in aa_seqs.items():
                    fout.write('>{0}\n{1}\n'.format(n, s[0]))
            geneFile = os.path.join(dirPath, 'seq.aa')
        seqDb = os.path.join(dirPath, 'seq.db')
        tmpDb = os.path.join(dirPath, 'tmp')
        lcDb = os.path.join(dirPath, 'seq.lc')
        tabFile = os.path.join(dirPath, 'clust.tab')
        refFile = os.path.join(dirPath, 'seq.ref')
        nRef = 999999999999999
        for ite in xrange(3):
            if os.path.isdir(tmpDb):
                shutil.rmtree(tmpDb)
            os.makedirs(tmpDb)
            if os.path.isfile(seqDb):
                list(map(os.unlink, glob.glob(seqDb + '*')))
            if os.path.isfile(lcDb):
                list(map(os.unlink, glob.glob(lcDb + '*')))
            subprocess.Popen('{0} createdb {2} {1} -v 0'.format(
                externals['mmseqs'], seqDb, geneFile).split()).communicate()
            subprocess.Popen('{0} linclust {1} {2} {3} --min-seq-id {4} -c {5} --threads {6} -v 0'.format(
                externals['mmseqs'], seqDb, lcDb, tmpDb, params['identity'], params['coverage'],
                params['n_thread']).split(), stdout=subprocess.PIPE).communicate()
            subprocess.Popen('{0} createtsv {1} {1} {2} {3}'.format(
                externals['mmseqs'], seqDb, lcDb, tabFile).split(), stdout=subprocess.PIPE).communicate()
            with open(tabFile) as fin:
                for line in fin:
                    part = line.strip().split()
                    groups[part[1]] = part[0]
            tmp = []
            with open(geneFile) as fin:
                toWrite, used_grps = False, {None: 1}
                for line in fin:
                    if line.startswith('>'):
                        name = line[1:].strip().split()[0]
                        grp = groups.get(name, None)
                        toWrite = False if grp in used_grps else True
                        if toWrite:
                            used_grps[grp] = name
                    if toWrite:
                        tmp.append(line)
            for gene, grp in groups.items():
                if grp in used_grps:
                    groups[gene] = used_grps[grp]
            with open(refFile, 'w') as fout:
                for line in tmp:
                    fout.write(line)
            if nRef <= len(used_grps):
                break
            nRef = len(used_grps)
            geneFile = refFile
        if not params['translate']:
            shutil.copy2(refFile, '{0}.clust.exemplar'.format(prefix))
        else:
            rSeq = readFasta(refFile)
            na_seqs = dict(na_seqs)
            with open('{0}.clust.exemplar'.format(prefix), 'w') as fout:
                for n, s in rSeq.items():
                    fout.write('>{0}\n{1}\n'.format(n, na_seqs[n]))
    finally:
        shutil.rmtree(dirPath)
    with open('{0}.clust.tab'.format(prefix), 'w') as fout:
        for gene, grp in sorted(groups.items()):
            # follow the chain of exemplars to the root of each cluster
            g = gene
            while g != grp:
                g, grp = grp, groups[grp]
            groups[gene] = grp
            fout.write('{0}\t{1}\n'.format(gene, grp))
    return '{0}.clust.exemplar'.format(prefix), '{0}.clust.tab'.format(prefix)
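# Hypothetical usage sketch (the parameter names follow the dict keys read
# above; externals['mmseqs'] must point to an MMseqs2 binary):
#   params = dict(translate=False, identity=0.9, coverage=0.9, n_thread=8)
#   exemplar_fna, cluster_tab = getClust('NS_out', 'genes.fna', params)
# getClust re-runs linclust on the surviving exemplars for up to three rounds,
# stopping early once the number of cluster representatives stops shrinking.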
def linear_merge(self, blasttab, min_iden, min_frag_prop, min_frag_len, max_dist=300, diag_diff=1.5, max_diff=200, **params) :
    # flag reverse hits with negative subject coordinates so they sort together
    for part in blasttab :
        if part[8] > part[9] :
            part[8], part[9] = -part[8], -part[9]
    blasttab.sort(key=itemgetter(0, 1, 6, 8, -11))
    nB = len(blasttab)
    # drop near-duplicate hits
    for id, p1 in enumerate(blasttab) :
        if p1[0] == '' :
            continue
        for jd in xrange(id+1, nB) :
            p2 = blasttab[jd]
            if p2[0] == '' :
                continue
            if (p1[0], p1[1]) != (p2[0], p2[1]) or p2[6] - p1[6] > 4 :
                break
            d = abs(p1[6]-p2[6]) + abs(p1[7]-p2[7]) + abs(p1[8]-p2[8]) + abs(p1[9]-p2[9])
            if d <= 5 :
                if p1[2] >= p2[2] :
                    p2[0] = ''
                else :
                    p1[0] = ''
                    break
    blasttab = [p for p in blasttab if p[0] != '']
    nB = len(blasttab)
    syntenies = []
    for id, p1 in enumerate(blasttab) :
        for jd in xrange(id+1, nB) :
            p2 = blasttab[jd]
            if p1[0] != p2[0] or p1[1] != p2[1] or p2[6] - p1[7] > max_dist :
                break
            elif p1[8] < 0 < p2[8] or p2[8] < p1[8] + 15 or p2[9] < p1[9] + 15 or \
                 p2[7] < p1[7] + 15 or p2[6] < p1[6] + 15 or p2[8] - p1[9] > max_dist :
                continue
            m, n = p2[7] - p1[6] + 1, p2[9] - p1[8] + 1
            if m < min_frag_len or m < min_frag_prop*p1[12] or \
               max(m, n) - min(m, n) > max_diff or max(m, n) > diag_diff * min(m, n) :
                continue
            o_len = 0 if p2[6] > p1[7] else p1[7] - p2[6] + 1
            p1_len = p1[7] - p1[6] + 1 - o_len
            p2_len = p2[7] - p2[6] + 1 - o_len
            iden = (p1[2]*p1_len + p2[2]*p2_len + max(p1[2], p2[2])*o_len)/(p1_len+p2_len+o_len)
            if iden < min_iden :
                continue
            p1s, p2s = p1[11]/(p1[7]-p1[6]+1), p2[11]/(p2[7]-p2[6]+1)
            dist = max(p2[6]-p1[7]-1, p2[8]-p1[9]-1, 0)
            score = p1s*p1_len + p2s*p2_len + max(p1s, p2s)*o_len - dist
            if score > 0 :
                syntenies.append([id, jd, iden, score])
    syn_score = {}
    for id, syn in enumerate(syntenies) :
        if syn[0] not in syn_score and syn[1] not in syn_score :
            p1, p2 = blasttab[syn[0]][:], blasttab[syn[1]]
            c1, c2 = p1[-1], p2[-1]
            r_dist, q_dist = p2[6]-p1[7]-1, p2[8]-p1[9]-1
            if min(r_dist, q_dist) < 0 :
                # trim the lower-scoring CIGAR until the two hits no longer overlap
                p1s, p2s = p1[11]/(p1[7]-p1[6]+1), p2[11]/(p2[7]-p2[6]+1)
                if p1s <= p2s :
                    cc = [[int(n), t] for n, t in re.findall(r'(\d+)([A-Z])', c1)]
                    i = -1
                else :
                    cc = [[int(n), t] for n, t in re.findall(r'(\d+)([A-Z])', c2)]
                    i = 0
                while min(r_dist, q_dist) < 0 :
                    if cc[i][1] == 'M' :
                        d = min(cc[i][0], max(-r_dist, -q_dist, 0))
                        r_dist, q_dist = d + r_dist, d + q_dist
                    elif cc[i][1] == 'D' :
                        d = min(cc[i][0], max(-r_dist, -q_dist, 0))
                        r_dist += d
                    elif cc[i][1] == 'I' :
                        d = min(cc[i][0], max(-r_dist, -q_dist, 0))
                        q_dist += d
                    else :
                        raise ValueError('unknown CIGAR operation')
                    if d >= cc[i][0] :
                        cc = cc[1:] if i == 0 else cc[:-1]
                    else :
                        cc[i][0] -= d
                if i == -1 :
                    c1 = ''.join(['{0}{1}'.format(*c) for c in cc])
                else :
                    c2 = ''.join(['{0}{1}'.format(*c) for c in cc])
            gap = []
            if r_dist > 0 :
                gap.append('{0}D'.format(r_dist))
            if q_dist > 0 :
                gap.append('{0}I'.format(q_dist))
            if len(gap) == 0 :
                cc1 = re.findall(r'(^.*?)(\d+)([A-Z]$)', c1)[0]
                cc2 = re.findall(r'(^\d+)([A-Z])(.*$)', c2)[0]
                if cc1[2] == cc2[1] :
                    c1 = '{0}{1}{2}'.format(cc1[0], int(cc1[1])+int(cc2[0]), cc1[2])
                    c2 = cc2[2]
            p1[7], p1[9], p1[2], p1[11], p1[14] = p2[7], p2[9], syn[2], syn[3], ''.join([c1] + gap + [c2])
            blasttab.append(p1)
            syn_score[syn[0]] = id
            syn_score[syn[1]] = id
    for part in blasttab :
        if part[8] < 0 :
            part[8], part[9] = -part[8], -part[9]
        x = ['{0}D'.format(part[6]-1), part[-1]] if part[6] > 1 else [part[-1]]
        if part[7] < part[12] :
            x.append('{0}D'.format(part[12]-part[7]))
        if len(x) :
            part[-1] = ''.join(x)
    return blasttab
def alignAgainst(data) :
    prefix, aligner, db, (rtag, reference), (tag, query) = data
    if isinstance(aligner, list) :
        return lastAgainst(tag, query, db, prefix, reference, aligner[1])
    try :
        qrySeq, qryQual = readFastq(query)
    except :
        return [tag, query]
    refSeq, refQual = readFastq(reference)
    proc = subprocess.Popen('{0} -c -t1 --frag=yes -A1 -B14 -O24,60 -E2,1 -r100 -g1000 -P -N5000 -f1000,5000 -n2 -m50 -s200 -z200 -2K10m --heap-sort=yes --secondary=yes {1} {2}'.format(
        aligner, db, query).split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    alignments = []
    for lineId, line in enumerate(proc.stdout) :
        part = line.strip().split('\t')
        part[1:4] = [int(p) for p in part[1:4]]
        part[6:11] = [int(p) for p in part[6:11]]
        part[11] = float(part[13][5:])
        part[12], part[13] = lineId, part[11]/part[10]
        part[14:17] = [[], [], []]
        alignments.append(part)
    proc.wait()
    deleteChain = {}
    nItem = len(alignments)
    alignments.sort(key=lambda x: x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] :
                break
            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if s > e+10 :
                break
            if (e-s) >= 0.9 * (p1[3]-p1[2]) and p2[13] - 0.1 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[3]-p2[2]) and p1[13] - 0.1 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])
    alignments.sort(key=lambda x: x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] :
                break
            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if s > e+10 :
                break
            if (e-s) >= 0.9 * (p1[8]-p1[7]) and p2[13] - 0.05 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[8]-p2[7]) and p1[13] - 0.05 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])
    deleted = {}
    for p in sorted(alignments, key=lambda x: x[11], reverse=True) :
        id = p[12]
        if id in deleteChain :
            for jd in deleteChain[id] :
                if jd not in deleted :
                    deleted[id] = 1
                    break
    alignments = [p for p in alignments if p[12] not in deleted]
    # repeats in qry
    nItem = len(alignments)
    alignments.sort(key=lambda x: x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] :
                break
            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if e > s :
                p1[16].append([s, e])
                p2[16].append([s, e])
            else :
                break
    # repeats in ref
    alignments.sort(key=lambda x: x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] :
                break
            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if e > s :
                p1[15].append([s, e])
                p2[15].append([s, e])
            else :
                break
    maskedRegion = {}
    refRepeat = []
    for p in alignments :
        # prepare a unique set of repeat regions
        qryRepeat = []
        if len(p[16]) > 0 :
            qryRepeat.append(p[16][0])
            for pp in p[16][1:] :
                if pp[0] > qryRepeat[-1][1]+20 :
                    qryRepeat.append(pp)
                elif pp[1] > qryRepeat[-1][1] :
                    qryRepeat[-1][1] = pp[1]
        ref = [refSeq[p[5]], refQual[p[5]]]
        qry = [qrySeq[p[0]], qryQual[p[0]]]
        cigar = p[-1][5:]
        d = 1 if p[4] == '+' else -1
        if d < 0 :
            qryRepeat = [[q[1], q[0], -1, -1] for q in qryRepeat]
        else :
            qryRepeat = [[q[0], q[1], -1, -1] for q in reversed(qryRepeat)]
        mut = []
        alnSite = [p[7], p[2] if d > 0 else p[3]-1]
        for cl, ct in re.findall(r'(\d+)([MID])', cigar) :
            cl = int(cl)
            if ct == 'M' :
                # extract aligned sequences
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)])
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[1][(alnSite[1]-cl+1):(alnSite[1]+1)]))
                e = [alnSite[0]+cl, alnSite[1]+cl*d]
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e[1] :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0] + d*(qr[0] - alnSite[1])
                        if d*qr[1] <= d*e[1] :
                            qr[3] = alnSite[0] + d*(qr[1] - alnSite[1])
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                for id, (rr, rr1, qq, qq1) in enumerate(np.array([list(r), list(r1), list(q), list(q1)]).T) :
                    if ord(rr1) < 43 or ord(qq1) < 43 :
                        maskedRegion[(p[5], alnSite[0]+id)] = 0
                    if rr != qq and rr != 'N' and qq != 'N' :
                        mut.append([alnSite[0]+id, alnSite[1]+id*d, rr, qq, p[4]])
                alnSite = e
            elif ct == 'I' :
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)])
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[1][(alnSite[1]-cl+1):(alnSite[1]+1)]))
                e = alnSite[1] + cl*d
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0]
                        if d*qr[1] <= d*e :
                            qr[3] = alnSite[0]
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                if ord(min(list(q1))) >= 43 :
                    mut.append([alnSite[0], min(alnSite[1], e), '.', '+' + q, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[1] = e
            elif ct == 'D' :
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                if ord(min(list(r1))) >= 43 :
                    mut.append([alnSite[0], int(alnSite[1]+0.5*d), '.', '-' + r, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[0] += cl
        p[14] = mut
        refRepeat.extend([[p[5], pp[0], pp[1]] for pp in p[15]])
    repeats = []
    if len(refRepeat) :
        refRepeat.sort()
        repeats = [refRepeat[0]]
        for p in refRepeat[1:] :
            if p[0] != repeats[-1][0] or p[1] - 20 > repeats[-1][2] :
                repeats.append(p)
            elif p[2] > repeats[-1][2] :
                repeats[-1][2] = p[2]
    for p in repeats :
        for site in xrange(p[1], p[2]) :
            maskedRegion[(p[0], site)] = 1
    repeats = []
    for cont, site in sorted(maskedRegion) :
        if len(repeats) == 0 or repeats[-1][0] != cont or repeats[-1][2]+1 < site :
            repeats.append([cont, site, site])
        else :
            repeats[-1][2] = site
    mutations = []
    alignments = [aln for aln in alignments if aln[9] >= 100]
    for aln in alignments :
        for m in aln[14] :
            if len(m[3]) == 1 :
                if (aln[5], m[0]) not in maskedRegion :
                    mutations.append([aln[5], aln[0]] + m)
            elif maskedRegion.get((aln[5], m[0]), 0) != 1 :
                if m[3].startswith('-') and maskedRegion.get((aln[5], m[0]+len(m[3])-2), 0) > 0 :
                    continue
                mutations.append([aln[5], aln[0]] + m)
    with uopen(prefix + '.gff.gz', 'w') as fout :
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))
        for aln in alignments :
            if aln[5] == aln[0] and aln[2] == aln[7] and aln[3] == aln[8] :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Self%20Alignments"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4]))
            else :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Aligned%20with%20{5}:{6}-{7}"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3]))
        for p in repeats :
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="Uncertain%20base%20calling%20or%20ambiguous%20alignment"\n'.format(
                p[0], p[1]+1, p[2]+1))
        for mut in mutations :
            e1 = mut[2] if not mut[5].startswith('-') else mut[2] + len(mut[5]) - 2
            e2 = mut[3] if not mut[5].startswith('+') else mut[3] + len(mut[5]) - 2
            if len(mut[5]) > 26 :
                mut[5] = '{0}[{1}bps]'.format(mut[5][0], len(mut[5])-1)
            fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t/replace="{7}";/compare="{3}:{4}-{5}:{8}";/origin="{6}"\n'.format(
                mut[0], mut[2]+1, e1+1, mut[1], mut[3]+1, e2+1, mut[4], mut[5], mut[6]))
    return [tag, prefix + '.gff.gz']
def loadBam(prefix, reference, bams, sequences, snps):
    sequence = readFasta(reference)
    sequence = {n: [s, [0] * len(s)] for n, s in sequence.items()}
    sites = {}
    for bam in bams:
        if bam is not None:
            depth = subprocess.Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **externals).split(),
                                     stdout=subprocess.PIPE, universal_newlines=True)
            try:
                d = pd.read_csv(depth.stdout, sep='\t').values
                sites.update({cName: 1 for cName in np.unique(d.T[0])})
            except:
                pass
    sequence = {n: s for n, s in sequence.items() if n in sites}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout:
        for n, s in sorted(sequence.items()):
            fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[0][site:(site + 100)] for site in xrange(0, len(s[0]), 100)])))
    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(
        prefix=prefix, bam_opt=bam_opt, **externals)
    subprocess.Popen(pilon_cmd.split(), stdout=subprocess.PIPE, universal_newlines=True).communicate()
    uncertains = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if sequences[part[0]][int(part[1]) - 1] >= 0:
                if len(part[3]) == 1 and len(part[4]) == 1:
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    if dp >= 3:
                        qd = int(pp[4][3:])
                        if part[-1] == '0/1' or qd < 10:
                            bcs = sorted([float(bc) for bc in pp[5][3:].split(',')])
                            uncertains.append([bcs[-1], np.sum(bcs[:-1])])
    uncertains = np.array(uncertains)
    p = np.sum(uncertains.T[0]) / np.sum(uncertains)
    qPerRead = 10 * (np.log10(p) - np.log10(1 - p))
    for n in sequence:
        sequence[n][0] = list(sequence[n][0])
    highQ, lowQ, lowC = 0, 0, 0
    with open('{0}.mapping.vcf'.format(prefix)) as fin:
        for line in fin:
            if line.startswith('#'):
                continue
            part = line.strip().split('\t')
            if len(part[3]) == 1 and len(part[4]) == 1:
                s = int(part[1]) - 1
                pp = part[7].split(';')
                dp = float(pp[0][3:])
                qd = int(pp[4][3:])
                if part[-1] == '0/1' or qd < 10:
                    bcs = np.array([int(bc) for bc in pp[5][3:].split(',')])
                    if np.sum(bcs) > 0:
                        sequence[part[0]][0][s] = ['A', 'C', 'G', 'T'][np.argmax(bcs)]
                    else:
                        sequence[part[0]][0][s] = part[3]
                    if dp < 3:
                        lowC += 1
                    else:
                        bcs.sort()
                        bcs = [bcs[-1], np.sum(bcs[:-1])]
                        q1 = binom.cdf(bcs[0], bcs[0] + bcs[1], p)
                        q2 = qPerRead * (bcs[0] - bcs[1]) if q1 >= 0.05 else 1
                        if q2 >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = min(40, max(1, int(q2)))
                else:
                    if dp < 3:
                        lowC += 1
                    else:
                        if qd >= 10:
                            highQ += 1
                        else:
                            lowQ += 1
                        sequence[part[0]][1][s] = qd
                    if part[-1] == '1/1':
                        sequence[part[0]][0][s] = part[4]
    logger('{0}: Expected mix-up: {1} {2} ; Got highQ {3} ; lowQ {4} ; lowC {5}'.format(
        prefix, uncertains.shape[0], p, highQ, lowQ, lowC))
    with open('{0}.metaCaller.fastq'.format(prefix), 'w') as fout:
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items():
            fout.write('@{0}\n{1}\n+\n{2}\n'.format(p + '_' + n, ''.join(s), ''.join([chr(qq + 33) for qq in q])))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    os.unlink('{0}.mapping.fasta'.format(prefix))
    os.unlink('{0}.mapping.reference.fasta'.format(prefix))
    return '{0}.metaCaller.fastq'.format(prefix)
def make_alignment(filename) :
    comparisons = []
    with open(filename, 'r') as fin:
        for line in fin:
            if line[0] == 'a' :
                comparison = [int(line.split(' ', 2)[1][6:])]
            elif line[0] == 's' :
                part = line.strip().split()[1:]
                part[1:5] = [int(part[1]), int(part[2]), part[3], int(part[4])]
                if part[3] == '+' :
                    part[1:3] = [part[1]+1, part[1]+part[2]]
                else :
                    part[1:3] = [part[4]-part[1], part[4]-part[1]-part[2]+1]
                comparison.extend(part)
                if len(comparison) >= 13 :
                    comparison.append([int((m in 'nN') or (n in 'Nn')) for m, n in zip(comparison[6], comparison[12])])
            elif line[0] in 'pq' :
                part = line.strip().split()
                comparison[13] = [max(comparison[13][id], int(b in '!"#$%&\'()*+,-./')) for id, b in enumerate(part[-1])]
            elif len(line.strip()) == 0 :
                if comparison[0] >= 200 :
                    comparisons.append(last_package.call_mutation(comparison))
    # remove significant low-identity regions in the query
    comparisons.sort(key=lambda x: min(x[8:10]))
    comparisons.sort(key=lambda x: x[7])
    low_q = []
    for id, regi in enumerate(comparisons) :
        if len(regi) == 0 :
            continue
        for jd in xrange(id+1, len(comparisons)) :
            regj = comparisons[jd]
            if len(regj) == 0 :
                continue
            if regi[7] != regj[7] :
                break
            si, ei = sorted(regi[8:10])
            sj, ej = sorted(regj[8:10])
            s = max(si, sj)
            e = min(ei, ej)
            if e >= s :
                overlap_i = last_package.sub_comparison(regi, qry_coords=[s, e])
                overlap_j = last_package.sub_comparison(regj, qry_coords=[s, e])
                if overlap_i[0] < 0.95 * overlap_j[0] and (regi[0] < regj[0] or ei < ej) :
                    if s - si >= 30 :
                        comparisons[id] = last_package.sub_comparison(regi, qry_coords=[si, s-1])
                        overlap_i[12] = 'E'
                        low_q.append(overlap_i)
                        if overlap_i[3] >= overlap_i[2] :
                            regi = comparisons[id]
                        if len(regi) == 0:
                            break
                    else :
                        comparisons[id][12] = 'E'
                        break
                elif overlap_i[0] * 0.95 > overlap_j[0] :
                    if ej - e >= 30 :
                        comparisons[jd] = last_package.sub_comparison(regj, qry_coords=[e+1, ej])
                        overlap_j[12] = 'E'
                        if overlap_j[3] >= overlap_j[2] :
                            low_q.append(overlap_j)
                    else :
                        comparisons[jd][12] = 'E'
                elif s == si and e == ei and regj[0] > regi[0]*3 and overlap_i[0] <= overlap_j[0] :
                    comparisons[id][12] = 'E'
                    break
                elif s == sj and e == ej and regi[0] > regj[0]*3 and overlap_i[0] >= overlap_j[0] :
                    comparisons[jd][12] = 'E'
                else :
                    comparisons[id][12] = 'D'
                    comparisons[jd][12] = 'D'
            else :
                break
    # remove significant low-identity regions in the reference
    comparisons = sorted([x for x in comparisons if len(x) > 0] + low_q, key=lambda x: x[2])
    comparisons.sort(key=lambda x: x[1])
    for id, regi in enumerate(comparisons) :
        if len(regi) == 0 :
            continue
        for jd in xrange(id+1, len(comparisons)) :
            regj = comparisons[jd]
            if len(regj) == 0 :
                continue
            if regi[1] != regj[1] :
                break
            si, ei = regi[2:4]
            sj, ej = regj[2:4]
            s = max(si, sj)
            e = min(ei, ej)
            if e >= s :
                overlap_i = last_package.sub_comparison(regi, ref_coords=[s, e])
                overlap_j = last_package.sub_comparison(regj, ref_coords=[s, e])
                if overlap_i[0] < 0.95 * overlap_j[0] and (regi[0] < regj[0] or ei < ej) :
                    if s - si >= 30 :
                        comparisons[id] = last_package.sub_comparison(regi, ref_coords=[si, s-1])
                        regi = comparisons[id]
                        if len(regi) == 0:
                            break
                    else :
                        comparisons[id] = []
                        break
                elif overlap_i[0] * 0.95 > overlap_j[0] :
                    if ej - e >= 30 :
                        comparisons[jd] = last_package.sub_comparison(regj, ref_coords=[e+1, ej])
                    else :
                        comparisons[jd] = []
                elif overlap_i[0] == overlap_j[0] and len(overlap_i) == len(overlap_j) :
                    if si == sj and ei == ej:
                        diff = 0
                        for i, i_snp in enumerate(overlap_i[13:]) :
                            j_snp = overlap_j[13+i]
                            if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                                diff = 1
                                break
                        if diff == 0 :
                            if comparisons[id][12] in 'DE':
                                comparisons[id] = []
                                break
                            else:
                                comparisons[jd] = []
                    elif si <= sj and ei >= ej :
                        diff = 0
                        for i, i_snp in enumerate(overlap_i[13:]) :
                            j_snp = overlap_j[13+i]
                            if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                                diff = 1
                                break
                        if diff == 0 :
                            comparisons[jd] = []
                    elif si >= sj and ei <= ej:
                        diff = 0
                        for i, i_snp in enumerate(overlap_i[13:]) :
                            j_snp = overlap_j[13+i]
                            if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                                diff = 1
                                break
                        if diff == 0 :
                            comparisons[id] = []
                            break
            else :
                break
        if len(comparisons[id]) > 0 :
            regi = comparisons[id]
            regi[6] = [lq for lq in regi[6] if lq[1] >= regi[2] and lq[0] <= regi[3]]
    # mark repetitive regions in the query
    repeats = []
    mutations = {}
    comparisons = sorted([x for x in comparisons if len(x) > 0 and x[12] != 'E'], key=lambda x: min(x[8:10]))
    comparisons.sort(key=lambda x: x[7])
    for id, regi in enumerate(comparisons) :
        for jd in xrange(id+1, len(comparisons)) :
            regj = comparisons[jd]
            if regi[7] != regj[7] :
                break
            si, ei = sorted(regi[8:10])
            sj, ej = sorted(regj[8:10])
            s = max(si, sj)
            e = min(ei, ej)
            if e >= s :
                for mut in regi[13:] :
                    if abs(min(mut[2:4])) <= e and abs(max(mut[2:4])) >= s :
                        mut[6] = 1
                for mut in regj[13:] :
                    if abs(min(mut[2:4])) <= e and abs(max(mut[2:4])) >= s :
                        mut[6] = 1
                overlap_i = last_package.sub_comparison(regi, qry_coords=[s, e])
                overlap_j = last_package.sub_comparison(regj, qry_coords=[s, e])
                regi[6] = [lq for lq in regi[6] if lq[0] < overlap_i[2] or lq[1] > overlap_i[3]]
                regj[6] = [lq for lq in regj[6] if lq[0] < overlap_j[2] or lq[1] > overlap_j[3]]
                repeats.append(overlap_i[1:4] + [0])
                repeats.append(overlap_j[1:4] + [0])
    # identify repetitive regions in the reference
    comparisons.sort(key=lambda x: x[2])
    comparisons.sort(key=lambda x: x[1])
    for id, regi in enumerate(comparisons) :
        if len(regi) == 0 :
            continue
        for jd in xrange(id+1, len(comparisons)) :
            regj = comparisons[jd]
            if regi[1] != regj[1] :
                break
            si, ei = sorted(regi[2:4])
            sj, ej = sorted(regj[2:4])
            s = max(si, sj)
            e = min(ei, ej)
            if e >= s :
                overlap_i = last_package.sub_comparison(regi, ref_coords=[s, e])
                overlap_j = last_package.sub_comparison(regj, ref_coords=[s, e])
                if len(overlap_i) == len(overlap_j) :
                    diff = 0
                    for i, i_snp in enumerate(overlap_i[13:]) :
                        j_snp = overlap_j[13+i]
                        if i_snp[0] != j_snp[0] or i_snp[5] != j_snp[5] :
                            diff = 1
                            break
                    if diff == 1 :
                        for mut in regi[13:] :
                            if abs(mut[0]) <= e and abs(mut[1]) >= s :
                                mut[6] = 1
                        for mut in regj[13:] :
                            if abs(mut[0]) <= e and abs(mut[1]) >= s :
                                mut[6] = 1
                        regi[6] = [lq for lq in regi[6] if lq[0] < s or lq[1] > e]
                        regj[6] = [lq for lq in regj[6] if lq[0] < s or lq[1] > e]
                        repeats.append([regi[1], s, e, 0])
                else :
                    for mut in regi[13:] :
                        if abs(mut[0]) <= e and abs(mut[1]) >= s :
                            mut[6] = 1
                    for mut in regj[13:] :
                        if abs(mut[0]) <= e and abs(mut[1]) >= s :
                            mut[6] = 1
                    regi[6] = [lq for lq in regi[6] if lq[0] < s or lq[1] > e]
                    regj[6] = [lq for lq in regj[6] if lq[0] < s or lq[1] > e]
                    repeats.append([regi[1], s, e, 0])
        for mut in regi[13:] :
            if mut[6] == 0 :
                if regi[1] not in mutations :
                    mutations[regi[1]] = {}
                if mut[0] not in mutations[regi[1]]:
                    mutations[regi[1]][mut[0]] = {}
                if mut[5] not in mutations[regi[1]][mut[0]] :
                    mutations[regi[1]][mut[0]][mut[5]] = [regi[7], regi[10]] + mut
                else :
                    mutations[regi[1]][mut[0]][mut[5]].extend([regi[7], regi[10]] + mut)
        repeats.extend([[regi[1]] + lq[:2] + [1] for lq in regi[6]])
    repeats.sort(key=lambda x: x[1])
    repeats.sort(key=lambda x: x[0])
    repetitive_regions = []
    for rep in repeats:
        if len(repetitive_regions) == 0 or repetitive_regions[-1][0] != rep[0] or repetitive_regions[-1][2]+1 < rep[1] :
            repetitive_regions.append(rep)
        elif rep[2] > repetitive_regions[-1][2] :
            repetitive_regions[-1][2] = rep[2]
            if repetitive_regions[-1][3] > 0 :
                repetitive_regions[-1][3] = rep[3]
    nocall = {}
    for r in repetitive_regions + [c[1:] for c in comparisons if float(c[0])/(abs(c[3]-c[2])+1) < 0.7 or c[0] < 200] :
        for s in xrange(r[1], r[2]+1) :
            nocall[(r[0], s)] = 1
    mutations = {contig: {site: alters for site, alters in variation.items() if (contig, site) not in nocall}
                 for contig, variation in mutations.items()}
    comparisons = [c for c in comparisons if float(c[0])/(abs(c[3]-c[2])+1) >= 0.7 and c[0] >= 200]
    return comparisons, repetitive_regions, mutations
def write_output(prefix, prediction, genomes, clust_ref, old_prediction) :
    predictions, alleles = {}, {}
    allele_file = open('{0}.allele.fna'.format(prefix), 'w')
    prediction = pd.read_csv(prediction, sep='\t', header=None).values
    for part in prediction :
        #with open(prediction) as fin :
        #    for line in fin :
        #        part = line.strip().split()
        if part[0] not in alleles :
            alleles[part[0]] = {clust_ref[part[0]]: 1}
            allele_file.write('>{0}_{1}\n{2}\n'.format(part[0], 1, clust_ref[part[0]]))
        if part[9] < part[10] :
            l, r, d = min(part[7]-1, part[9]-1), min(part[12]-part[8], part[13]-part[10]), 1
        else :
            l, r, d = min(part[7]-1, part[13]-part[9]), min(part[12]-part[8], part[10]-1), -1
        # extend the hit to the allele end when close (<= 6 bp), otherwise trim it onto a codon boundary
        if l <= 6 and part[7] - l == 1 :
            part[7], part[9] = part[7]-l, part[9]-l*d
        else :
            ll = (part[7]-1) % 3
            if ll > 0 :
                part[7], part[9] = part[7]+3-ll, part[9]+(3-ll)*d
        if r <= 6 and part[8] + r == part[12] :
            part[8], part[10] = part[8]+r, part[10]+r*d
        else :
            rr = (part[12]-part[8]) % 3
            if rr > 0 :
                # symmetric to the left-end trim; the flattened source had (3+rr)*d here,
                # which breaks the frame, so the offset is corrected to (3-rr)*d
                part[8], part[10] = part[8]-3+rr, part[10]-(3-rr)*d
        if part[9] < part[10] :
            part[9:12] = part[9], part[10], '+'
        else :
            part[9:12] = part[10], part[9], '-'
        if part[4] not in predictions :
            predictions[part[4]] = []
        elif predictions[part[4]][-1][2] == part[2] :
            # try to merge with the previous fragment of the same gene when the gap is small
            prev = predictions[part[4]][-1]
            if prev[5] == part[5] and part[7] - prev[8] < 500 :
                if part[11] == '+' and part[9] - prev[10] < 500 :
                    prev[8], prev[10] = part[8], part[10]
                    continue
                elif part[11] == '-' and prev[9] - part[10] < 500 :
                    prev[8], prev[9] = part[8], part[9]
                    continue
            predictions[part[4]][-1][1], part[1] = -1, -1
        predictions[part[4]].append(part)

    op = ['', 0, []]
    with open('{0}.EToKi.gff'.format(prefix), 'w') as fout :
        for gid, (g, predict) in enumerate(predictions.items()) :
            predict.sort(key=itemgetter(5, 9, 10))
            for pid, pred in enumerate(predict) :
                if pred[1] == -1 or (pred[10]-pred[9]+1) <= 0.8*pred[12] :
                    cds, allele_id = 'fragment:{0:.2f}%'.format((pred[10]-pred[9]+1)*100/pred[12]), 'uncertain'
                    start, stop = pred[9:11]
                    s, e = start, stop   # keep s/e defined for the GFF line below (unset in the flattened source)
                else :
                    s, e = pred[9:11]
                    if pred[11] == '+' :
                        s2, e2 = s - min(int(3*((s-1)/3)), 60), e + min(3*int((pred[13]-e)/3), 600)
                        seq = genomes[pred[5]][1][(s2-1):e2]
                        lp, rp = s - s2, e2 - e
                    else :
                        s2, e2 = s - min(int(3*((s-1)/3)), 600), e + min(3*int((pred[13]-e)/3), 60)
                        seq = rc(genomes[pred[5]][1][(s2-1):e2])
                        rp, lp = s - s2, e2 - e
                    seq2 = seq[(lp):(len(seq)-rp)]
                    if seq2 not in alleles[pred[0]] :
                        if pred[3] == pred[0] and pred[7] == 1 and pred[8] == pred[12] :
                            alleles[pred[0]][seq2] = len(alleles[pred[0]]) + 1
                        else :
                            alleles[pred[0]][seq2] = 'LowQ{0}'.format(len(alleles[pred[0]]) + 1)
                        allele_id = str(alleles[pred[0]][seq2])
                        allele_file.write('>{0}_{1}\n{2}\n'.format(pred[0], allele_id, seq2))
                    else :
                        allele_id = str(alleles[pred[0]][seq2])
                    # translate in the aligned frame first, then in the alternative frame
                    frames = sorted(set([0, len(seq) % 3]))
                    for frame, aa_seq in zip(frames, transeq({'n': seq}, transl_table='starts',
                                             frame=','.join([str(f+1) for f in frames]))['n']) :
                        cds = 'CDS'
                        s0, s1 = aa_seq.find('M', int(lp/3), int(lp/3+30)), aa_seq.rfind('M', 0, int(lp/3))
                        start = s0 if s0 >= 0 else s1
                        if start < 0 :
                            cds, start = 'nostart', int(lp/3)
                        stop = aa_seq.find('X', start)
                        if 0 <= stop < lp/3 + 30 :
                            s0 = aa_seq.find('M', stop, int(lp/3+30))
                            if s0 >= 0 :
                                start = s0
                                stop = aa_seq.find('X', start)
                        if stop < 0 :
                            cds = 'nostop'
                        elif (stop-start+1)*3 <= 0.8*pred[12] :
                            cds = 'premature stop:{0:.2f}%'.format((stop-start+1)*300/pred[12])
                        if cds == 'CDS' :
                            if pred[11] == '+' :
                                start, stop = s2 + start*3 + frame, s2 + stop*3 + 2 + frame
                            else :
                                start, stop = e2 - stop*3 - 2 - frame, e2 - start*3 - frame
                            break
                    else :
                        start, stop = s, e
                    if frame > 0 :
                        cds = 'frameshift'
                if pred[5] != op[0] :
                    op = [pred[5], 0, old_prediction.get(pred[5], [])]
                old_tag = []
                for k in xrange(op[1], len(op[2])) :
                    opd = op[2][k]
                    if opd[2] < start :
                        op[1] = k + 1
                    elif opd[1] > stop :
                        break
                    elif opd[3] != pred[11] :
                        continue
                    ovl = min(opd[2], stop) - max(opd[1], start) + 1
                    if ovl >= 300 or ovl >= 0.6*(opd[2]-opd[1]+1) or ovl >= 0.6*(stop-start+1) :
                        frame = min((opd[1]-start) % 3, (opd[2]-stop) % 3)
                        if frame == 0 :
                            old_tag.append('{0}:{1}-{2}'.format(*opd))
                fout.write('{0}\t{1}\tEToKi-ortho\t{2}\t{3}\t.\t{4}\t.\tID={5};{12}inference=ortholog group:{6},allele ID:{7},matched region:{8}-{9}{10}{11}\n'.format(
                    pred[5], 'CDS' if cds == 'CDS' else 'pseudogene', start, stop, pred[11],
                    '{0}_{1}_{2}'.format(prefix, gid, pid), pred[0], allele_id, s, e,
                    '' if pred[0] == pred[3] else ',structure variant group:' + pred[3],
                    '' if cds == 'CDS' else ';pseudogene=' + cds,
                    '' if len(old_tag) == 0 else 'locus_tag={0};'.format(','.join(old_tag))))
    allele_file.close()
    return
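# write_output trims each hit onto a codon boundary before translating it: a hit that
# does not start at base 1 of the reference allele is advanced to the next in-frame
# position. A self-contained illustration of that arithmetic (1-based coordinates, as
# in the BLAST-style table above; `advance_to_frame` is an illustrative name):
#
#   def advance_to_frame(ref_start):
#       """Bases to skip so a 1-based ref_start lands on a codon boundary."""
#       ll = (ref_start - 1) % 3
#       return 0 if ll == 0 else 3 - ll
#
#   assert advance_to_frame(1) == 0   # already in frame
#   assert advance_to_frame(2) == 2   # skip 2 bases -> new start at 4
#   assert advance_to_frame(3) == 1   # skip 1 base  -> new start at 4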
def filt_per_group(data) :
    mat, ref, global_file = data
    global_differences = dict(np.load(global_file))
    nMat = mat.shape[0]
    # stack the group sequences plus the reference; non-ACGT bases become gaps ('-')
    seqs = np.vstack([np.vstack(mat.T[4]), np.array(list(ref)).view(asc2int).astype(np.uint8)[np.newaxis, :]])
    seqs[np.in1d(seqs, [65, 67, 71, 84], invert=True).reshape(seqs.shape)] = 45
    diff = compare_seq(seqs, np.zeros(shape=[seqs.shape[0], seqs.shape[0], 2], dtype=int)).astype(float)
    incompatible, distances = {}, np.zeros(shape=[seqs.shape[0], seqs.shape[0]], dtype=float)
    for i1, m1 in enumerate(mat) :
        for i2 in xrange(i1+1, nMat) :
            m2 = mat[i2]
            mut, aln = diff[i1, i2]
            if aln > 0 :
                gd = global_differences.get(tuple(sorted([m1[1], m2[1]])), (0.01, 4))
                distances[i1, i2] = distances[i2, i1] = max(0., 1 - (aln - mut)/aln/(1 - gd[0]))
                difference = mut/aln/gd[0]/gd[1]
            else :
                distances[i1, i2] = distances[i2, i1] = 0.8
                difference = 1.5
            if difference > 1. :
                incompatible[(i1, i2)] = 1

    if len(incompatible) > 0 :
        # collapse near-identical sequences into groups, keeping the first member as representative
        groups = []
        for j, m in enumerate(mat) :
            novel = 1
            for g in groups :
                if diff[g[0], j, 0] <= 0.6*(1.0 - params['clust_identity'])*diff[g[0], j, 1] :
                    g.append(j)
                    novel = 0
                    break
            if novel :
                groups.append([j])
        group_tag = {gg: g[0] for g in groups for gg in g}
        try :
            tags = {g[0]: mat[g[0]][4].tostring().decode('ascii') for g in groups}
        except :
            tags = {g[0]: mat[g[0]][4].tostring() for g in groups}
        tags.update({'REF': ref})
        ic2 = {}
        for i1, i2 in incompatible :
            t1, t2 = group_tag[i1], group_tag[i2]
            if t1 != t2 :
                t1, t2 = str(t1), str(t2)
                if t1 not in ic2 :
                    ic2[t1] = {}
                if t2 not in ic2 :
                    ic2[t2] = {}
                ic2[t1][t2] = ic2[t2][t1] = 1
        incompatible = ic2

        # build a gene tree for the group representatives
        for ite in xrange(3) :
            try :
                tmpFile = tempfile.NamedTemporaryFile(dir='.', delete=False)
                for n, s in tags.items() :
                    tmpFile.write('>X{0}\n{1}\n{2}'.format(n, s, '\n'*ite).encode('utf-8'))
                tmpFile.close()
                cmd = params[params['orthology']].format(tmpFile.name, **params) if len(tags) < 500 else params['nj'].format(tmpFile.name, **params)
                phy_run = subprocess.Popen(shlex.split(cmd), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
                gene_phy = ete3.Tree(phy_run.communicate()[0].replace("'", ''))
                break
            except :
                if ite == 2 :
                    return mat
            finally :
                os.unlink(tmpFile.name)
        for n in gene_phy.get_leaves() :
            if len(n.name) :
                n.name = n.name[1:]
        node = gene_phy.get_midpoint_outgroup()
        if node is not None :
            gene_phy.set_outgroup(node)

        # iteratively cut the branch that separates the most incompatible pairs
        for ite in xrange(3000) :
            gene_phy.ic, gene_phy.dist = {}, 0.
            rdist = sum([c.dist for c in gene_phy.get_children()])
            for c in gene_phy.get_children() :
                c.dist = rdist
            for node in gene_phy.iter_descendants('postorder') :
                if node.is_leaf() :
                    node.ic = {tuple(sorted([node.name, n2])): 1 for n2 in incompatible.get(node.name, {})}
                else :
                    node.ic = {}
                    for c in node.get_children() :
                        for x in c.ic :
                            if x in node.ic :
                                node.ic.pop(x)
                            else :
                                node.ic[x] = 1
            cut_node = max([[len(n.ic), n.dist, n] for n in gene_phy.iter_descendants('postorder')], key=lambda x: (x[0], x[1]))
            if cut_node[0] > 0 :
                cut_node = cut_node[2]
                prev_node = cut_node.up
                cut_node.detach()
                if 'REF' in cut_node.get_leaf_names() :
                    gene_phy = cut_node
                elif prev_node.is_root() :
                    gene_phy = gene_phy.get_children()[0]
                else :
                    prev_node.delete(preserve_branch_length=True)
                tips = set(gene_phy.get_leaf_names())
                for r1 in list(incompatible.keys()) :
                    if r1 not in tips :
                        rr = incompatible.pop(r1, None)
                        for r2 in rr :
                            incompatible.get(r2, {}).pop(r1, None)
                for r1 in list(incompatible.keys()) :
                    if len(incompatible[r1]) == 0 :
                        incompatible.pop(r1, None)
                if len(incompatible) == 0 :
                    break
                logger('    Iteration {0}. {1} tips remain.'.format(ite+1, len(gene_phy.get_leaf_names())))
            else :
                break

        if len(gene_phy.get_leaf_names()) < len(tags) :
            groups = {str(g[0]): g for g in groups}
            tips = sorted([nn for n in gene_phy.get_leaf_names() for nn in groups.get(n, [])])
            mat = mat[tips]
    return mat
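# filt_per_group prunes the gene tree by repeatedly detaching the branch that separates
# the most incompatible pairs. A minimal ete3 sketch of the rooting and detaching
# primitives it relies on (requires ete3; the newick string and leaf names are toy
# data, not pipeline output):
#
#   import ete3
#   t = ete3.Tree('((A:1,B:1):1,(C:1,(D:1,E:1):1):1);')
#   t.set_outgroup(t.get_midpoint_outgroup())   # root at the midpoint
#   node = t.search_nodes(name='D')[0].up       # pick an internal node to cut
#   parent = node.up
#   node.detach()                               # drop the (D,E) subtree
#   parent.delete(preserve_branch_length=True)  # splice out the now-unary parent
#   print(sorted(t.get_leaf_names()))           # -> ['A', 'B', 'C']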
def do_polish(self, reference, reads, reassemble=False, onlySNP=False) :
    if parameters.get('SNP', None) is not None :
        return self.do_polish_with_SNPs(reference, parameters['SNP'])
    else :
        # map the reads with the configured mapper
        if parameters['mapper'] == 'minimap2' :
            bams = self.__run_minimap(prefix, reference, reads)
        elif parameters['mapper'] != 'bwa' :
            bams = self.__run_bowtie(prefix, reference, reads)
        else :
            bams = self.__run_bwa(prefix, reference, reads)
        # keep only contigs that received any read coverage
        sites = {}
        for bam in bams :
            if bam is not None :
                depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
                for line in depth.stdout :
                    part = line.strip().split()
                    if len(part) > 2 and float(part[2]) > 0 :
                        sites[part[0]] = 1
        sequence = readFasta(reference)
        sequence = {n: s for n, s in sequence.items() if n in sites}
        with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[site:(site+100)] for site in xrange(0, len(s), 100)])))
        bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
        # run Pilon; '--fix all,breaks' only when a reassembly is requested
        if reassemble :
            pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
        else :
            pilon_cmd = '{pilon} --fix all --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()
        if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
            pilon_cmd = '{pilon} --fix snps,indels,gaps --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
            Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

        snps = []
        with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.changes'.format(prefix), 'w') as fout :
            for line in fin :
                if line.startswith('#') :
                    continue
                part = line.strip().split('\t')
                if part[-1] != '0/0' :
                    try :
                        if (part[6] == 'PASS' or float(part[7][-4:]) >= 0.75) and re.match(r'^[ACGTN]+$', part[4]) :
                            if (not onlySNP) or (len(part[3]) == 1 and len(part[4]) == 1) :
                                snps.append([part[0], int(part[1])-1, part[3], part[4]])
                                fout.write(line)
                    except :
                        pass
        os.unlink('{0}.mapping.vcf'.format(prefix))
        for n in sequence.keys() :
            sequence[n] = list(sequence[n])
        # apply the changes right-to-left so earlier coordinates stay valid
        for n, site, ori, alt in reversed(snps) :
            s = sequence[n]
            end = site + len(ori)
            s[site:end] = alt
        logger('Observed and corrected {0} changes using PILON'.format(len(snps)))
        with open('{0}.fasta'.format(prefix), 'w') as fout :
            for n, s in sorted(sequence.items()) :
                s = ''.join(s)
                fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[site:(site+100)] for site in xrange(0, len(s), 100)])))
        return '{0}.fasta'.format(prefix)
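# do_polish applies Pilon's calls back onto the contigs from right to left, so leftward
# coordinates stay valid while indels change the sequence length. A self-contained
# sketch of that pattern (toy data, 0-based sites; not pipeline output):
#
#   seq = list('ACGTACGT')
#   changes = [('x', 1, 'C', 'T'), ('x', 4, 'AC', 'A')]   # (contig, site, ref, alt)
#   for _, site, ref, alt in reversed(sorted(changes, key=lambda c: c[1])):
#       seq[site:site + len(ref)] = alt
#   print(''.join(seq))   # -> 'ATGTAGT'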
def get_quality(self, reference, reads) :
    if parameters['mapper'] == 'minimap2' :
        bams = self.__run_minimap(prefix, reference, reads)
    elif parameters['mapper'] != 'bwa' :
        bams = self.__run_bowtie(prefix, reference, reads)
    else :
        bams = self.__run_bwa(prefix, reference, reads)

    sequence = readFasta(reference)
    for n, s in sequence.items() :
        q = ['!'] * len(s)
        sequence[n] = [s, q]

    # accumulate per-site read depth across all BAMs
    sites = {n: np.array([0 for ss in s[1]]) for n, s in sequence.items()}
    for bam in bams :
        if bam is not None :
            depth = Popen('{samtools} depth -q 0 -Q 0 {bam}'.format(bam=bam, **parameters).split(), stdout=PIPE, universal_newlines=True)
            for line in depth.stdout :
                part = line.strip().split()
                if len(part) > 2 and float(part[2]) > 0 :
                    sites[part[0]][int(part[1])-1] += float(part[2])
    # per contig: [length, max(median depth, geometric mean depth), relative depth]
    sites = {n: [s.size, np.max([np.median(s), np.exp(np.mean(np.log(s+0.5)))-0.5]), 0.] for n, s in sites.items()}
    depth = np.array(list(sites.values()))
    depth = depth[np.argsort(-depth.T[0])]
    size = np.sum(depth.T[0])
    # average depth over the longest contigs covering half the assembly
    acc = [0, 0]
    for d in depth :
        acc[0], acc[1] = acc[0] + d[0], acc[1] + d[0]*d[1]
        if acc[0]*2 >= size :
            break
    ave_depth = acc[1]/acc[0]
    exp_mut_depth = max(ave_depth*0.2, 2.)
    for n, s in sites.items() :
        s[2] = s[1]/ave_depth
    logger('Average read depth: {0}'.format(ave_depth))
    sequence = {n: s for n, s in sequence.items() if sites[n][1] > 0.}
    with open('{0}.mapping.reference.fasta'.format(prefix), 'w') as fout :
        for n, s in sorted(sequence.items()) :
            fout.write('>{0}\n{1}\n'.format(n, '\n'.join([s[0][site:(site+100)] for site in xrange(0, len(s[0]), 100)])))
    bam_opt = ' '.join(['--bam {0}'.format(b) for b in bams if b is not None])
    pilon_cmd = '{pilon} --fix all,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
    Popen(pilon_cmd.split(), stdout=PIPE, universal_newlines=True).communicate()
    if not os.path.isfile('{0}.mapping.vcf'.format(prefix)) :
        pilon_cmd = '{pilon} --fix snps,indels,gaps,breaks --vcf --output {prefix}.mapping --genome {prefix}.mapping.reference.fasta {bam_opt}'.format(bam_opt=bam_opt, **parameters)
        Popen(pilon_cmd.split(), stdout=PIPE, stderr=PIPE, universal_newlines=True).communicate()

    cont_depth = [float(d) for d in parameters['cont_depth'].split(',')]
    logger('Contigs with less than {0} depth will be removed from the assembly'.format(cont_depth[0]*ave_depth))
    logger('Contigs with more than {0} depth will be treated as duplicates'.format(cont_depth[1]*ave_depth))
    indels = []
    with open('{0}.mapping.vcf'.format(prefix)) as fin, open('{0}.mapping.difference'.format(prefix), 'w') as fout :
        for line in fin :
            if line.startswith('#') :
                continue
            part = line.strip().split('\t')
            if sites[part[0]][2] < cont_depth[0] or sites[part[0]][2] >= cont_depth[1] :
                continue
            if part[-1] == '1/1' :
                # 0-based site; assigned here because the flattened source referenced
                # `site` in this branch before it was ever set
                site = int(part[1]) - 1
                if len(part[3]) > 1 :
                    indels.append([part[0], max(0, site-1), site-1+len(part[3])+2])
                elif len(part[4]) > 1 and part[4] != '<DUP>' :
                    indels.append([part[0], max(0, site-2), site-1+len(part[3])+2])
            try :
                if part[-1] == '0/0' and len(part[3]) == 1 and len(part[4]) == 1 :
                    pp = part[7].split(';')
                    dp = float(pp[0][3:])
                    af = 100 - sorted([float(af) for af in pp[6][3:].split(',')])[-1]
                    if af <= 20 and dp >= 2 and dp*af/100. <= exp_mut_depth and (part[6] == 'PASS' or (part[6] == 'LowCov' and parameters['metagenome'])) :
                        site = int(part[1])-1
                        qual = chr(int(pp[4][3:])+33)
                        sequence[part[0]][1][site] = qual
                    else :
                        fout.write(line)
                else :
                    fout.write(line)
            except :
                fout.write(line)
    # zero out base qualities around indels
    for n, s, e in indels :
        sequence[n][1][s:e] = ['!'] * len(sequence[n][1][s:e])

    if self.snps is not None :
        for n, snvs in self.snps.items() :
            for site, snv in snvs :
                if snv.find('N') >= 0 :
                    continue
                if snv.startswith('+') :
                    s, e = site-4, site+3+len(snv)
                else :
                    s, e = site-4, site+4
                for k in xrange(s, e) :
                    sequence[n][1][k] = max(chr(40+33), sequence[n][1][k])

    with open('{0}.result.fastq'.format(prefix), 'w') as fout :
        p = prefix.rsplit('/', 1)[-1]
        for n, (s, q) in sequence.items() :
            if sites[n][2] >= cont_depth[0] :
                fout.write('@{0} {3} {4} {5}\n{1}\n+\n{2}\n'.format(p+'_'+n, s, ''.join(q), *sites[n]))
    os.unlink('{0}.mapping.vcf'.format(prefix))
    logger('Final result is written into {0}'.format('{0}.result.fastq'.format(prefix)))
    return '{0}.result.fastq'.format(prefix)
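# get_quality summarises per-contig depth as max(median, geometric mean); the geometric
# mean uses a +0.5 pseudocount so zero-coverage sites do not send log() to -inf. A
# standalone sketch of that estimator (toy depth array, not pipeline data):
#
#   import numpy as np
#   d = np.array([0., 10., 12., 11., 9.])
#   geo = np.exp(np.mean(np.log(d + 0.5))) - 0.5
#   summary = max(np.median(d), geo)   # robust against dips and zero-coverage islands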