def iter_readGFF(fname):
    """Parse one GFF3 file (with embedded ##FASTA section) into sequences and CDS records.

    Parameters
    ----------
    fname : str
        Path to a (possibly compressed) GFF file; opened via the project's `uopen`.

    Returns
    -------
    (seq, cds) : tuple of dict
        seq maps sequence name -> [source_file, sequence_string (upper-cased)].
        cds maps locus name -> [source_file, seqName, start, end, strand,
        sha1-hash-as-int, nucleotide_sequence]; the sequence is emptied when
        extraction fails or `checkCDS` rejects it.
    """
    seq, cds = {}, {}
    names = {}
    with uopen(fname) as fin:
        sequenceMode = False
        for line in fin:
            if line.startswith('#'):
                continue
            elif line.startswith('>'):
                # FASTA section begins; every subsequent non-'>' line is sequence data.
                sequenceMode = True
                name = line[1:].strip().split()[0]
                assert name not in seq, logger(
                    'Error: duplicated sequence name {0}'.format(name))
                seq[name] = [fname, []]
            elif sequenceMode:
                seq[name][1].extend(line.strip().split())
            else:
                part = line.strip().split('\t')
                if len(part) > 2:
                    # Resolve a feature name, in priority order:
                    # locus_tag > name inherited from Parent > Name > ID.
                    name = re.findall(r'locus_tag=([^;]+)', part[8])
                    if len(name) == 0:
                        parent = re.findall(r'Parent=([^;]+)', part[8])
                        if len(parent) and parent[0] in names:
                            name = names[parent[0]]
                    if len(name) == 0:
                        name = re.findall(r'Name=([^;]+)', part[8])
                    if len(name) == 0:
                        name = re.findall(r'ID=([^;]+)', part[8])
                    if part[2] == 'CDS':
                        assert len(name) > 0, logger(
                            'Error: CDS has no name. {0}'.format(line))
                        # source_file, seqName, Start, End, Direction, hash, Sequences
                        cds[name[0]] = [
                            fname, part[0], int(part[3]), int(part[4]), part[6], 0, ''
                        ]
                    else:
                        # Remember non-CDS feature IDs so child CDS rows can
                        # inherit a name through their Parent= attribute.
                        ids = re.findall(r'ID=([^;]+)', part[8])
                        if len(ids):
                            names[ids[0]] = name
    # Join the accumulated FASTA fragments into one upper-case string per contig.
    for n in seq:
        seq[n][1] = ''.join(seq[n][1]).upper()
    # Extract, orient and fingerprint each CDS sequence.
    for n in cds:
        c = cds[n]
        try:
            c[6] = seq[c[1]][1][(c[2] - 1):c[3]]
            if c[4] == '-':
                c[6] = rc(c[6])  # reverse-complement minus-strand features
            if not checkCDS(n, c[6]):
                c[6] = ''
            else:
                c[5] = int(hashlib.sha1(c[6].encode('utf-8')).hexdigest(), 16)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # propagate; any extraction failure (e.g. missing contig) still
            # degrades to an empty sequence as before.
            c[6] = ''
    return seq, cds
def alignAgainst(data) :
    # Align one query genome against a reference with minimap2, filter redundant
    # alignments, mask repetitive / low-quality regions, call variants from the
    # CIGAR strings, and write everything out as a gzipped GFF3 file.
    # Returns [tag, gff_path] on success, or [tag, query] if the query cannot be read.
    prefix, minimap2, db, (rtag, reference), (tag, query) = data
    try :
        qrySeq, qryQual = readFastq(query)
    except :
        # Unreadable query: hand the raw file back to the caller unchanged.
        return [tag, query]
    refSeq, refQual = readFastq(reference)
    # -c requests PAF output with cg:Z CIGAR tags; parsed line-by-line below.
    proc = subprocess.Popen('{0} -c -t1 --frag=yes -A2 -B8 -O20,40 -E3,2 -r20 -g200 -p.000001 -N5000 -f1000,5000 -n2 -m30 -s30 -z200 -2K10m --heap-sort=yes --secondary=yes {1} {2}'.format(
        minimap2, db, query).split(), stdout=subprocess.PIPE, universal_newlines=True)
    alignments = []
    for lineId, line in enumerate(proc.stdout) :
        part = line.strip().split('\t')
        # PAF columns: 0 qry name, 1 qry len, 2-3 qry start/end, 4 strand,
        # 5 ref name, 6 ref len, 7-8 ref start/end, 9 matches, 10 aln length.
        part[1:4] = [int(p) for p in part[1:4]]
        part[6:11] = [int(p) for p in part[6:11]]
        # part[13] is a "XX:t:value" SAM-style tag; [5:] strips the 5-char
        # prefix. Presumably the alignment score tag — TODO confirm tag order.
        part[11] = float(part[13][5:])
        # part[12] = unique alignment id; part[13] = score density (score/aln len).
        part[12], part[13] = lineId, part[11]/part[10]
        # part[14] = mutations, part[15] = ref repeats, part[16] = qry repeats.
        part[14:17] = [[], [], []]
        alignments.append(part)
    proc.wait()
    # Build "delete chains": an alignment may be deleted if a near-identical
    # span is covered by a clearly better-scoring alignment.
    deleteChain = {}
    nItem = len(alignments)
    alignments.sort(key=lambda x:x[:4])   # group by query name/coords
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] :
                break
            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if s > e+10 :
                break
            # >=90% reciprocal overlap on the query and a 0.1 score-density gap.
            if (e-s) >= 0.9 * (p1[3]-p1[2]) and p2[13] - 0.1 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[3]-p2[2]) and p1[13] - 0.1 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])
    alignments.sort(key=lambda x:x[5:9])  # group by reference name/coords
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] :
                break
            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if s > e+10 :
                break
            # Same idea on the reference side, with a tighter 0.05 gap.
            if (e-s) >= 0.9 * (p1[8]-p1[7]) and p2[13] - 0.05 >= p1[13] :
                deleteChain[p1[12]] = deleteChain.get(p1[12], set([])) | set([p2[12]])
            if (e-s) >= 0.9 * (p2[8]-p2[7]) and p1[13] - 0.05 >= p2[13] :
                deleteChain[p2[12]] = deleteChain.get(p2[12], set([])) | set([p1[12]])
    # Resolve chains best-score-first: an alignment is deleted only if at least
    # one of its dominators has itself survived so far.
    deleted = {}
    for p in sorted(alignments, key=lambda x:x[11], reverse=True) :
        id = p[12]
        if id in deleteChain :
            for jd in deleteChain[id] :
                if jd not in deleted :
                    deleted[id] = 1
                    break
    alignments = [p for p in alignments
                  if p[12] not in deleted]
    # repeats in qry: overlapping surviving alignments mark repetitive query spans.
    nItem = len(alignments)
    alignments.sort(key=lambda x:x[:4])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[0] != p2[0] :
                break
            s, e = max(p1[2], p2[2]), min(p1[3], p2[3])
            if e > s :
                p1[16].append([s, e])
                p2[16].append([s, e])
            else :
                break
    # repeats in ref
    alignments.sort(key=lambda x:x[5:9])
    for i1, p1 in enumerate(alignments) :
        for i2 in xrange(i1+1, nItem) :
            p2 = alignments[i2]
            if p1[5] != p2[5] :
                break
            s, e = max(p1[7], p2[7]), min(p1[8], p2[8])
            if e > s :
                p1[15].append([s, e])
                p2[15].append([s, e])
            else :
                break
    # maskedRegion maps (refName, site) -> 0 (low quality / indel flank)
    # or 1 (repeat); refRepeat collects repeat intervals in ref coordinates.
    maskedRegion = {}
    refRepeat = []
    for p in alignments :
        # prepare a unique set of repeat region (merge query-side intervals
        # that overlap or lie within 20 bp of each other)
        qryRepeat = []
        if len(p[16]) > 0 :
            qryRepeat.append(p[16][0])
            for pp in p[16][1:] :
                if pp[0] > qryRepeat[-1][1]+20 :
                    qryRepeat.append(pp)
                elif pp[1] > qryRepeat[-1][1]:
                    qryRepeat[-1][1] = pp[1]
        ref = [refSeq[p[5]], refQual[p[5]]]
        qry = [qrySeq[p[0]], qryQual[p[0]]]
        cigar = p[-1][5:]  # strip the "cg:Z:" tag prefix from the last PAF field
        d = 1 if p[4] == '+' else -1
        # Orient query repeat intervals to alignment walking order; the two
        # trailing -1 slots will receive the translated ref coordinates.
        if d < 0 :
            qryRepeat = [[q[1], q[0], -1, -1] for q in qryRepeat]
        else :
            qryRepeat = [[q[0], q[1], -1, -1] for q in reversed(qryRepeat)]
        mut = []
        # Current (ref, qry) cursor; on '-' strand the query is walked backwards.
        alnSite = [p[7], p[2] if d > 0 else p[3]-1]
        for cl, ct in re.findall(r'(\d+)([MID])', cigar) :
            cl = int(cl)
            if ct == 'M' :
                # extract aligned sequences
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d > 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)])
                # NOTE(review): on the '-' strand this reverses qry[0] (sequence),
                # not qry[1] (quality) — looks like a bug; confirm against upstream.
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)]))
                e = [alnSite[0]+cl, alnSite[1]+cl*d]
                # Translate any query repeat interval fully entered by this
                # block into ref coordinates; completed ones move to p[15].
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e[1] :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0] + d*(qr[0] - alnSite[1])
                        if d*qr[1] <= d*e[1] :
                            qr[3] = alnSite[0] + d*(qr[1] - alnSite[1])
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                # Column-by-column comparison: mask low-quality columns
                # (ord < 43, presumably Q<10 at Phred+33 — TODO confirm) and
                # record substitutions, ignoring Ns.
                for id, (rr, rr1, qq, qq1) in enumerate(np.array([list(r), list(r1), list(q), list(q1)]).T) :
                    if ord(rr1) < 43 or ord(qq1) < 43 :
                        maskedRegion[(p[5], alnSite[0]+id)] = 0
                    if rr != qq and rr != 'N' and qq != 'N' :
                        mut.append([alnSite[0]+id, alnSite[1]+id*d, rr, qq, p[4]])
                alnSite = e
            elif ct == 'I' :
                # Insertion relative to the reference.
                # NOTE(review): this strand test is `d < 0` where the 'M' branch
                # uses `d > 0`, and q1 again reverses qry[0] — both look
                # inconsistent; verify against the original project.
                q = qry[0][alnSite[1]:alnSite[1]+cl] if d < 0 else rc(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)] )
                q1 = qry[1][alnSite[1]:alnSite[1]+cl] if d > 0 else ''.join(reversed(qry[0][(alnSite[1]-cl+1):(alnSite[1]+1)] ))
                e = alnSite[1] + cl*d
                for qid in xrange(len(qryRepeat)-1, -1, -1) :
                    qr = qryRepeat[qid]
                    if d*qr[0] <= d*e :
                        if qr[2] == -1 :
                            qr[2] = alnSite[0]
                        if d*qr[1] <= d*e :
                            qr[3] = alnSite[0]
                            p[15].append(qr[2:])
                            del qryRepeat[qid]
                    else :
                        break
                # Record the insertion only when every inserted base is
                # high-quality; always mask the two flanking ref sites.
                if ord(min(list(q1))) >= 43 :
                    mut.append([alnSite[0], min(alnSite[1], e), '.', '+' + q, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[1] = e
            elif ct == 'D' :
                # Deletion relative to the reference; same quality gate on the
                # deleted reference bases.
                r = ref[0][alnSite[0]:alnSite[0]+cl]
                r1 = ref[1][alnSite[0]:alnSite[0]+cl]
                if ord(min(list(r1))) >= 43 :
                    mut.append([alnSite[0], int(alnSite[1]+0.5*d), '.', '-' + r, p[4]])
                for site in xrange(alnSite[0], alnSite[0]+2) :
                    maskedRegion[(p[5], site)] = 0
                alnSite[0]+=cl
        p[14] = mut
        refRepeat.extend([ [p[5], pp[0], pp[1]] for pp in p[15] ])
    # Merge reference repeat intervals (<=20 bp gaps) and mask them with value 1.
    repeats = []
    if len(refRepeat) :
        refRepeat.sort()
        repeats = [refRepeat[0]]
        for p in refRepeat[1:] :
            if p[0] != repeats[-1][0] or p[1] - 20 > repeats[-1][2] :
                repeats.append(p)
            elif p[2] > repeats[-1][2] :
                repeats[-1][2] = p[2]
    for p in repeats :
        for site in xrange(p[1], p[2]) :
            maskedRegion[(p[0], site)] = 1
    # Re-derive contiguous masked intervals for the GFF "unsure" records.
    repeats = []
    for cont, site in sorted(maskedRegion) :
        if len(repeats) == 0 or repeats[-1][0] != cont or repeats[-1][2]+1 < site :
            repeats.append([cont, site, site])
        else :
            repeats[-1][2] = site
    # Keep variants outside masked regions; alignments shorter than 100 bp
    # (matches) are dropped entirely.
    mutations = []
    alignments = [aln for aln in alignments if aln[9] >= 100]
    for aln in alignments :
        for m in aln[14] :
            if len(m[3]) == 1 :
                # SNP: rejected by any mask (low quality or repeat).
                if (aln[5], m[0]) not in maskedRegion :
                    mutations.append([aln[5], aln[0]] + m)
            elif maskedRegion.get((aln[5], m[0]), 0) != 1 :
                # Indel: rejected only by repeat masks; a deletion is also
                # rejected if its far end touches any masked site.
                if m[3].startswith('-') and \
                        maskedRegion.get((aln[5], m[0]+len(m[3])-2), 0) > 0 :
                    continue
                mutations.append([aln[5], aln[0]] + m)
    # Emit GFF3: alignment blocks, uncertain regions, then variants
    # (coordinates are converted from 0-based to 1-based here).
    with uopen(prefix + '.gff.gz', 'w') as fout :
        fout.write('##gff-version 3\n')
        fout.write('## Reference: {0}\n'.format(reference))
        fout.write('## Query: {0}\n'.format(query))
        fout.write('## Tag: {0}\n'.format(tag))
        for aln in alignments :
            if aln[5] == aln[0] and aln[2] == aln[7] and aln[3] == aln[8] :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Self%20Alignments"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3],
                ))
            else :
                fout.write('{0}\trefMapper\tmisc_feature\t{1}\t{2}\t{3}\t{4}\t.\t/inference="Aligned%20with%20{5}:{6}-{7}"\n'.format(
                    aln[5], aln[7]+1, aln[8], aln[9], aln[4], aln[0], aln[2]+1, aln[3],
                ))
        for p in repeats :
            fout.write('{0}\trefMapper\tunsure\t{1}\t{2}\t.\t+\t.\t/inference="Uncertain%20base%20calling%20or%20ambigious%20alignment"\n'.format(
                p[0], p[1]+1, p[2]+1,
            ))
        for mut in mutations :
            # End coordinates extend over the deleted (ref) or inserted (qry)
            # span; long alleles are abbreviated to "<base>[<len>bps]".
            e1 = mut[2] if not mut[5].startswith('-') else mut[2] + len(mut[5]) - 2
            e2 = mut[3] if not mut[5].startswith('+') else mut[3] + len(mut[5]) - 2
            if len(mut[5]) > 26 :
                mut[5] = '{0}[{1}bps]'.format(mut[5][0], len(mut[5])-1)
            fout.write('{0}\trefMapper\tvariation\t{1}\t{2}\t.\t+\t.\t/replace="{7}";/compare="{3}:{4}-{5}:{8}";/origin="{6}"\n'.format(
                mut[0], mut[2]+1, e1+1, mut[1], mut[3]+1, e2+1, mut[4], mut[5], mut[6]
            ))
    return [tag, prefix + '.gff.gz']
def write_output(prefix, prediction, genomes, clust_ref, old_prediction):
    # Convert a tab-separated prediction table into (a) a FASTA of allele
    # sequences ('{prefix}.allele.fna') and (b) an annotated GFF
    # ('{prefix}.EToKi.gff'), assigning allele IDs and CDS/pseudogene status.
    # Column meanings below are inferred from usage — confirm against the
    # producer of `prediction`: 0 ortholog group, 2 query id, 4 genome tag,
    # 5 contig, 7-8 match range in the reference allele, 9-10 genomic range,
    # 12 reference allele length, 13 contig length.
    predictions, alleles = {}, {}
    allele_file = open('{0}.allele.fna'.format(prefix), 'w')
    prediction = pd.read_csv(prediction, sep='\t', header=None).values
    for part in prediction:
        #with open(prediction) as fin :
        #for line in fin :
        #part = line.strip().split()
        if part[0] not in alleles:
            # First sighting of this ortholog group: its cluster reference
            # sequence becomes allele 1.
            alleles[part[0]] = {clust_ref[part[0]]: 1}
            allele_file.write('>{0}_{1}\n{2}\n'.format(part[0], 1, clust_ref[part[0]]))
        # l/r: how far the hit can be extended left/right without leaving the
        # reference allele or the contig; d: orientation of the genomic hit.
        if part[9] < part[10]:
            l, r, d = min(part[7] - 1, part[9] - 1), min(part[12] - part[8], part[13] - part[10]), 1
        else:
            l, r, d = min(part[7] - 1, part[13] - part[9]), min(part[12] - part[8], part[10] - 1), -1
        # Extend to the allele boundary when close (<=6 bp); otherwise trim the
        # hit so the matched region starts/ends on a codon boundary.
        if l <= 6 and part[7] - l == 1:
            part[7], part[9] = part[7] - l, part[9] - l * d
        else:
            ll = (part[7] - 1) % 3
            if ll > 0:
                part[7], part[9] = part[7] + 3 - ll, part[9] + (3 - ll) * d
        if r <= 6 and part[8] + r == part[12]:
            part[8], part[10] = part[8] + r, part[10] + r * d
        else:
            rr = (part[12] - part[8]) % 3
            if rr > 0:
                part[8], part[10] = part[8] - 3 + rr, part[10] - (3 + rr) * d
        # Normalise genomic coordinates to (low, high, strand).
        if part[9] < part[10]:
            part[9:12] = part[9], part[10], '+'
        else:
            part[9:12] = part[10], part[9], '-'
        if part[4] not in predictions:
            predictions[part[4]] = []
        elif predictions[part[4]][-1][2] == part[2]:
            # Consecutive fragments of the same query: merge them if they are
            # nearby (<500 bp) on the same contig and consistent in strand...
            prev = predictions[part[4]][-1]
            if prev[5] == part[5] and part[7] - prev[8] < 500:
                if part[11] == '+' and part[9] - prev[10] < 500:
                    prev[8], prev[10] = part[8], part[10]
                    continue
                elif part[11] == '-' and prev[9] - part[10] < 500:
                    prev[8], prev[9] = part[8], part[9]
                    continue
            # ...otherwise mark both pieces as fragmented (flag -1 in column 1).
            predictions[part[4]][-1][1], part[1] = -1, -1
        predictions[part[4]].append(part)
    # op caches the old annotation list for the current contig:
    # [contig, scan cursor, old features].
    op = ['', 0, []]
    with open('{0}.EToKi.gff'.format(prefix), 'w') as fout:
        for gid, (g, predict) in enumerate(predictions.items()):
            predict.sort(key=itemgetter(5, 9, 10))
            for pid, pred in enumerate(predict):
                if pred[1] == -1 or (pred[10] - pred[9] + 1) <= 0.8 * pred[12]:
                    # Fragmented or too-short hit: report as a fragment with
                    # an uncertain allele.
                    # NOTE(review): s and e are only assigned in the else branch;
                    # if the very first prediction takes this path, the fout.write
                    # below would hit an unbound s/e — confirm upstream guarantees.
                    cds, allele_id = 'fragment:{0:.2f}%'.format(
                        (pred[10] - pred[9] + 1) * 100 / pred[12]), 'uncertain'
                    start, stop = pred[9:11]
                else:
                    s, e = pred[9:11]
                    # Extract the hit plus flanking sequence (up to 60 bp
                    # upstream / 600 bp downstream in reading orientation),
                    # keeping codon phase; lp/rp are the flank lengths.
                    if pred[11] == '+':
                        s2, e2 = s - min(int(3 * ((s - 1) / 3)), 60), e + min(
                            3 * int((pred[13] - e) / 3), 600)
                        seq = genomes[pred[5]][1][(s2 - 1):e2]
                        lp, rp = s - s2, e2 - e
                    else:
                        s2, e2 = s - min(int(3 * ((s - 1) / 3)), 600), e + min(
                            3 * int((pred[13] - e) / 3), 60)
                        seq = rc(genomes[pred[5]][1][(s2 - 1):e2])
                        rp, lp = s - s2, e2 - e
                    seq2 = seq[(lp):(len(seq) - rp)]
                    # Assign an allele number; partial/non-self matches get a
                    # 'LowQ' prefix. New alleles are appended to the FASTA.
                    if seq2 not in alleles[pred[0]]:
                        if pred[3] == pred[0] and pred[7] == 1 and pred[
                                8] == pred[12]:
                            alleles[pred[0]][seq2] = len(alleles[pred[0]]) + 1
                        else:
                            alleles[pred[0]][seq2] = 'LowQ{0}'.format(
                                len(alleles[pred[0]]) + 1)
                        allele_id = str(alleles[pred[0]][seq2])
                        allele_file.write('>{0}_{1}\n{2}\n'.format(
                            pred[0], allele_id, seq2))
                    else:
                        allele_id = str(alleles[pred[0]][seq2])
                    # Translate in frame 0 and, if the extracted length is not
                    # a multiple of 3, also in the shifted frame; look for a
                    # plausible start (M) near the hit and the next stop (X).
                    frames = sorted(set([0, len(seq) % 3]))
                    for frame, aa_seq in zip(
                            frames,
                            transeq({'n': seq}, transl_table='starts',
                                    frame=','.join(
                                        [str(f + 1) for f in frames]))['n']):
                        cds = 'CDS'
                        s0, s1 = aa_seq.find('M', int(lp / 3), int(lp / 3 + 30)), aa_seq.rfind(
                            'M', 0, int(lp / 3))
                        start = s0 if s0 >= 0 else s1
                        if start < 0:
                            cds, start = 'nostart', int(lp / 3)
                        stop = aa_seq.find('X', start)
                        # An early stop near the 5' end: retry from the next
                        # downstream start codon.
                        if 0 <= stop < lp / 3 + 30:
                            s0 = aa_seq.find('M', stop, int(lp / 3 + 30))
                            if s0 >= 0:
                                start = s0
                                stop = aa_seq.find('X', start)
                        if stop < 0:
                            cds = 'nostop'
                        elif (stop - start + 1) * 3 <= 0.8 * pred[12]:
                            cds = 'premature stop:{0:.2f}%'.format(
                                (stop - start + 1) * 300 / pred[12])
                        if cds == 'CDS':
                            # Map amino-acid positions back to genomic coordinates.
                            if pred[11] == '+':
                                start, stop = s2 + start * 3 + frame, s2 + stop * 3 + 2 + frame
                            else:
                                start, stop = e2 - stop * 3 - 2 - frame, e2 - start * 3 - frame
                            break
                        else:
                            start, stop = s, e
                            if frame > 0:
                                cds = 'frameshift'
                # Collect locus_tags of old annotations that overlap this call
                # (>=300 bp or >=60% of either feature) in the same frame.
                if pred[5] != op[0]:
                    op = [pred[5], 0, old_prediction.get(pred[5], [])]
                old_tag = []
                for k in xrange(op[1], len(op[2])):
                    opd = op[2][k]
                    if opd[2] < start:
                        op[1] = k + 1  # advance the cursor past features left of us
                    elif opd[1] > stop:
                        break
                    elif opd[3] != pred[11]:
                        continue
                    ovl = min(opd[2], stop) - max(opd[1], start) + 1
                    if ovl >= 300 or ovl >= 0.6 * (
                            opd[2] - opd[1] + 1) or ovl >= 0.6 * (stop - start + 1):
                        frame = min((opd[1] - start) % 3, (opd[2] - stop) % 3)
                        if frame == 0:
                            old_tag.append('{0}:{1}-{2}'.format(*opd))
                fout.write(
                    '{0}\t{1}\tEToKi-ortho\t{2}\t{3}\t.\t{4}\t.\tID={5};{12}inference=ortholog group:{6},allele ID:{7},matched region:{8}-{9}{10}{11}\n'
                    .format(
                        pred[5],
                        'CDS' if cds == 'CDS' else 'pseudogene',
                        start,
                        stop,
                        pred[11],
                        '{0}_{1}_{2}'.format(prefix, gid, pid),
                        pred[0],
                        allele_id,
                        s,
                        e,
                        '' if pred[0] == pred[3] else ',structure variant group:' + pred[3],
                        '' if cds == 'CDS' else ';pseudogene=' + cds,
                        '' if len(old_tag) == 0 else 'locus_tag={0};'.format(
                            ','.join(old_tag)),
                    ))
    allele_file.close()
    return
def iter_map_bsn(data):
    # Map one genome's sequences against the cluster references with uberBlast,
    # group the passing hits, render each group's alignment as a byte array,
    # and save groups plus overlap pairs to '{prefix}.{id}.bsn.npz'.
    # Returns the output prefix. Column layout of `blastab` rows (BLAST-like
    # tabular: 0 qry, 1 ref, 2 identity, 6-7 qry range, 8-9 ref range,
    # 11 score, 12 qry length, 14 CIGAR, 15 hit id, 16 merged-hit metadata)
    # is inferred from usage — confirm against uberBlast.
    prefix, clust, id, taxon, seq, params = data
    gfile, out_prefix = '{0}.{1}.genome'.format(prefix, id), '{0}.{1}'.format(
        prefix, id)
    # Write the genome to a temporary FASTA for uberBlast, removed afterwards.
    with open(gfile, 'w') as fout:
        for n, s in seq:
            fout.write('>{0}\n{1}\n'.format(n, s))
    blastab, overlap = uberBlast(
        '-r {0} -q {1} -f -m -o --blastn --ublast --min_id {2} --min_cov {3} -t 2 -s 2 -e 0,3'
        .format(gfile, clust, params['match_identity'] - 0.1,
                params['match_frag_len']).split())
    os.unlink(gfile)
    groups = []      # single-hit groups
    groups2 = {}     # merged multi-hit groups, keyed by tab[16][3]
    # ids[hit_id] = True for hits passing the identity/length thresholds.
    ids = np.zeros(np.max(blastab.T[15]) + 1, dtype=bool)
    for tab in blastab:
        if tab[16][1] >= params['match_identity'] and tab[16][2] >= max(
                params['match_prop'] * tab[12],
                params['match_len']) and tab[16][2] >= max(
                    params['match_prop2'] * tab[12], params['match_len2']):
            ids[tab[15]] = True
            if len(tab[16]) <= 4:
                # Stand-alone hit: [qry, ref, score, identity, None, 0, [row]].
                groups.append(tab[:2].tolist() + tab[16][:2] + [None, 0, [tab[:16]]])
            else:
                # Part of a merged chain: optionally also keep it as its own
                # group if it passes the thresholds on its own length.
                length = tab[7] - tab[6] + 1
                if tab[2] >= params['match_identity'] and length >= max(
                        params['match_prop'] * tab[12],
                        params['match_len']) and length >= max(
                            params['match_prop2'] * tab[12],
                            params['match_len2']):
                    groups.append(tab[:2].tolist() + [tab[11], tab[2], None, 0, [tab[:16]]])
                if tab[16][3] not in groups2:
                    # Pre-size the member list; tab[16][3:] lists member hit ids.
                    groups2[tab[16][3]] = tab[:2].tolist() + tab[16][:2] + [
                        None, 0, [[]] * (len(tab[16]) - 3)
                    ]
                # Place this row at its slot within the merged chain.
                x = [i for i, t in enumerate(tab[16][3:]) if t == tab[15]][0]
                groups2[tab[16][3]][6][x] = tab[:16]
        else:
            tab[2] = -1  # flag failing hits as rejected
    groups.extend(list(groups2.values()))
    # Keep only overlap pairs where both hits passed the filter.
    overlap = overlap[ids[overlap.T[0]] & ids[overlap.T[1]], :2]
    # convA/convB translate hit id -> group index for single-member (A) and
    # multi-member (B) groups; -1 means unassigned.
    convA, convB = np.tile(-1, np.max(blastab.T[15]) + 1), np.tile(
        -1, np.max(blastab.T[15]) + 1)
    seq = dict(seq)
    for id, group in enumerate(groups):
        # group[4]: alignment image over the query, initialised to '-' (45).
        group[4] = np.zeros(group[6][0][12], dtype=np.uint8)
        group[4].fill(45)
        group[5] = id
        group[6] = np.array(group[6])
        if group[6].shape[0] == 1:
            convA[group[6].T[15].astype(int)] = id
        else:
            convB[group[6].T[15].astype(int)] = id
        max_sc = 0
        for tab in group[6]:
            # Reference-side matched sequence, reverse-complemented when the
            # ref coordinates are descending.
            matchedSeq = seq[tab[1]][tab[8] - 1:tab[9]] if tab[8] < tab[9] else rc(
                seq[tab[1]][tab[9] - 1:tab[8]])
            # Walk the CIGAR: M consumes matchedSeq, D consumes without
            # emitting, I emits gaps; f tracks the net frame shift and sc the
            # matched length per frame (only frames 0 and the final f scored).
            ms, i, f, sc = [], 0, 0, [0, 0, 0]
            for s, t in re.findall(r'(\d+)([A-Z])', tab[14]):
                s = int(s)
                if t == 'M':
                    ms.append(matchedSeq[i:i + s])
                    i += s
                    sc[f] += s
                elif t == 'D':
                    i += s
                    f = (f - s) % 3
                else:
                    ms.append('-' * s)
                    f = (f + s) % 3
            # Paint the aligned bases into the group image as ASCII codes.
            group[4][tab[6] - 1:tab[7]] = np.array(list(
                ''.join(ms))).view(asc2int).astype(np.uint8)
            max_sc += max(sc[0], sc[f])
        group[2] = max_sc
    # Re-express overlaps as group-index pairs (all A/B combinations), plus the
    # implicit overlap between the A and B group of the same hit.
    overlap = np.vstack([np.vstack([m, n]).T[(m>=0) & (n >=0)] for m in (convA[overlap.T[0]], convB[overlap.T[0]]) \
                         for n in (convA[overlap.T[1]], convB[overlap.T[1]]) ] + [np.vstack([convA, convB]).T[(convA >= 0) & (convB >=0)]])
    np.savez_compressed(out_prefix + '.bsn.npz',
                        bsn=np.array(groups, dtype=object),
                        ovl=overlap)
    return out_prefix