def annotate_from_ref(
        align_json=None, ref_gtf=None, outfile=None, outfmt=None, debug=False,
    ):
    """ Transfer reference `gene` annotations onto the padded alignments

    For every `gene` record yielded by `gtf_parser(ref_gtf)`, the matching
    padded alignment is scanned for the records whose reference coordinate
    (element 0) equals the gene start and end. The query coordinates
    (element 3) found there become the start/end of a new GTF row, which is
    printed together with segment statistics and the called regions.

    Args:
        align_json: Passed to `load_slot_json` to read the
            `padded_alignments` slot
        ref_gtf: Passed to `gtf_parser`; only rows whose feature is `gene`
            are used
        outfile (str): Path to output file. Output goes to stdout when None
        outfmt: Not read by this function
        debug (bool): Not read by this function

    Returns:
        None. One GTF row per reference gene is printed to the output.
    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')
    try:
        jaln = load_slot_json(align_json, 'padded_alignments')
        # Map reference name -> full alignment name
        refmap = {parse_seq_id(k)['ref']: k for k in jaln}

        for gr in gtf_parser(ref_gtf):
            if gr.feature != 'gene':
                continue

            alignment = jaln[refmap[gr.chrom]]
            # Convert the 1-based inclusive GTF interval to 0-based half-open
            ref_s = gr.start - 1
            ref_e = gr.end

            # Get alignment start
            # NOTE: if no record matches, aln_s is left at the last index
            for aln_s in range(len(alignment)):
                if alignment[aln_s][0] == ref_s:
                    break
            # Skip forward over records that have no query coordinate
            while alignment[aln_s][3] == -1:
                aln_s += 1

            # Get alignment end
            # NOTE: if no record matches, aln_e is left at index 0
            for aln_e in range(len(alignment) - 1, -1, -1):
                if alignment[aln_e][0] == ref_e:
                    break
            # Skip backward over records that have no query coordinate
            while alignment[aln_e][3] == -1:
                aln_e += -1

            con_s = alignment[aln_s][3]
            con_e = alignment[aln_e][3]
            segment = alignment[aln_s:aln_e + 1]

            new_gr = GTFRow()
            new_gr.chrom, new_gr.source = (refmap[gr.chrom], 'haphpipe')
            new_gr.feature = gr.feature
            new_gr.start, new_gr.end = (con_s + 1, con_e)
            new_gr.score, new_gr.strand, new_gr.frame = (
                '.', gr.strand, gr.frame
            )
            new_gr.attrs['name'] = gr.attrs['name']

            # Include statistics in attributes
            new_gr.attrs.update(get_seg_stats(segment))

            # Get the regions that are actually called
            creg = called_regions(segment)
            new_gr.attrs['call_reg'] = ','.join('%d-%d' % t for t in creg)
            new_gr.attrs['call_len'] = sum((t[1] - t[0] + 1) for t in creg)
            print(new_gr, file=outh)
    finally:
        # Close only the handle opened here; stdout must stay open
        if outfile is not None:
            outh.close()
def extract_pairwise(
        align_json=None, outfile=None, outfmt=None, refreg=None, debug=False,
    ):
    """ Extract sequences or tables from a pairwise alignment JSON

    The output depends on `outfmt`:
        nuc_fa:  FASTA of the query characters (element 2) of each padded
                 alignment, skipping records whose query coordinate
                 (element 3) is -1, with `*` replaced by `N`
        prot_fa: Same sequence, truncated to a multiple of three and
                 translated
        aln_fa:  FASTA with the reference (element 1) and query (element 2)
                 rows of each padded alignment
        amp_gtf: The lines stored in the `padded_gtf` slot
        tsv:     Tab-separated dump of every alignment record
    Any other value of `outfmt` produces no output.

    Args:
        align_json: Passed to `load_slot_json`
        outfile (str): Path to output file. Output goes to stdout when None
        outfmt (str): One of the formats listed above
        refreg: Region passed to `sequtils.region_to_tuple`. Only used for
            `nuc_fa` and `prot_fa`; when None every alignment is written
        debug (bool): Not read by this function

    Returns:
        None
    """
    def _print_fasta(header, nucstr):
        """ Write one FASTA record as nucleotides or translated protein """
        nucstr = nucstr.replace('*', 'N')
        print(header, file=outh)
        if outfmt == 'nuc_fa':
            print(sequtils.wrap(nucstr), file=outh)
        else:
            # Drop the trailing partial codon before translating
            s = Seq(nucstr[:(len(nucstr) // 3) * 3])
            print(sequtils.wrap(str(s.translate())), file=outh)

    outh = sys.stdout if outfile is None else open(outfile, 'w')
    try:
        if outfmt in ('nuc_fa', 'prot_fa'):
            jaln = load_slot_json(align_json, 'padded_alignments')
            if refreg is None:
                for newname, alignment in jaln.items():
                    nucstr = ''.join(t[2] for t in alignment if t[3] != -1)
                    _print_fasta('>%s' % newname, nucstr)
            else:
                # Map reference name -> full alignment name
                refmap = {
                    sequtils.parse_seq_id(k)['ref']: k for k in jaln
                }
                chrom, ref_s, ref_e = sequtils.region_to_tuple(refreg)
                ref_s = ref_s - 1
                alignment = jaln[refmap[chrom]]

                # Get alignment start
                # NOTE: if no record matches, aln_s is left at the last index
                for aln_s in range(len(alignment)):
                    if alignment[aln_s][0] == ref_s:
                        break
                # Skip forward over records that have no query coordinate
                while alignment[aln_s][3] == -1:
                    aln_s += 1

                # Get alignment end
                # NOTE: if no record matches, aln_e is left at index 0
                for aln_e in range(len(alignment) - 1, -1, -1):
                    if alignment[aln_e][0] == ref_e:
                        break
                # Skip backward over records that have no query coordinate
                while alignment[aln_e][3] == -1:
                    aln_e += -1

                # The record at aln_e itself is excluded by this slice
                nucstr = ''.join(
                    t[2] for t in alignment[aln_s:aln_e] if t[3] != -1
                )
                _print_fasta('>%s (%s)' % (refmap[chrom], refreg), nucstr)
        elif outfmt == 'aln_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                aid = sequtils.parse_seq_id(newname)
                rstr = ''.join(t[1] for t in alignment).replace('*', 'N')
                qstr = ''.join(t[2] for t in alignment).replace('*', 'N')
                print('>ref|%s|' % aid['ref'], file=outh)
                print(sequtils.wrap(rstr), file=outh)
                print('>sid|%s|' % aid['sid'], file=outh)
                print(sequtils.wrap(qstr), file=outh)
        elif outfmt == 'amp_gtf':
            jgtf = load_slot_json(align_json, 'padded_gtf')
            print('\n'.join(jgtf), file=outh)
        elif outfmt == 'tsv':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                print('# %s' % newname, file=outh)
                for rec in alignment:
                    print('\t'.join(str(_) for _ in rec), file=outh)
    finally:
        # Close only the handle opened here; stdout must stay open
        if outfile is not None:
            outh.close()
def pairwise_align(
        amplicons_fa=None, ref_fa=None, ref_gtf=None, outdir='.',
        keep_tmp=False, quiet=False, logfile=None, debug=False,
    ):
    """ Pipeline step to align amplicons to reference

    Each amplicon is aligned to its reference with `baln.alignAA`, guided by
    the `primary_cds` (and optional `alt_cds`) attributes of the matching
    `amplicon` row in the reference GTF. The per-amplicon alignments are then
    stitched into one padded alignment per (sample, reference) pair, and
    everything is written to `<outdir>/alignments.json`.

    Args:
        amplicons_fa (str): Path to fasta file with amplicon sequences
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Not read by this function

    Returns:
        out_aln (str): Path to alignment in JSON format

    Raises:
        IndexError: If no `amplicon` row in `ref_gtf` has the name of an
            amplicon found in `amplicons_fa`
    """
    def _to_range(text):
        """ Convert a 1-based `start-end` string to a 0-based (start, end) """
        parts = text.split('-')
        return (int(parts[0]) - 1, int(parts[1]))

    # Check dependencies
    sysutils.check_dependency('blastx')

    # Outputs
    out_aln = os.path.join(outdir, 'alignments.json')

    # Temporary directory
    tempdir = sysutils.create_tempdir('pairwise_align', None, quiet, logfile)

    try:
        # Load reference sequence(s)
        refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

        # Load amplicons from GTF file
        amps = [
            gl for gl in gtfparse.gtf_parser(ref_gtf)
            if gl.feature == 'amplicon'
        ]
        ampdict = {(gl.chrom, gl.attrs['name']): gl for gl in amps}

        out_json = {
            'aa_alignments': {},
            'nuc_alignments': {},
            'padded_alignments': {},
            'padded_gtf': [],
        }

        # {(sid, ref): [(reg, list(alignment)), ...], ...}
        all_nuc_aln = defaultdict(list)

        for amprec in SeqIO.parse(amplicons_fa, 'fasta'):
            # Get amplicon reference and region from sequence ID
            aid = sequtils.parse_seq_id(amprec.id)

            # Find the GTF line used to orient this amplicon
            try:
                gl = ampdict[(aid['ref'], aid['reg'])]
            except KeyError:
                # Fall back to the first amplicon with the same name,
                # whatever its reference
                poss_gl = [t for t in ampdict.keys() if t[1] == aid['reg']]
                if not poss_gl:
                    raise IndexError(
                        'No amplicon named "%s" found in %s' %
                        (aid['reg'], ref_gtf)
                    )
                gl = ampdict[poss_gl[0]]

            # Start and stop for primary coding region
            pri_s, pri_e = _to_range(gl.attrs['primary_cds'])

            # Start and stop for additional coding regions
            altcds = []
            if 'alt_cds' in gl.attrs:
                altcds = [_to_range(x) for x in gl.attrs['alt_cds'].split(',')]

            # Align using amino acids
            refseq = matching_refseq(refseqs, aid['ref'])
            alnobj, nuc_aln = baln.alignAA(
                refseq, amprec, (pri_s, pri_e), altcds, tempdir, quiet
            )
            # NOTE(review): an earlier comment described these two values as
            # a BlastxAlignment for the primary cds and a nucleotide
            # alignment over the full amplicon - confirm against
            # baln.alignAA. Only `alnobj.aa_align` and the records of
            # `nuc_aln` are used here.
            all_nuc_aln[(aid['sid'], aid['ref'])].append(
                (aid['reg'], nuc_aln)
            )
            jid = 'sid|%s|ref|%s|reg|%s|' % (
                aid['sid'], aid['ref'], aid['reg']
            )
            out_json['aa_alignments'][jid] = alnobj.aa_align
            out_json['nuc_alignments'][jid] = nuc_aln

        # Full sequence with padding
        for (sid, ref), unsorted_segments in all_nuc_aln.items():
            _refseq = matching_refseq(refseqs, ref)
            # New name and new alignment
            newname = 'sid|%s|ref|%s|' % (sid, _refseq.id)
            tmp = []
            # Sort all segments by the start position
            segments = sorted(unsorted_segments, key=lambda x: x[1][0][0])
            # rpos: next reference position to cover
            # qpos: next coordinate in the padded query sequence
            rpos = qpos = 0
            for sname, seg in segments:
                gr = GTFRow()
                gr.chrom, gr.source, gr.feature = (
                    newname, 'haphpipe', 'amplicon'
                )
                gr.score, gr.strand, gr.frame = ('.', '+', '.')
                gr.attrs['name'] = sname

                # Pad up to first position of segment
                for p in range(rpos, seg[0][0]):
                    tmp.append((p, str(_refseq.seq[p]), '*', qpos))
                    qpos += 1

                gr.start = qpos + 1
                for t in seg:
                    if t[3] == -1:
                        # Records without a query coordinate are copied
                        # unchanged and do not advance qpos
                        tmp.append(t)
                    else:
                        tmp.append((t[0], t[1], t[2], qpos))
                        qpos += 1

                # Add annotation line
                gr.end = qpos
                # Include statistics in attributes
                gr.attrs.update(baln.get_seg_stats(seg))
                # Include called regions
                gr.attrs['call_reg'] = '%d-%d' % (gr.start, gr.end)
                gr.attrs['call_len'] = (gr.end - gr.start + 1)
                # Append to json object
                out_json['padded_gtf'].append(str(gr))
                rpos = seg[-1][0] + 1

            # Add padding for end of sequence
            for p in range(rpos, len(_refseq.seq)):
                tmp.append((p, str(_refseq.seq[p]), '*', qpos))
                qpos += 1

            # Validate the alignment
            vseq = ''.join(t[2] for t in tmp if t[3] != -1)
            if baln.validate_alignment(tmp, _refseq.seq, vseq):
                if not quiet:
                    print(
                        '%s alignment validation passed' % newname,
                        file=sys.stderr
                    )
                # NOTE(review): only alignments that pass validation are
                # stored - confirm this matches the intended behaviour
                out_json['padded_alignments'][newname] = tmp

        if not quiet:
            for s in out_json['padded_gtf']:
                print(s, file=sys.stdout)

        with open(out_aln, 'w') as outh:
            print(json.dumps(out_json), file=outh)
    finally:
        # Clean up even when a step above fails, so that a failed run does
        # not leave its temporary directory behind
        if not keep_tmp:
            sysutils.remove_tempdir(tempdir, 'pairwise_align', quiet, logfile)

    return out_aln