def annotate_from_ref(
        align_json=None,
        ref_gtf=None,
        outfile=None,
        outfmt=None,
        debug=False,
):
    """ Transfer 'gene' annotations from a reference GTF onto padded alignments

    For each 'gene' record in the reference GTF, the padded alignment for the
    record's chromosome is looked up, the reference interval is located in
    the alignment, and a new GTF row is printed whose coordinates are taken
    from column 3 of the alignment rows.

    Args:
        align_json (str): Passed to `load_slot_json` to load the
            'padded_alignments' slot
        ref_gtf (str): Passed to `gtf_parser`; source of the 'gene' records
        outfile (str): Path to output file; sys.stdout is used when None
        outfmt: Not used by this function
        debug (bool): Not used by this function

    Returns:
        None
    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')
    try:
        jaln = load_slot_json(align_json, 'padded_alignments')
        # Map the reference name parsed from each alignment ID to the full ID
        refmap = {parse_seq_id(k)['ref']: k for k in jaln}

        for gr in gtf_parser(ref_gtf):
            if gr.feature != 'gene':
                continue

            alignment = jaln[refmap[gr.chrom]]
            ref_s = gr.start - 1
            ref_e = gr.end

            # Get alignment start: first row whose column 0 equals ref_s,
            # then move forward past rows whose column 3 is -1
            for aln_s in range(len(alignment)):
                if alignment[aln_s][0] == ref_s:
                    break
            while alignment[aln_s][3] == -1:
                aln_s += 1

            # Get alignment end: last row whose column 0 equals ref_e,
            # then move backward past rows whose column 3 is -1
            for aln_e in range(len(alignment) - 1, -1, -1):
                if alignment[aln_e][0] == ref_e:
                    break
            while alignment[aln_e][3] == -1:
                aln_e += -1

            con_s = alignment[aln_s][3]
            con_e = alignment[aln_e][3]

            new_gr = GTFRow()
            new_gr.chrom, new_gr.source = (refmap[gr.chrom], 'haphpipe')
            new_gr.feature = gr.feature
            new_gr.start, new_gr.end = (con_s + 1, con_e)
            new_gr.score, new_gr.strand, new_gr.frame = (
                '.', gr.strand, gr.frame
            )
            new_gr.attrs['name'] = gr.attrs['name']

            # Include statistics in attributes
            new_gr.attrs.update(get_seg_stats(alignment[aln_s:aln_e + 1]))

            # Get the regions that are actually called
            creg = called_regions(alignment[aln_s:aln_e + 1])
            new_gr.attrs['call_reg'] = ','.join('%d-%d' % t for t in creg)
            new_gr.attrs['call_len'] = sum((t[1] - t[0] + 1) for t in creg)
            print(new_gr, file=outh)
    finally:
        # Only close a handle opened here; never close sys.stdout
        if outfile is not None:
            outh.close()
def assemble_amplicons(contigs_fa=None, ref_fa=None, ref_gtf=None, outdir='.',
                       sample_id='sampleXX', padding=50, min_contig_len=200,
                       keep_tmp=False, quiet=False, logfile=None,
                       debug=False):
    """ Pipeline step to assemble contigs using reference and amplicon regions

    Args:
        contigs_fa (str): Path to fasta file with assembled contigs
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        sample_id (str): Name to append to scaffold sequence
        padding (int): Bases to include outside reference annotation
        min_contig_len (int): Minimum contig length for tiling path
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_assembly (str): Path to assembled amplicons (FASTA)
        out_summary (str): Path to assembly summary
        out_padded (str): Path to padded output file

    """
    # Check dependencies
    sysutils.check_dependency('nucmer')
    sysutils.check_dependency('delta-filter')
    sysutils.check_dependency('show-tiling')

    # Outputs
    out_assembly = os.path.join(outdir, 'amplicon_assembly.fna')
    out_summary = os.path.join(outdir, 'amplicon_summary.txt')
    out_padded = os.path.join(outdir, 'amplicon_padded.out')
    # The padded file is opened in append mode below, so start from scratch
    if os.path.exists(out_padded):
        os.unlink(out_padded)

    # Temporary directory
    tempdir = sysutils.create_tempdir(
        'assemble_amplicons', None, quiet, logfile
    )

    # Create fasta file with sequence IDs only (remove description)
    tmp_contigs_fa = sequtils.clean_seqnames_file(contigs_fa, tempdir)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # For each amplicon, extract the sequence from the reference and scaffold
    # using nucmer
    amplicon_alignments = []
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]
    for gl in amps:
        msg = 'Amplicon ref|%s|reg|%s\n' % (gl.chrom, gl.attrs['name'])
        sysutils.log_message(msg, quiet, logfile)

        # Extract reference amplicon (clamped to the reference boundaries)
        amp_s = max(0, (gl.start - 1) - padding)
        amp_e = min(len(refseqs[gl.chrom]), gl.end + padding)
        ampseq = refseqs[gl.chrom].seq[amp_s:amp_e]
        amplicon_fa = os.path.join(tempdir, 'subject.fa')
        with open(amplicon_fa, 'w') as outh:
            print('>ref|%s|reg|%s' % (gl.chrom, gl.attrs['name']), file=outh)
            print(sequtils.wrap(str(ampseq)), file=outh)

        # Align with nucmer
        fil, til = alignutils.align_nucmer(
            tmp_contigs_fa, amplicon_fa, tempdir,
            min_contig_len=min_contig_len,
            quiet=quiet, logfile=logfile, debug=debug
        )

        # Skip everything else if debugging
        if debug:
            continue

        # Parse tiling and show alignments.
        # Plain text mode is used: the 'U' flag was removed in Python 3.11
        # and universal newlines are already the default for text files.
        with open(til, 'r') as tilh:
            trows = [alignutils.TilingRow(l) for l in tilh]

        if not trows:
            amplicon_alignments.append((gl.chrom, gl.attrs['name'], None))
        else:
            # Initialize alignment
            amp_seq = SeqIO.read(amplicon_fa, 'fasta')
            combined = alignutils.EmptyReferenceAlignment(
                str(amp_seq.seq).lower()
            )
            for tr in trows:
                out = alignutils.show_aligns(tr.ref, tr.qry, fil)
                for nucaln in alignutils.parse_show_aligns(out):
                    combined = combined.merge_alignments(nucaln)
                # Record the cumulative alignment after each tiling row
                with open(out_padded, 'a') as outh:
                    print(
                        '%s\n%s\n%s' % (tr, combined.raln(), combined.qaln()),
                        file=outh
                    )
            amplicon_alignments.append(
                (gl.chrom, gl.attrs['name'], combined)
            )

        # Cleanup
        for f in [fil, til, amplicon_fa]:
            if os.path.isfile(f):
                os.unlink(f)

    # Write to output files
    with open(out_assembly, 'w') as outseq, open(out_summary, 'w') as outsum:
        for ref_id, reg, combined in amplicon_alignments:
            amp_id = sequtils.make_seq_id(sid=sample_id, ref=ref_id, reg=reg)
            if combined is None:
                msg1 = '%s\tFAIL\t%d' % (amp_id, 0)
                msg2 = u'%s\tFAIL\t%d\t%s\n' % (amp_id, 0, u"👎🏼")
                if logfile is not None:
                    print(u'%s\tFAIL\t%d\t%s' % (amp_id, 0, u"👎🏼"),
                          file=logfile)
            else:
                # Only the scaffold sequence is used; the two other returned
                # values are ignored
                scaf, _, _ = combined.scaffold2()
                msg1 = '%s\tPASS\t%d' % (amp_id, len(scaf))
                msg2 = u'%s\tPASS\t%d\t%s\n' % (amp_id, len(scaf), u"👍🏼")
                print('>%s' % (amp_id), file=outseq)
                print('%s' % sequtils.wrap(scaf), file=outseq)

            print(msg1, file=outsum)
            sysutils.log_message(msg2, quiet, logfile)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'assemble_amplicons', quiet, logfile)

    return out_assembly, out_summary, out_padded
def pairwise_align(
        amplicons_fa=None,
        ref_fa=None,
        ref_gtf=None,
        outdir='.',
        keep_tmp=False,
        quiet=False,
        logfile=None,
        debug=False,
):
    """ Pipeline step to align amplicons to reference

    Each amplicon is aligned with `baln.alignAA` using the coding regions
    listed in its GTF record. The per-amplicon nucleotide alignments are then
    joined, per (sample, reference) pair, into one padded alignment spanning
    the whole reference sequence. Everything is written as a single JSON
    document.

    Args:
        amplicons_fa (str): Path to fasta file with amplicon sequences
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Print commands but do not run

    Returns:
        out_aln (str): Path to alignment in JSON format

    """
    # Check dependencies
    sysutils.check_dependency('blastx')

    # Outputs
    out_aln = os.path.join(outdir, 'alignments.json')

    # Temporary directory
    tempdir = sysutils.create_tempdir('pairwise_align', None, quiet, logfile)

    # Reference sequences keyed by record ID
    refseqs = {rec.id: rec for rec in SeqIO.parse(ref_fa, 'fasta')}

    # Amplicon GTF records keyed by (chromosome, amplicon name)
    ampdict = {
        (row.chrom, row.attrs['name']): row
        for row in gtfparse.gtf_parser(ref_gtf)
        if row.feature == 'amplicon'
    }

    out_json = {
        'aa_alignments': {},
        'nuc_alignments': {},
        'padded_alignments': {},
        'padded_gtf': [],
    }

    # {(sid, ref): [(reg, list(alignment)), ...], ...}
    all_nuc_aln = defaultdict(list)

    for amprec in SeqIO.parse(amplicons_fa, 'fasta'):
        # Sample, reference and region are encoded in the sequence ID
        aid = sequtils.parse_seq_id(amprec.id)

        # Find the GTF record for this amplicon; when the exact
        # (ref, reg) pair is missing, use the first record with that region
        try:
            gl = ampdict[(aid['ref'], aid['reg'])]
        except KeyError:
            same_region = [key for key in ampdict if key[1] == aid['reg']]
            gl = ampdict[same_region[0]]

        # Primary coding region: 1-based "start-end" to 0-based half-open
        pri_fields = gl.attrs['primary_cds'].split('-')
        pri_s = int(pri_fields[0]) - 1
        pri_e = int(pri_fields[1])

        # Additional coding regions, same conversion
        altcds = []
        if 'alt_cds' in gl.attrs:
            for interval in gl.attrs['alt_cds'].split(','):
                alt_fields = interval.split('-')
                altcds.append((int(alt_fields[0]) - 1, int(alt_fields[1])))

        # Align using amino acids
        refseq = matching_refseq(refseqs, aid['ref'])
        alnobj, nuc_aln = baln.alignAA(
            refseq, amprec, (pri_s, pri_e), altcds, tempdir, quiet
        )

        all_nuc_aln[(aid['sid'], aid['ref'])].append((aid['reg'], nuc_aln))
        jid = 'sid|%s|ref|%s|reg|%s|' % (aid['sid'], aid['ref'], aid['reg'])
        out_json['aa_alignments'][jid] = alnobj.aa_align
        out_json['nuc_alignments'][jid] = nuc_aln

    # Full sequence with padding
    for sid, ref in list(all_nuc_aln.keys()):
        _refseq = matching_refseq(refseqs, ref)

        # New name and new alignment
        newname = 'sid|%s|ref|%s|' % (sid, _refseq.id)
        padded = []

        # Order segments by the reference position of their first row
        segments = sorted(
            all_nuc_aln[(sid, ref)], key=lambda item: item[1][0][0]
        )

        rpos = 0
        qpos = 0
        for sname, seg in segments:
            # Fill the gap before this segment with '*' placeholder rows
            gap = range(rpos, seg[0][0])
            padded.extend(
                (p, str(_refseq.seq[p]), '*', qpos + offset)
                for offset, p in enumerate(gap)
            )
            qpos += len(gap)

            gr = GTFRow()
            gr.chrom, gr.source, gr.feature = (newname, 'haphpipe', 'amplicon')
            gr.score, gr.strand, gr.frame = ('.', '+', '.')
            gr.attrs['name'] = sname
            gr.start = qpos + 1

            # Renumber column 3 of every row that has a position;
            # rows marked -1 are carried over unchanged
            for row in seg:
                if row[3] == -1:
                    padded.append(row)
                    continue
                padded.append((row[0], row[1], row[2], qpos))
                qpos += 1

            # Add annotation line
            gr.end = qpos
            # Include statistics in attributes
            gr.attrs.update(baln.get_seg_stats(seg))
            # Include called regions
            gr.attrs['call_reg'] = '%d-%d' % (gr.start, gr.end)
            gr.attrs['call_len'] = (gr.end - gr.start + 1)
            # Append to json object
            out_json['padded_gtf'].append(str(gr))

            rpos = seg[-1][0] + 1

        # Add padding for end of sequence
        tail = range(rpos, len(_refseq.seq))
        padded.extend(
            (p, str(_refseq.seq[p]), '*', qpos + offset)
            for offset, p in enumerate(tail)
        )
        qpos += len(tail)

        # Validate the alignment; only validated alignments are stored
        vseq = ''.join(row[2] for row in padded if row[3] != -1)
        if baln.validate_alignment(padded, _refseq.seq, vseq):
            if not quiet:
                print('%s alignment validation passed' % newname,
                      file=sys.stderr)
            out_json['padded_alignments'][newname] = padded

    if not quiet:
        for gtf_line in out_json['padded_gtf']:
            print(gtf_line, file=sys.stdout)

    with open(out_aln, 'w') as outh:
        print(json.dumps(out_json), file=outh)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'pairwise_align', quiet, logfile)

    return out_aln