Ejemplo n.º 1
0
def annotate_from_ref(
    align_json=None,
    ref_gtf=None,
    outfile=None,
    outfmt=None,
    debug=False,
):
    """Write 'gene' annotations from a reference GTF in consensus coordinates.

    Loads the 'padded_alignments' slot of ``align_json`` and, for every row of
    ``ref_gtf`` whose feature is 'gene', locates the alignment columns whose
    first field equals the gene's start/end, reads the fourth field of those
    columns as the new coordinates, and prints one GTF row per gene.

    Each alignment column is indexed as ``col[0]`` (compared against reference
    coordinates) and ``col[3]`` (used as the output coordinate; ``-1`` columns
    are skipped when picking the boundaries).
    NOTE(review): presumably columns are
    (ref_pos, ref_base, query_base, query_pos) -- confirm against the producer.

    Args:
        align_json (str): Alignment JSON passed to ``load_slot_json``.
        ref_gtf (str): Reference GTF passed to ``gtf_parser``.
        outfile (str): Output path; rows go to stdout when None.
        outfmt: Currently unused by this function.
        debug (bool): Currently unused by this function.

    Raises:
        KeyError: If a gene's chrom has no matching alignment, or the gene row
            has no 'name' attribute.
    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')
    try:
        jaln = load_slot_json(align_json, 'padded_alignments')

        # Map the reference id embedded in each alignment name to that name
        refmap = {parse_seq_id(k)['ref']: k for k in jaln}
        for gr in gtf_parser(ref_gtf):
            if gr.feature != 'gene':
                continue
            alignment = jaln[refmap[gr.chrom]]
            # presumably converts the 1-based inclusive GTF interval to a
            # 0-based half-open one -- TODO confirm
            ref_s = gr.start - 1
            ref_e = gr.end

            # Get alignment start.
            # NOTE(review): if no column matches ref_s the loop falls through
            # and aln_s is left at the last column index.
            for aln_s in range(len(alignment)):
                if alignment[aln_s][0] == ref_s:
                    break
            # Move forward past columns flagged with -1
            while alignment[aln_s][3] == -1:
                aln_s += 1

            # Get alignment end (searched from the back).
            # NOTE(review): if no column matches ref_e (e.g. a gene ending on
            # the last reference base) aln_e is left at 0 -- verify intent.
            for aln_e in range(len(alignment) - 1, -1, -1):
                if alignment[aln_e][0] == ref_e:
                    break
            # Move backward past columns flagged with -1
            while alignment[aln_e][3] == -1:
                aln_e += -1

            con_s = alignment[aln_s][3]
            con_e = alignment[aln_e][3]

            new_gr = GTFRow()
            new_gr.chrom, new_gr.source = (refmap[gr.chrom], 'haphpipe')
            new_gr.feature = gr.feature
            new_gr.start, new_gr.end = (con_s + 1, con_e)
            new_gr.score, new_gr.strand, new_gr.frame = (
                '.', gr.strand, gr.frame)
            new_gr.attrs['name'] = gr.attrs['name']

            segment = alignment[aln_s:aln_e + 1]
            # Include statistics in attributes
            new_gr.attrs.update(get_seg_stats(segment))
            # Get the regions that are actually called
            creg = called_regions(segment)
            new_gr.attrs['call_reg'] = ','.join('%d-%d' % t for t in creg)
            new_gr.attrs['call_len'] = sum((t[1] - t[0] + 1) for t in creg)

            print(new_gr, file=outh)
    finally:
        # Close (and flush) the output file; never close the shared stdout
        if outh is not sys.stdout:
            outh.close()
Ejemplo n.º 2
0
def extract_pairwise(
    align_json=None,
    outfile=None,
    outfmt=None,
    refreg=None,
    debug=False,
):
    """Extract sequences or tables from a pairwise alignment JSON file.

    The output depends on ``outfmt``:

    * 'nuc_fa' / 'prot_fa': FASTA of the query characters (third field of
      each column, skipping columns whose fourth field is -1, with '*'
      replaced by 'N'). 'prot_fa' translates the sequence after truncating
      it to a multiple of three. When ``refreg`` is given, only the columns
      between the region's start and end are used.
    * 'aln_fa': for each alignment, the second and third fields of every
      column as two FASTA records ('>ref|...|' and '>sid|...|').
    * 'amp_gtf': the entries of the 'padded_gtf' slot, one per line.
    * 'tsv': each alignment as a '# name' line followed by tab-separated
      columns.

    Any other ``outfmt`` writes nothing.

    Args:
        align_json (str): Alignment JSON passed to ``load_slot_json``.
        outfile (str): Output path; output goes to stdout when None.
        outfmt (str): One of the formats listed above.
        refreg (str): Region string parsed by ``sequtils.region_to_tuple``;
            only used for 'nuc_fa' / 'prot_fa'.
        debug (bool): Currently unused by this function.
    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')

    def _write_record(header, nucstr):
        # Emit one FASTA record as nucleotides or as translated protein
        print(header, file=outh)
        if outfmt == 'nuc_fa':
            print(sequtils.wrap(nucstr), file=outh)
        else:
            # Drop the trailing partial codon before translating
            s = Seq(nucstr[:(old_div(len(nucstr), 3)) * 3])
            print(sequtils.wrap(str(s.translate())), file=outh)

    try:
        if outfmt == 'nuc_fa' or outfmt == 'prot_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            if refreg is None:
                for newname, alignment in jaln.items():
                    nucstr = ''.join(t[2] for t in alignment if t[3] != -1)
                    nucstr = nucstr.replace('*', 'N')
                    _write_record('>%s' % newname, nucstr)
            else:
                # Map the reference id embedded in each name to that name
                refmap = {
                    sequtils.parse_seq_id(k)['ref']: k
                    for k in jaln
                }
                chrom, ref_s, ref_e = sequtils.region_to_tuple(refreg)
                ref_s = ref_s - 1
                alignment = jaln[refmap[chrom]]

                # Get alignment start
                for aln_s in range(len(alignment)):
                    if alignment[aln_s][0] == ref_s:
                        break
                # Skip -1 columns only once the start column is known. This
                # previously ran inside the search loop, where it could walk
                # off the end of the alignment before the start was found.
                while aln_s < len(alignment) and alignment[aln_s][3] == -1:
                    aln_s += 1

                # Get alignment end (searched from the back)
                for aln_e in range(len(alignment) - 1, -1, -1):
                    if alignment[aln_e][0] == ref_e:
                        break
                while alignment[aln_e][3] == -1:
                    aln_e += -1

                # The end column itself is excluded from the slice
                nucstr = ''.join(t[2] for t in alignment[aln_s:aln_e]
                                 if t[3] != -1)
                nucstr = nucstr.replace('*', 'N')
                _write_record('>%s (%s)' % (refmap[chrom], refreg), nucstr)

        elif outfmt == 'aln_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                aid = sequtils.parse_seq_id(newname)
                rstr = ''.join(t[1] for t in alignment).replace('*', 'N')
                qstr = ''.join(t[2] for t in alignment).replace('*', 'N')
                print('>ref|%s|' % aid['ref'], file=outh)
                print(sequtils.wrap(rstr), file=outh)
                print('>sid|%s|' % aid['sid'], file=outh)
                print(sequtils.wrap(qstr), file=outh)

        elif outfmt == 'amp_gtf':
            jgtf = load_slot_json(align_json, 'padded_gtf')
            print('\n'.join(jgtf), file=outh)

        elif outfmt == 'tsv':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in jaln.items():
                print('# %s' % newname, file=outh)
                for row in alignment:
                    print('\t'.join(str(v) for v in row), file=outh)
    finally:
        # Close (and flush) the output file; never close the shared stdout
        if outh is not sys.stdout:
            outh.close()
Ejemplo n.º 3
0
def pairwise_align(
    amplicons_fa=None,
    ref_fa=None,
    ref_gtf=None,
    outdir='.',
    keep_tmp=False,
    quiet=False,
    logfile=None,
    debug=False,
):
    """ Pipeline step to align amplicons to reference

    Each amplicon in `amplicons_fa` is aligned to its reference with
    `baln.alignAA`, using the coding regions given by the matching 'amplicon'
    row of `ref_gtf`. The per-amplicon alignments for each (sample, reference)
    pair are then stitched into one reference-length "padded" alignment, with
    uncovered reference positions filled with '*', and everything is written
    as JSON to `<outdir>/alignments.json`.

    The JSON object has four slots: 'aa_alignments', 'nuc_alignments',
    'padded_alignments' and 'padded_gtf'.

    Args:
        amplicons_fa (str): Path to fasta file with amplicon sequences
        ref_fa (str): Path to reference fasta file
        ref_gtf (str): Path to reference GTF file with amplicons
        outdir (str): Path to output directory
        keep_tmp (bool): Do not delete temporary directory
        quiet (bool): Do not write output to console
        logfile (file): Append console output to this file
        debug (bool): Not referenced in this function body

    Returns:
        out_aln (str): Path to alignment in JSON format

    Raises:
        IndexError: If an amplicon's region name matches no 'amplicon' row in
            `ref_gtf` (the fallback lookup indexes an empty list).

    """
    # Check dependencies
    sysutils.check_dependency('blastx')

    # Outputs
    out_aln = os.path.join(outdir, 'alignments.json')

    # Temporary directory
    # NOTE(review): it is only removed at the end of a successful run; an
    # exception raised below leaves it behind.
    tempdir = sysutils.create_tempdir('pairwise_align', None, quiet, logfile)

    # Load reference sequence(s)
    refseqs = {s.id: s for s in SeqIO.parse(ref_fa, 'fasta')}

    # Load amplicons from GTF file
    amps = [
        gl for gl in gtfparse.gtf_parser(ref_gtf) if gl.feature == 'amplicon'
    ]
    # Index amplicon rows by (reference name, amplicon name)
    ampdict = {(gl.chrom, gl.attrs['name']): gl for gl in amps}

    out_json = {
        'aa_alignments': {},
        'nuc_alignments': {},
        'padded_alignments': {},
        'padded_gtf': [],
    }
    # {(sid, ref): [(reg, list(alignment)), ...], ...}
    all_nuc_aln = defaultdict(list)

    for amprec in SeqIO.parse(amplicons_fa, 'fasta'):
        # Get amplicon reference and region from sequence ID
        aid = sequtils.parse_seq_id(amprec.id)
        # Find the GTF line used to orient this amplicon
        try:
            gl = ampdict[(aid['ref'], aid['reg'])]
        except KeyError:
            # No row for this exact reference: fall back to the first amplicon
            # row with the same region name on any reference
            poss_gl = [t for t in ampdict.keys() if t[1] == aid['reg']]
            gl = ampdict[poss_gl[0]]

        # Start and stop for primary coding region
        # ('primary_cds' is a "start-end" string; the start is shifted down by
        # one, presumably 1-based inclusive -> 0-based half-open)
        pri_s = int(gl.attrs['primary_cds'].split('-')[0]) - 1
        pri_e = int(gl.attrs['primary_cds'].split('-')[1])
        # Start and stop for additional coding regions
        # ('alt_cds' is a comma-separated list of "start-end" strings)
        altcds = []
        if 'alt_cds' in gl.attrs:
            for x in gl.attrs['alt_cds'].split(','):
                altcds.append(
                    ((int(x.split('-')[0]) - 1), int(x.split('-')[1])))

        # Align using amino acids
        refseq = matching_refseq(refseqs, aid['ref'])
        alnobj, nuc_aln = baln.alignAA(refseq, amprec, (pri_s, pri_e), altcds,
                                       tempdir, quiet)
        # alnobj is the alignment object returned by baln.alignAA; only its
        # aa_align attribute is used here.
        # nuc_aln is the nucleotide alignment for this amplicon: a sequence of
        # columns whose first field is a reference position and whose fourth
        # field is -1 for columns that carry no query position.

        all_nuc_aln[(aid['sid'], aid['ref'])].append((aid['reg'], nuc_aln))
        jid = 'sid|%s|ref|%s|reg|%s|' % (aid['sid'], aid['ref'], aid['reg'])
        out_json['aa_alignments'][jid] = alnobj.aa_align
        out_json['nuc_alignments'][jid] = nuc_aln

    # Full sequence with padding
    for sid, ref in list(all_nuc_aln.keys()):
        _refseq = matching_refseq(refseqs, ref)
        # New name and new alignment
        newname = 'sid|%s|ref|%s|' % (sid, _refseq.id)
        tmp = []
        # Sort all segments by the start position
        # (reference position of each segment's first column)
        segments = sorted(all_nuc_aln[(sid, ref)], key=lambda x: x[1][0][0])
        # rpos: next reference position not yet emitted
        # qpos: next position in the padded query sequence
        rpos = qpos = 0
        for sname, seg in segments:
            gr = GTFRow()
            gr.chrom, gr.source, gr.feature = (newname, 'haphpipe', 'amplicon')
            gr.score, gr.strand, gr.frame = ('.', '+', '.')
            gr.attrs['name'] = sname

            # Pad up to first position of segment
            # (reference base paired with '*' as the query character)
            if rpos < seg[0][0]:
                for p in range(rpos, seg[0][0]):
                    tmp.append((p, str(_refseq.seq[p]), '*', qpos))
                    qpos += 1
            # GTF start is 1-based, so it is the next query position plus one
            gr.start = qpos + 1
            for t in seg:
                if t[3] == -1:
                    # No query position: keep the column and do not advance
                    tmp.append(t)
                else:
                    # Renumber the query position within the padded sequence
                    tmp.append((t[0], t[1], t[2], qpos))
                    qpos += 1
            # Add annotation line
            gr.end = qpos
            # Include statistics in attributes
            gr.attrs.update(baln.get_seg_stats(seg))
            # Include called regions
            gr.attrs['call_reg'] = '%d-%d' % (gr.start, gr.end)
            gr.attrs['call_len'] = (gr.end - gr.start + 1)
            # Append to json object
            # NOTE(review): rows are appended before the alignment is validated
            # below, so they are kept even when validation fails -- confirm.
            out_json['padded_gtf'].append(str(gr))
            # Continue from the reference position after this segment's last
            # column
            rpos = seg[-1][0] + 1

        # Add padding for end of sequence
        if rpos < len(_refseq.seq):
            for p in range(rpos, len(_refseq.seq)):
                tmp.append((p, str(_refseq.seq[p]), '*', qpos))
                qpos += 1

        # Validate the alignment
        # vseq is the padded query sequence: the query character of every
        # column that has a query position
        vseq = ''.join(t[2] for t in tmp if t[3] != -1)
        # Only alignments that pass validation are stored
        if baln.validate_alignment(tmp, _refseq.seq, vseq):
            if not quiet:
                print('%s alignment validation passed' % newname,
                      file=sys.stderr)
            out_json['padded_alignments'][newname] = tmp

    # Echo the padded GTF rows to stdout unless quiet
    for s in out_json['padded_gtf']:
        if not quiet:
            print(s, file=sys.stdout)

    with open(out_aln, 'w') as outh:
        print(json.dumps(out_json), file=outh)

    if not keep_tmp:
        sysutils.remove_tempdir(tempdir, 'pairwise_align', quiet, logfile)

    return out_aln