Ejemplo n.º 1
0
def align_consensus(cons_file_1, cons_file_2):
    """Align two consensus sequences to each other and map their gaps.

    Calls ``EmbossStandalone.needle`` on the two input files, writing the
    alignment to ``map_cons.needle`` in the current working directory, then
    reads the first alignment back from that file.

    Args:
        cons_file_1: path of the first consensus sequence file.
        cons_file_2: path of the second consensus sequence file.

    Returns:
        A list with one integer per alignment column: 1 when the first
        sequence has a gap in that column, 2 when the second sequence has
        one, 0 when neither has.

    Raises:
        AssertionError: if a column holds a gap in both sequences.
    """
    from pythonlib import EmbossStandalone
    from pythonlib.MarkxIO import Markx10Iterator

    needle_exe = 'needle'
    out_file = 'map_cons.needle'

    EmbossStandalone.needle(needle_exe,
                            cons_file_1,
                            cons_file_2,
                            out=out_file,
                            aglobal3='False')

    # Read both sequences while the file is still open, then let the
    # context manager close the handle (it used to be leaked).
    with open(out_file) as handle:
        alignment = next(Markx10Iterator(handle))
        cons_1 = alignment.get_seq_by_num(0).tostring().upper()
        cons_2 = alignment.get_seq_by_num(1).tostring().upper()

    # Named gap_map rather than map so the builtin is not shadowed.
    gap_map = []
    for base_1, base_2 in zip(cons_1, cons_2):
        assert not (base_1 == '-' and base_2 == '-'), 'not two gaps'
        if base_1 == '-':
            gap_map.append(1)
        elif base_2 == '-':
            gap_map.append(2)
        else:
            gap_map.append(0)

    return gap_map
Ejemplo n.º 2
0
        length += float(len_seq)
        length2 += float(len_seq * len_seq)
        #        readdict[read.] = [seq,len_seq]
        n += 1.

# Mean read length, from the running sum `length` and the read count `n`.
meanlr = length / n
# Sample standard deviation of the read length, computed from the running
# sums of the lengths (`length`) and of their squares (`length2`).
# NOTE(review): n * n - n is zero for n == 1, and n == 0 already fails on the
# line above, so fewer than two reads raises ZeroDivisionError -- confirm
# the input always holds at least two reads.
stdlr = math.sqrt((n * length2 - length * length) / (n * n - n))
# Accepted read-length interval [lower, upper] around the mean.
# NOTE(review): the interval is asymmetric -- the upper bound uses
# (1 + acclength) standard deviations, the lower one only acclength --
# confirm this is intended.
allowed_length = [meanlr - acclength * stdlr, meanlr + (1 + acclength) * stdlr]
print >> sys.stderr, 'Allowed interval for length is', allowed_length

# Align only when no alignment file is present; an existing
# 'tmp_align_f.needle' is left in place and no alignment is run.
if not os.path.isfile('tmp_align_f.needle'):
    print >> sys.stderr, 'needle forward'
    EmbossStandalone.needle(needle_exe,
                            options.ref,
                            f_fasta_forward_filename,
                            out='tmp_align_f.needle',
                            gapopen=6.0,
                            gapext=3.0,
                            aglobal3='False')
# The triple-quoted string that follows is a no-op expression statement
# holding a disabled else-branch; it is not executed.
"""
else:
    print >>sys.stderr, 'The alignment file tmp_align_f.needle is already present'
    statinfo = os.stat('tmp_align_f.needle')
    age_sec = time.time() - statinfo.st_mtime
    if age_sec > 3600:
        print >>sys.stderr, 'Warning: it was modified more than an hour ago'
    age = time.gmtime(age_sec)
    
    print >>sys.stderr, 'If you want to run the alignment again, remove it'
    print >>sys.stderr, "using existing 'tmp_align_f.needle'..."
"""
Ejemplo n.º 3
0
def _merge_with_reference(refseq, readseq):
    """Collapse one pairwise alignment into a read without in-dels.

    Only the columns where both rows have left their flanking gaps are
    used. Inside that window a gap in the read is filled with the reference
    base (deletion), and a read base facing a reference gap is dropped
    (insertion).

    Args:
        refseq: aligned reference row, gaps written as '-'.
        readseq: aligned read row, same length as refseq.

    Returns:
        A two-item list: the collapsed read as a string and the index of
        the first alignment column that was used.
    """
    read_start = len(readseq) - len(readseq.lstrip('-'))
    read_end = len(readseq.rstrip('-'))
    ref_start = len(refseq) - len(refseq.lstrip('-'))
    ref_end = len(refseq.rstrip('-'))

    align_start = max(ref_start, read_start)
    # The rstrip-based lengths are already exclusive end indices. Slicing up
    # to align_end + 1 took one column too many and appended a spurious
    # reference base to every read ending before the reference.
    align_end = min(ref_end, read_end)

    bases = []
    for ref_base, read_base in zip(refseq[align_start:align_end],
                                   readseq[align_start:align_end]):
        if read_base == '-':
            # deletion in the read: take the reference base
            bases.append(ref_base)
        elif ref_base != '-':
            bases.append(read_base)
        # otherwise it is an insertion in the read, which is discarded
    return [''.join(bases), align_start]


def align_reads(filename):
    """Align the reads in a FASTA file to the reference in both strands.

    The reads are written to 'tmp_reads_f.fas' as given and to
    'tmp_reads_r.fas' reverse-complemented. Each file is aligned with
    ``EmbossStandalone.needle`` against the module-level ``ref_genome``,
    producing 'tmp_align_f.needle' and 'tmp_align_r.needle'. An alignment
    file that already exists is reused as it is, so stale files must be
    removed by hand. For each read the strand with the strictly higher
    'sw_score' is kept; on a tie the reverse alignment is used.

    Relies on ``SeqIO``, ``os``, ``sys`` and ``ref_genome`` being available
    at module level.

    Args:
        filename: path of the FASTA file holding the reads.

    Returns:
        A dictionary mapping each read id to a two-item list: the read with
        in-dels removed (see _merge_with_reference) and its starting column
        in the alignment with the reference.

    Raises:
        AssertionError: if the forward and reverse alignment files do not
            list the same read at the same position.
    """
    from pythonlib import EmbossStandalone
    from pythonlib.MarkxIO import Markx10Iterator

    needle_exe = 'needle'
    forward_fasta = 'tmp_reads_f.fas'
    reverse_fasta = 'tmp_reads_r.fas'
    forward_needle = 'tmp_align_f.needle'
    reverse_needle = 'tmp_align_r.needle'

    with open(filename) as f_fasta:
        seqlist = list(SeqIO.parse(f_fasta, 'fasta'))
    countreads = len(seqlist)

    # forward...
    with open(forward_fasta, 'w') as f_out:
        SeqIO.write(seqlist, f_out, 'fasta')

    # ...and reverse; the records are modified in place, after the forward
    # file has been written
    for seq in seqlist:
        seq.seq = seq.seq.reverse_complement()
    with open(reverse_fasta, 'w') as f_out:
        SeqIO.write(seqlist, f_out, 'fasta')

    print >> sys.stderr, 'Found', countreads, 'reads'

    runs = (('forward', forward_fasta, forward_needle),
            ('backward', reverse_fasta, reverse_needle))
    for label, fasta_name, needle_name in runs:
        if not os.path.isfile(needle_name):
            print >> sys.stderr, 'needle %s' % label
            EmbossStandalone.needle(needle_exe,
                                    ref_genome,
                                    fasta_name,
                                    out=needle_name,
                                    gapopen=6.0,
                                    gapext=3.0,
                                    aglobal3='False',
                                    adesshow3='True')

    aligned_reads = {}
    with open(forward_needle) as f_forward:
        with open(reverse_needle) as f_reverse:
            forwardaligniter = Markx10Iterator(f_forward)
            reversealigniter = Markx10Iterator(f_reverse)

            while True:
                # Stop only on exhaustion. A bare except used to hide parse
                # errors and KeyboardInterrupt as if the file had ended.
                try:
                    f_align = next(forwardaligniter)
                    r_align = next(reversealigniter)
                except StopIteration:
                    break

                # the iterators may also signal the end by returning None
                if f_align is None or r_align is None:
                    break

                this_id = f_align.get_all_seqs()[1].id
                assert this_id == r_align.get_all_seqs()[1].id, \
                    'same seq back and forward'

                forward_score = float(f_align._annotations['sw_score'])
                reverse_score = float(r_align._annotations['sw_score'])
                best = f_align if forward_score > reverse_score else r_align

                readseq = best.get_seq_by_num(1).tostring().upper()
                refseq = best.get_seq_by_num(0).tostring().upper()
                aligned_reads[this_id] = _merge_with_reference(refseq,
                                                               readseq)

    return aligned_reads
Ejemplo n.º 4
0
def align_to_ref(al_exe, ref_file, reads_file, gen_length):
    """
    Calls water standalone program to align reads to reference genome
    """
    from pythonlib import EmbossStandalone
    import MyAlignIO
    import time

    max_read_length = 300
    format = 'markx10'
    align_file = '%s.needle' % reads_file.rstrip('.fas')
    out_reads = {}
    cov_prof = [0] * (2 * gen_length + max_read_length)

    if not os.path.isfile(align_file):
        print 'Aligning reads via Needleman-Wunsch algorithm'
        EmbossStandalone.needle(al_exe,
                                ref_file,
                                reads_file,
                                out=align_file,
                                gapopen=go_default,
                                gapext=ge_default,
                                aglobal3='False')
    else:
        print 'The alignment file', align_file, 'is already present'
        statinfo = os.stat(align_file)
        age_sec = time.time() - statinfo.st_mtime
        if age_sec > 3600:
            print 'Warning: it was modified more than an hour ago'
        age = time.gmtime(age_sec)

        print 'If you want to run the alignment again, remove it'

    assert os.path.isfile(align_file), 'File %s not found' % align_file
    handle = open(align_file, 'rU')
    print 'Parsing alignment output'

    for alin in MyAlignIO.parse(handle, format):
        assert len(alin.get_all_seqs()) == 2, "Should be pairwise!"
        alength = int(alin.get_alignment_length())
        #        print 'Alignment is', alength, 'bases long'

        record = iter(alin)

        # These are the information of the query sequence, i.e. the reference
        query_rec = record.next()
        assert query_rec.name == 'query', 'This should be the query'
        qstart = int(query_rec.annotations['al_start'])
        qstop = int(query_rec.annotations['al_stop'])

        gaps_query = 0
        qst = query_rec.seq.tostring()

        qls = list(qst)
        for c in qst.strip('-'):
            if c == '-':
                gaps_query = gaps_query + 1

        # These are for the matching sequences, i.e. the reads
        match_rec = record.next()
        assert match_rec.name == 'match', 'This should be the match'

        mst = match_rec.seq.tostring()

        mls = list(mst)

        for c in mls:
            if c != '-':
                mstart = mls.index(c) + 1
                break
        mstop = len(mst.rstrip('-'))

        # counts the gaps in the read (no flanking gaps)
        gaps_match = 0
        for c in mst.strip('-'):
            if c == '-':
                gaps_match = gaps_match + 1
        match_length = len(mst.strip('-'))
        if gaps_query + gaps_match > round(tolerance * match_length):
            # print 'too many indels,', (gaps_query + gaps_match)
            continue

        out_reads[match_rec.id] = [None, None, None, None, []]
        out_reads[
            match_rec.id][0] = qstart  # is this really useful at this time?
        out_reads[
            match_rec.id][1] = qstop  # is this really useful at this time?
        out_reads[match_rec.id][2] = mstart  # this is
        out_reads[match_rec.id][3] = mstop  # this too

        for i in range(mstart, mstop + 1):
            try:
                cov_prof[i] = cov_prof[i] + 1
            except IndexError:
                print 'out of coverage', i
        this_q = qls[mstart - 1:mstop]
        this_m = list(mst.strip('-'))

        assert len(this_q) == len(this_m), 'Length must be the same %d %d' % (
            len(this_q), len(this_m))

        amb_calls = 0

        # There are three possibilities: insertions, deletions, no in-dels
        for i in range(len(this_m)):

            if this_m[i] == '-' and this_q[i] != '-':
                out_reads[match_rec.id][4].append('-')

            if this_m[i] != '-' and this_q[i] == '-':
                pass

            if this_m[i] != '-' and this_q[i] != '-':
                out_reads[match_rec.id][4].append(this_m[i])

            # This should never happen
            if this_m[i] == '-' and this_q[i] == '-':
                print 'Should this happen?'
                sys.exit()

            if this_m[i] == 'N':
                amb_calls = amb_calls + 1
                if verbose:
                    print >> sys.stderr, 'Found an N in', match_rec.id

        if amb_calls > amb_thresh:
            if verbose:
                print 'Read', match_rec.id, 'has too many Ns'
            del out_reads[match_rec.id]
    cp = open('./%s.covprof' % reads_file.rstrip('_reads.fas'), 'w')
    for i in range(1, gen_length):
        cp.write('%i\t%i\n' % (i, cov_prof[i]))
    cp.close()

    return out_reads