Example No. 1
# NOTE: this excerpt omits the module-level imports. pysam is required;
# pair_generator and get_number_reads_open come from hivwholeseq.utils.mapping
# (see Example No. 3); get_divided_filename is a hivwholeseq filename helper not shown here.
import pysam
from hivwholeseq.utils.mapping import pair_generator, get_number_reads_open


def split_reads(data_folder,
                adaID,
                fragment,
                chunk_size=10000,
                maxreads=-1,
                VERBOSE=0):
    '''Split reads into chunks for mapping'''

    input_filename = get_divided_filename(data_folder,
                                          adaID,
                                          fragment,
                                          type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        if VERBOSE:
            if maxreads == -1:
                n_reads = get_number_reads_open(bamfile) // 2
            else:
                n_reads = maxreads

            print 'Expected number of chunks:', 1 + (n_reads // chunk_size)

        chunk_number = 0
        chunkfile = None
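        # pair_generator yields (read1, read2) tuples, so irp counts read pairs
        # and each chunk file receives up to chunk_size pairs (2 * chunk_size reads)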
        for irp, read_pair in enumerate(pair_generator(bamfile)):
            if irp == maxreads:
                break

            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    print irp + 1

            if not (irp % chunk_size):
                if chunkfile is not None:
                    chunkfile.close()
                chunk_number += 1
                chunk_filename = get_divided_filename(data_folder,
                                                      adaID,
                                                      fragment,
                                                      type='bam',
                                                      chunk=chunk_number)
                chunkfile = pysam.Samfile(chunk_filename,
                                          'wb',
                                          template=bamfile)
                if VERBOSE >= 2:
                    print 'Chunk n', chunk_number, 'started'

            chunkfile.write(read_pair[0])
            chunkfile.write(read_pair[1])

        if chunkfile is not None:
            chunkfile.close()

    if VERBOSE:
        print 'Chunking finished'
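
As a rough illustration, the function could be called as below once pysam and the hivwholeseq helpers are importable. This is a minimal sketch: the data folder, adapter ID and fragment label are placeholders, not values taken from the original project.

# Hypothetical invocation of split_reads; all argument values are placeholders.
data_folder = '/path/to/sequencing_run/'   # assumed run folder layout
adaID = 'TS2'                              # hypothetical adapter ID
fragment = 'F1'                            # hypothetical fragment label

# Split the divided BAM for this adapter/fragment into chunks of 10000 read
# pairs each, reporting progress at verbosity level 1.
split_reads(data_folder, adaID, fragment, chunk_size=10000, VERBOSE=1)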
Example No. 3
# NOTE: pileup_trim_reads_coverfull, pileup_trim_reads_coverstart and
# join_block_to_consensus are used below but never imported; they are helper
# functions from the hivwholeseq pipeline that this excerpt does not show.
def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len=100,
                    reads_per_alignment=31,
                    deltamax=60):
    '''Build a consensus from mapped filtered reads'''
    if VERBOSE:
        print 'Build consensus'
    
    from operator import itemgetter
    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.miseq import alpha
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.sequence import build_local_consensus
    
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if VERBOSE >= 3:
            from hivwholeseq.utils.mapping import get_number_reads_open
            print 'The bamfile has', get_number_reads_open(bamfile), 'reads.'

        # Get first block covered, even if partially, and record where each read started
        if VERBOSE >= 2:
            print 'First block'

        seqs = []
        n_block = 0
        while not seqs:
            start_block = n_block * (block_len // 2)
            for read in bamfile:
                if read.pos <= start_block:
                    seqs.append((read.pos, ('N' * read.pos) + read.seq[:block_len - read.pos]))
            bamfile.reset()
            n_block += 1
        
        # If there are too many reads, take the reads that start earliest
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=itemgetter(0))
            seqs = seqs[:reads_per_alignment]

        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, (pos, s) in enumerate(seqs)]
        consensus = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)

        # Block by block, make a local alignment and join it to the previous consensus
        # There are two ways of finishing the loop:
        # 1. if we cover all the way to the end of the reference, good
        # 2. if we find no reads fully covering a block BEFORE that, add a final block
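        # Note: each block spans block_len bases but start_block advances by only
        # 2 * block_len // 3, so consecutive blocks overlap by roughly a third of
        # their length; that overlap is used when joining each new block onto the
        # growing consensus (see the diagrams below).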
        while start_block < len_reference:
            edges = (start_block, min(len_reference, start_block + block_len))

            if VERBOSE >= 2:
                print 'block n.', n_block, 'region:', edges

            seqs = pileup_trim_reads_coverfull(bamfile, edges, VERBOSE=VERBOSE)

            # If no reads fully cover this block, treat it as the end of the
            # consensus; only the final block is still missing
            if not seqs:
                break
            elif len(seqs) > reads_per_alignment:
                np.random.shuffle(seqs)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment
            # --------------
            # -----   ------
            # --------   ---
            #---------------
            seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                       for i, s in enumerate(seqs)]
            cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=True)

            # Join to the rest of the consensus, like this:
            # ---------------------------
            #                        --------------------
            consensus = join_block_to_consensus(consensus, cons_block,
                                                VERBOSE=VERBOSE, deltamax=deltamax)

            start_block += 2 * block_len // 3
            n_block += 1

        # If we cover the whole reference, good
        else:
            return consensus

        if VERBOSE >= 2:
            print 'final block'

        # If we broke out of the while, a final block is needed
        seqs = pileup_trim_reads_coverstart(bamfile, start_block, VERBOSE=VERBOSE)

        # If there are too many reads, keep the longest ones and pad them with N to equal length
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=len, reverse=True)
            seqs = seqs[:reads_per_alignment]

            # Complete with N, approximately
            sl = len(seqs[0])
            seqs = [s+('N' * (sl - len(s))) for s in seqs]


        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, s in enumerate(seqs)]
        cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)
        consensus = join_block_to_consensus(consensus, cons_block, VERBOSE=VERBOSE,
                                            deltamax=deltamax)

    return consensus
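
As a rough end-to-end sketch, assuming a BAM file of mapped, filtered reads and a known reference length (both placeholders below), the consensus could be built and written to FASTA with Biopython; the snippet also assumes the function returns the consensus as a plain sequence string.

# Hypothetical usage sketch; file names and the reference length are placeholders.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

bamfilename = 'sample_F1_mapped_filtered.bam'   # assumed input BAM of filtered reads
len_reference = 9000                            # assumed reference length in bases

consensus = build_consensus(bamfilename, len_reference, VERBOSE=1,
                            block_len=100, reads_per_alignment=31, deltamax=60)

# Write the consensus out as FASTA (assuming a plain string was returned).
SeqIO.write(SeqRecord(Seq(consensus), id='consensus', description=''),
            'consensus_F1.fasta', 'fasta')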