# Example #1
# 0
def build_consensus(bamfilename,
                    len_reference,
                    VERBOSE=0,
                    block_len_initial=100,
                    reads_per_alignment=31,
                    accept_holes=False,
                    store_allele_counts=False):
    '''Build a consensus from premapped and divided reads'''
    if VERBOSE:
        print 'Build consensus'

    import numpy as np
    import pysam
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.mapping import align_muscle
    # Three steps:
    # 1. collect reads uniformly across the fragment
    # 2. make local consensi
    # 3. join into fragmentwide consensus
    consensus = None
    consensi_local = []
    if store_allele_counts:
        allcounts_local = []
    pos_ref = 0
    block_len = block_len_initial
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Initial block
        if VERBOSE >= 2:
            print 'Block n', len(consensi_local) + 1,
        for pos_first_block in xrange(len_reference):
            bamfile.reset()

            # The first block has to make a consensus for the FIRST base, this needs
            # at least ONE read starting exactly at the first position. Otherwise,
            # the same is repeated for position 2, and so on.
            reads = [
                read for read in bamfile
                if (read.is_proper_pair) and (read.pos == pos_first_block)
            ]
            if not len(reads):
                continue

            np.random.shuffle(reads)
            reads = reads[:n_reads_per_ali]
            seqs = [
                SeqRecord(Seq(read.seq[:block_len], ambiguous_dna),
                          id=read.qname) for read in reads
            ]
            cons_local = build_local_consensus(
                seqs, VERBOSE=VERBOSE, store_allele_counts=store_allele_counts)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)
            pos_ref += (block_len_initial // 2) * (1 + pos_first_block //
                                                   (block_len_initial // 2))
            if VERBOSE >= 2:
                print 'pos', pos_first_block, 'to', pos_first_block + block_len, 'block len', block_len
            break

        # Start consensus
        if len(consensi_local) == 1:
            consensus = [consensi_local[0]]
            if store_allele_counts:
                allcounts = [allcounts_local[0]]

        # Divide reads by block (more efficient than scrolling the file every time)
        # FIXME: extract random subsample, assign to blocks, and only complete the missing blocks!
        reads_by_block = [[] for n_block in xrange((len_reference - pos_ref) //
                                                   (block_len_initial // 2))]
        bamfile.reset()
        for read in bamfile:
            if not read.is_proper_pair:
                continue
            pos_ref_tmp = pos_ref
            n_block = 1
            while (pos_ref_tmp < len_reference):
                block_len_tmp = min(block_len, len_reference - pos_ref)
                read_start = read.pos
                read_end = read.pos + sum(
                    bl for (bt, bl) in read.cigar if bt in (0, 2))
                if (pos_ref_tmp - 100 < read_start <= pos_ref_tmp) and \
                   (read_end >= pos_ref_tmp + block_len_tmp):
                    reads_by_block[n_block - 1].append(read)
                    break

                pos_ref_tmp += block_len_initial // 2
                n_block += 1

        # Stack local consensi on top of the first one
        n_block = 1
        while (pos_ref < len_reference):
            block_len = min(block_len, len_reference - pos_ref)
            if block_len < block_len_initial // 2:
                break
            if VERBOSE >= 2:
                print 'Block n', len(
                    consensi_local
                ) + 1, 'pos', pos_ref, 'to', pos_ref + block_len, 'block len', block_len

            # Get reads that cover the whole block
            reads = reads_by_block[n_block - 1]
            n_block += 1

            #FIXME
            #if n_block >= 2:
            #    print pos_ref, pos_ref + block_len
            #    import ipdb; ipdb.set_trace()

            # Internal coverage holes are not tolerated, but the last block
            # is allowed to be missing. However, we should try to squeeze out
            # all the bases by rescanning the reads a last time with less strict
            # criteria: if it has even one base more than what we have, add it
            if len(reads):
                full_cover = True
            else:
                full_cover = False
                bamfile.reset()
                reads = []
                for read in bamfile:
                    if not read.is_proper_pair:
                        continue
                    read_start = read.pos
                    read_end = read.pos + sum(
                        bl for (bt, bl) in read.cigar if bt in (0, 2))
                    if (read_start <= pos_ref) and (
                            read_end > pos_ref + block_len_initial // 2):
                        reads.append(read)

                if not len(reads):
                    if pos_ref + block_len < len_reference:
                        if VERBOSE >= 2:
                            print 'WARNING: consensus looks interrupted in mid-way'
                    break

            # Take a random subsample of reads. If it's a problematic block, not
            # fully covered, take more reads than usual
            if full_cover:
                np.random.shuffle(reads)
                reads = reads[:n_reads_per_ali]
            else:
                # Trim all, then take longest
                pass

            # Trim reads from the left to start all at the block start
            # NOTE: reads have been selected to start @ or before the block start!
            seqs = []
            for read in reads:
                pos_reft = read.pos

                # Find start of the block in the read
                start_found = False
                pos_read_start = 0
                pos_read_end = 0
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        if not start_found:
                            pos_read_start += bl
                        pos_read_end += bl
                    elif bt == 2:
                        if (not start_found) and (pos_reft + bl > pos_ref):
                            start_found = True
                        if pos_reft + bl > pos_ref + block_len:
                            break
                        pos_reft += bl
                    else:
                        if (not start_found) and (pos_reft + bl > pos_ref):
                            pos_read_start += pos_ref - pos_reft
                            start_found = True
                        if pos_reft + bl > pos_ref + block_len:
                            pos_read_end += pos_ref + block_len - pos_reft
                            break

                        if not start_found:
                            pos_read_start += bl
                        pos_read_end += bl
                        pos_reft += bl

                seq = SeqRecord(Seq(read.seq[pos_read_start:pos_read_end],
                                    ambiguous_dna),
                                id=read.qname)
                seqs.append(seq)

            # If it's a problematic block, take longest reads
            if not full_cover:
                seqs.sort(key=len, reverse=True)
                seqs = seqs[:n_reads_per_ali]

            #FIXME
            #if n_block >= 2:
            #    print pos_ref, pos_ref + block_len
            #    import ipdb; ipdb.set_trace()

            # Make local consensus using a multiple sequence alignment
            # --------------
            # -----   ------
            # --------   ---
            #---------------
            cons_local = build_local_consensus(
                seqs,
                VERBOSE=VERBOSE,
                store_allele_counts=store_allele_counts,
                full_cover=full_cover)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)

            pos_ref += block_len_initial // 2

            # Join block <-- to the stack, like this:
            # ---------------------------
            #                        --------------------
            if consensus is None:
                consensus = [consensi_local[0]]
                if store_allele_counts:
                    allcounts = [allcounts_local[0]]
            else:
                cons = cons_local
                seed = consensus[-1][-20:]
                sl = len(seed)
                pos_start = cons.find(seed)
                # Allow imperfect matches
                if pos_start == -1:
                    consm = np.fromstring(cons, 'S1')
                    seedm = np.fromstring(seed, 'S1')
                    n_matches = [(consm[i:i + sl] == seedm).sum()
                                 for i in xrange(len(cons) + 1 - len(seed))]
                    pos_start = np.argmax(n_matches)

                    # Try to only add non-bogus stuff
                    if n_matches[pos_start] < 0.66 * sl:
                        pos_start = -1
                        if VERBOSE >= 4:
                            print 'Block n.', len(
                                consensi_local
                            ) + ': cannot stack to previous one!'

                if pos_start != -1:
                    consensus.append(cons[pos_start + sl:])
                    if store_allele_counts:
                        allcounts.append(allcounts_local[-1][:,
                                                             pos_start + sl:])

                elif accept_holes:
                    consensus.append('N' * 10)
                    consensus.append(cons)
                    if store_allele_counts:
                        tmpall = np.zeros((allcounts_local[-1].shape[0], 10),
                                          int)
                        tmpall[-1] = 1
                        allcounts.append(tmpall)
                        allcounts.append(allcounts_local[-1])

    if consensus is None:
        raise ValueError('Consensus is still None: unable to build!')

    consensus = ''.join(consensus)

    if store_allele_counts:
        allcounts = np.concatenate(allcounts, axis=1)
        return (consensus, allcounts)

    return consensus
def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len_initial=100,
                    reads_per_alignment=31,
                    accept_holes=False,
                    store_allele_counts=False):
    '''Build a consensus from premapped and divided reads'''
    if VERBOSE:
        print 'Build consensus'

    import numpy as np
    import pysam
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna
    
    from hivwholeseq.utils.mapping import align_muscle
    # Three steps:
    # 1. collect reads uniformly across the fragment
    # 2. make local consensi
    # 3. join into fragmentwide consensus
    consensus = None
    consensi_local = []
    if store_allele_counts:
        allcounts_local = []
    pos_ref = 0
    block_len = block_len_initial
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Initial block
        if VERBOSE >= 2:
            print 'Block n', len(consensi_local) + 1, 
        for pos_first_block in xrange(len_reference):
            bamfile.reset()

            # The first block has to make a consensus for the FIRST base, this needs
            # at least ONE read starting exactly at the first position. Otherwise,
            # the same is repeated for position 2, and so on.
            reads = [read for read in bamfile if (read.is_proper_pair) and (read.pos == pos_first_block)]
            if not len(reads):
                continue

            np.random.shuffle(reads)
            reads = reads[:n_reads_per_ali]
            seqs = [SeqRecord(Seq(read.seq[:block_len], ambiguous_dna), id=read.qname)
                    for read in reads]
            cons_local = build_local_consensus(seqs, VERBOSE=VERBOSE, store_allele_counts=store_allele_counts)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)
            pos_ref += (block_len_initial // 2) * (1 + pos_first_block // (block_len_initial // 2))
            if VERBOSE >= 2:
                print 'pos', pos_first_block, 'to', pos_first_block + block_len, 'block len', block_len
            break

        # Start consensus
        if len(consensi_local) == 1:
            consensus = [consensi_local[0]]
            if store_allele_counts:
                allcounts = [allcounts_local[0]]

        # Divide reads by block (more efficient than scrolling the file every time)
        # FIXME: extract random subsample, assign to blocks, and only complete the missing blocks!
        reads_by_block = [[] for n_block in xrange((len_reference - pos_ref) // (block_len_initial // 2))]
        bamfile.reset()
        for read in bamfile:
            if not read.is_proper_pair:
                continue
            pos_ref_tmp = pos_ref
            n_block = 1
            while (pos_ref_tmp < len_reference):
                block_len_tmp = min(block_len, len_reference - pos_ref)
                read_start = read.pos
                read_end = read.pos + sum(bl for (bt, bl) in read.cigar if bt in (0, 2))
                if (pos_ref_tmp - 100 < read_start <= pos_ref_tmp) and \
                   (read_end >= pos_ref_tmp + block_len_tmp):
                    reads_by_block[n_block - 1].append(read)
                    break

                pos_ref_tmp += block_len_initial // 2
                n_block += 1

        # Stack local consensi on top of the first one
        n_block = 1
        while (pos_ref < len_reference):
            block_len = min(block_len, len_reference - pos_ref)
            if block_len < block_len_initial // 2:
                break
            if VERBOSE >= 2:
                print 'Block n', len(consensi_local) + 1, 'pos', pos_ref, 'to', pos_ref + block_len, 'block len', block_len

            # Get reads that cover the whole block
            reads = reads_by_block[n_block - 1]
            n_block += 1

            #FIXME
            #if n_block >= 2:
            #    print pos_ref, pos_ref + block_len
            #    import ipdb; ipdb.set_trace()

            # Internal coverage holes are not tolerated, but the last block
            # is allowed to be missing. However, we should try to squeeze out
            # all the bases by rescanning the reads a last time with less strict
            # criteria: if it has even one base more than what we have, add it
            if len(reads):
                full_cover= True
            else:
                full_cover= False
                bamfile.reset()
                reads = []
                for read in bamfile:
                    if not read.is_proper_pair:
                        continue
                    read_start = read.pos
                    read_end = read.pos + sum(bl for (bt, bl) in read.cigar if bt in (0, 2))
                    if (read_start <= pos_ref) and (read_end > pos_ref + block_len_initial // 2):
                        reads.append(read)

                if not len(reads):
                    if pos_ref + block_len < len_reference:
                        if VERBOSE >= 2:
                            print 'WARNING: consensus looks interrupted in mid-way'
                    break

            # Take a random subsample of reads. If it's a problematic block, not
            # fully covered, take more reads than usual
            if full_cover:
                np.random.shuffle(reads)
                reads = reads[:n_reads_per_ali]
            else:
                # Trim all, then take longest
                pass

            # Trim reads from the left to start all at the block start
            # NOTE: reads have been selected to start @ or before the block start!
            seqs = []
            for read in reads:
                pos_reft = read.pos

                # Find start of the block in the read
                start_found = False
                pos_read_start = 0
                pos_read_end = 0
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        if not start_found:
                            pos_read_start += bl
                        pos_read_end += bl
                    elif bt == 2:
                        if (not start_found) and (pos_reft + bl > pos_ref):
                            start_found = True
                        if pos_reft + bl > pos_ref + block_len:
                            break
                        pos_reft += bl
                    else:
                        if (not start_found) and (pos_reft + bl > pos_ref):
                            pos_read_start += pos_ref - pos_reft
                            start_found = True
                        if pos_reft + bl > pos_ref + block_len:
                            pos_read_end += pos_ref + block_len - pos_reft
                            break

                        if not start_found:
                            pos_read_start += bl
                        pos_read_end += bl
                        pos_reft += bl
                
                seq = SeqRecord(Seq(read.seq[pos_read_start: pos_read_end],
                                    ambiguous_dna), id=read.qname)
                seqs.append(seq)

            # If it's a problematic block, take longest reads
            if not full_cover:
                seqs.sort(key=len, reverse=True)
                seqs = seqs[:n_reads_per_ali]

            #FIXME
            #if n_block >= 2:
            #    print pos_ref, pos_ref + block_len
            #    import ipdb; ipdb.set_trace()

            # Make local consensus using a multiple sequence alignment
            # --------------
            # -----   ------
            # --------   ---
            #---------------
            cons_local = build_local_consensus(seqs, VERBOSE=VERBOSE,
                                               store_allele_counts=store_allele_counts,
                                               full_cover=full_cover)
            if store_allele_counts:
                (cons_local, allcount_local) = cons_local
                allcounts_local.append(allcount_local)
            consensi_local.append(cons_local)

            pos_ref += block_len_initial // 2

            # Join block <-- to the stack, like this:
            # ---------------------------
            #                        --------------------
            if consensus is None:
                consensus = [consensi_local[0]]
                if store_allele_counts:
                    allcounts = [allcounts_local[0]]
            else:
                cons = cons_local
                seed = consensus[-1][-20:]
                sl = len(seed)
                pos_start = cons.find(seed)
                # Allow imperfect matches
                if pos_start == -1:
                    consm = np.fromstring(cons, 'S1')
                    seedm = np.fromstring(seed, 'S1')
                    n_matches = [(consm[i: i + sl] == seedm).sum()
                                 for i in xrange(len(cons) + 1 - len(seed))]
                    pos_start = np.argmax(n_matches)
        
                    # Try to only add non-bogus stuff
                    if n_matches[pos_start] < 0.66 * sl:
                        pos_start = -1
                        if VERBOSE >= 4:
                            print 'Block n.', len(consensi_local)+': cannot stack to previous one!'
        
                if pos_start != -1:
                    consensus.append(cons[pos_start + sl:])
                    if store_allele_counts:
                        allcounts.append(allcounts_local[-1][:, pos_start + sl:])
                
                elif accept_holes:
                    consensus.append('N' * 10)
                    consensus.append(cons)
                    if store_allele_counts:
                        tmpall = np.zeros((allcounts_local[-1].shape[0], 10), int)
                        tmpall[-1] = 1
                        allcounts.append(tmpall)
                        allcounts.append(allcounts_local[-1])

    if consensus is None:
        raise ValueError('Consensus is still None: unable to build!')

    consensus = ''.join(consensus)

    if store_allele_counts:
        allcounts = np.concatenate(allcounts, axis=1)
        return (consensus, allcounts)

    return consensus
def build_consensus(bamfilename, len_reference, VERBOSE=0,
                    block_len=100,
                    reads_per_alignment=31,
                    deltamax=60):
    '''Build a consensus from mapped filtered reads'''
    if VERBOSE:
        print 'Build consensus'
    
    from operator import itemgetter
    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.miseq import alpha
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.sequence import build_local_consensus
    
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if VERBOSE >= 3:
            from hivwholeseq.utils.mapping import get_number_reads_open
            print 'The bamfile has', get_number_reads_open(bamfile), 'reads.'

        # Get first block covered, even if partially, and record where each read started
        if VERBOSE >= 2:
            print 'First block'

        block_len = block_len
        seqs = []
        n_block = 0
        while not seqs:
            start_block = n_block * (block_len // 2)
            for read in bamfile:
                if read.pos <= start_block:
                    seqs.append((read.pos, ('N' * read.pos) + read.seq[:block_len - read.pos]))
            bamfile.reset()
            n_block += 1
        
        # If there are too many reads, take the reads that start earliest
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=itemgetter(0))
            seqs = seqs[:reads_per_alignment]

        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, (pos, s) in enumerate(seqs)]
        consensus = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)

        # Block, by block, make local alignment and join to previous consensus
        # There are two ways of finishing the loop:
        # 1. if we cover all the way to the end of the reference, good
        # 2. if we find no reads fully covering a block BEFORE that, add a final block
        while start_block < len_reference:
            edges = (start_block, min(len_reference, start_block + block_len))

            if VERBOSE >= 2:
                print 'block n.', n_block, 'region:', edges

            seqs = pileup_trim_reads_coverfull(bamfile, edges, VERBOSE=VERBOSE)

            # If we do not find reads that fully cover, consider it the end of
            # the consensus, only the final block is missing
            if not seqs:
                break
            elif len(seqs) > reads_per_alignment:
                np.random.shuffle(seqs)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment
            # --------------
            # -----   ------
            # --------   ---
            #---------------
            seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                       for i, s in enumerate(seqs)]
            cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=True)

            # Join to the rest of the consensus, like this:
            # ---------------------------
            #                        --------------------
            consensus = join_block_to_consensus(consensus, cons_block,
                                                VERBOSE=VERBOSE, deltamax=deltamax)

            start_block += 2 * block_len // 3
            n_block += 1

        # If we cover the whole reference, good
        else:
            return consensus

        if VERBOSE >= 2:
            print 'final block'

        # If we broke out of the while, a final block is needed
        seqs = pileup_trim_reads_coverstart(bamfile, start_block, VERBOSE=VERBOSE)

        # Sort reads by length
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=len, reverse=True)
            seqs = seqs[:reads_per_alignment]

            # Complete with N, approximately
            sl = len(seqs[0])
            seqs = [s+('N' * (sl - len(s))) for s in seqs]


        seqrecs = [SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                   for i, s in enumerate(seqs)]
        cons_block = build_local_consensus(seqrecs, VERBOSE=VERBOSE, full_cover=False)
        consensus = join_block_to_consensus(consensus, cons_block, VERBOSE=VERBOSE,
                                            deltamax=deltamax)

    return consensus
# Example #4
# 0
def build_consensus(bamfilename,
                    len_reference,
                    VERBOSE=0,
                    block_len=100,
                    reads_per_alignment=31,
                    deltamax=60):
    '''Build a consensus from mapped filtered reads'''
    if VERBOSE:
        print 'Build consensus'

    from operator import itemgetter
    import numpy as np
    import pysam
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet.IUPAC import ambiguous_dna

    from hivwholeseq.utils.miseq import alpha
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.sequence import build_local_consensus

    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        if VERBOSE >= 3:
            from hivwholeseq.utils.mapping import get_number_reads_open
            print 'The bamfile has', get_number_reads_open(bamfile), 'reads.'

        # Get first block covered, even if partially, and record where each read started
        if VERBOSE >= 2:
            print 'First block'

        block_len = block_len
        seqs = []
        n_block = 0
        while not seqs:
            start_block = n_block * (block_len // 2)
            for read in bamfile:
                if read.pos <= start_block:
                    seqs.append(
                        (read.pos,
                         ('N' * read.pos) + read.seq[:block_len - read.pos]))
            bamfile.reset()
            n_block += 1

        # If there are too many reads, take the reads that start earliest
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=itemgetter(0))
            seqs = seqs[:reads_per_alignment]

        seqrecs = [
            SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
            for i, (pos, s) in enumerate(seqs)
        ]
        consensus = build_local_consensus(seqrecs,
                                          VERBOSE=VERBOSE,
                                          full_cover=False)

        # Block, by block, make local alignment and join to previous consensus
        # There are two ways of finishing the loop:
        # 1. if we cover all the way to the end of the reference, good
        # 2. if we find no reads fully covering a block BEFORE that, add a final block
        while start_block < len_reference:
            edges = (start_block, min(len_reference, start_block + block_len))

            if VERBOSE >= 2:
                print 'block n.', n_block, 'region:', edges

            seqs = pileup_trim_reads_coverfull(bamfile, edges, VERBOSE=VERBOSE)

            # If we do not find reads that fully cover, consider it the end of
            # the consensus, only the final block is missing
            if not seqs:
                break
            elif len(seqs) > reads_per_alignment:
                np.random.shuffle(seqs)
                seqs = seqs[:reads_per_alignment]

            # Make local consensus using a multiple sequence alignment
            # --------------
            # -----   ------
            # --------   ---
            #---------------
            seqrecs = [
                SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
                for i, s in enumerate(seqs)
            ]
            cons_block = build_local_consensus(seqrecs,
                                               VERBOSE=VERBOSE,
                                               full_cover=True)

            # Join to the rest of the consensus, like this:
            # ---------------------------
            #                        --------------------
            consensus = join_block_to_consensus(consensus,
                                                cons_block,
                                                VERBOSE=VERBOSE,
                                                deltamax=deltamax)

            start_block += 2 * block_len // 3
            n_block += 1

        # If we cover the whole reference, good
        else:
            return consensus

        if VERBOSE >= 2:
            print 'final block'

        # If we broke out of the while, a final block is needed
        seqs = pileup_trim_reads_coverstart(bamfile,
                                            start_block,
                                            VERBOSE=VERBOSE)

        # Sort reads by length
        if len(seqs) > reads_per_alignment:
            np.random.shuffle(seqs)
            seqs.sort(key=len, reverse=True)
            seqs = seqs[:reads_per_alignment]

            # Complete with N, approximately
            sl = len(seqs[0])
            seqs = [s + ('N' * (sl - len(s))) for s in seqs]

        seqrecs = [
            SeqRecord(Seq(s, ambiguous_dna), id=str(i), name=str(i))
            for i, s in enumerate(seqs)
        ]
        cons_block = build_local_consensus(seqrecs,
                                           VERBOSE=VERBOSE,
                                           full_cover=False)
        consensus = join_block_to_consensus(consensus,
                                            cons_block,
                                            VERBOSE=VERBOSE,
                                            deltamax=deltamax)

    return consensus