Ejemplo n.º 1
0
 def revcom(self):
     """
     Build the reverse-complement counterpart of this read/k-mer pairing.

     The read sequence is reverse complemented and the k-mer offset is
     recomputed relative to the end of the new sequence. The new record's
     k-mer lookup maps both this object's k-mer sequence and its reverse
     complement to the same relocated k-mer annotation.
     """
     rcseq = kevlar.revcom(self.read.sequence)
     rckmerseq = kevlar.revcom(self.kmerseq)
     # Offset of the k-mer when counted on the reverse-complemented sequence.
     rcoffset = len(rcseq) - self.kmer.offset - self.kmer.ksize
     rckmer = KmerOfInterest(self.kmer.ksize, rcoffset, self.kmer.abund)
     lookup = dict()
     lookup[self.kmerseq] = rckmer
     lookup[rckmerseq] = rckmer
     rcread = Record(
         self.read.name, rcseq, annotations=[rckmer], ikmers=lookup
     )
     return ReadWithKmer(rcread, self.kmerseq)
Ejemplo n.º 2
0
def test_alpha():
    """
    Filter the `collect.alpha.txt` reads and inspect the surviving k-mers.

    Eight records are expected to come out of the filter. Every annotated
    k-mer (in either orientation) must be one of the expected k-mers and
    must not be the rejected k-mer.
    """
    stream = kevlar.filter.filter(data_file('collect.alpha.txt'), memory=500)
    validated = list(stream)
    assert len(validated) == 8

    rejected = ['CAGGCCAGGGATCGCCGTG']
    expected = [
        'AGGGGCGTGACTTAATAAG', 'GGGCGTGACTTAATAAGGT',
        'TAGGGGCGTGACTTAATAA', 'GGGGCGTGACTTAATAAGG',
    ]
    for record in validated:
        for annotation in record.annotations:
            fwd = record.ikmerseq(annotation)
            assert fwd not in rejected and kevlar.revcom(fwd) not in rejected
            assert fwd in expected or kevlar.revcom(fwd) in expected
Ejemplo n.º 3
0
def merge_and_reannotate(pair, newname):
    """
    Assemble a pair of overlapping reads and resolve their interesting k-mers.

    When a pair of compatible reads is merged, the offset of the interesting
    k-mers must be computed for one of the reads. The new record starts with
    a copy of the tail read's k-mers; k-mers from the head read are then
    added with offsets recomputed relative to the merged sequence.

    - `pair`: the read pair to merge; its `tail`, `head`, `offset` and
      `sameorient` attributes are used here
    - `newname`: name assigned to the merged record

    Returns a new `screed.Record` holding the merged sequence and k-mers.
    """
    contig = merge_pair(pair)
    # Copy the tail's k-mer list: the head's k-mers are appended below, and
    # sharing the list object would silently modify the tail read as well.
    newrecord = screed.Record(name=newname,
                              sequence=contig,
                              ikmers=list(pair.tail.ikmers))
    ksize = len(pair.tail.ikmers[0].sequence)
    if pair.sameorient:
        # Keep only head k-mers whose shifted offset exceeds the threshold
        # derived from the tail length; shift them by the pair offset.
        minoffset2keep = len(pair.tail.sequence) - pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset > minoffset2keep]
        for k in keepers:
            ikmer = kevlar.KmerOfInterest(k.sequence, k.offset + pair.offset,
                                          k.abund)
            newrecord.ikmers.append(ikmer)
    else:
        # The head is in the opposite orientation: reverse complement each
        # kept k-mer and mirror its offset within the head read.
        maxoffset2keep = pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset < maxoffset2keep]
        for k in keepers:
            ikmer = kevlar.KmerOfInterest(
                kevlar.revcom(k.sequence),
                len(pair.head.sequence) - k.offset - ksize + pair.offset,
                k.abund,
            )
            newrecord.ikmers.append(ikmer)

    return newrecord
Ejemplo n.º 4
0
def augment(augseqstream, nakedseqstream, upint=10000):
    """
    Augment an unannotated stream of sequences.

    - `augseqstream`: a stream of sequences annotated with k-mers of interest
    - `nakedseqstream`: a stream of unannotated sequences, to be augmented with
      k-mers of interest from `augseqstream`
    - `upint`: a progress message is logged every `upint` annotated records

    This is a generator: it yields one new record per record in
    `nakedseqstream`, annotated with every k-mer (in either orientation)
    that was seen in `augseqstream`. If `augseqstream` supplies no k-mers at
    all, the records are yielded without annotations.
    """
    ksize = None
    ikmers = dict()
    for n, record in enumerate(augseqstream):
        if n > 0 and n % upint == 0:
            kevlar.plog('[kevlar::augment] processed', n, 'input reads')
        for ikmer in record.annotations:
            seq = record.ikmerseq(ikmer)
            # Store both orientations so a plain substring lookup suffices.
            ikmers[seq] = ikmer.abund
            ikmers[kevlar.revcom(seq)] = ikmer.abund
            ksize = ikmer.ksize

    for record in nakedseqstream:
        qual = getattr(record, 'quality', None)
        newrecord = kevlar.sequence.Record(
            name=record.name,
            sequence=record.sequence,
            quality=qual,
        )
        # Without any k-mers of interest the k-mer size is unknown and there
        # is nothing to annotate; previously this raised a TypeError.
        if ksize is not None:
            numkmers = len(record.sequence) - ksize + 1
            for offset in range(numkmers):
                kmer = record.sequence[offset:offset + ksize]
                if kmer in ikmers:
                    newrecord.annotate(kmer, offset, ikmers[kmer])
        yield newrecord
Ejemplo n.º 5
0
def print_read_pair(pair, position, outstream=sys.stderr):
    """
    Convenience print function for debugging.

    Writes a framed, text-aligned view of a read pair to `outstream`: a
    summary line, the tail sequence, a row of `|` marks starting at column
    `position` (one per base of the k-mer), and the head sequence indented
    by the pair offset. The head is reverse complemented first when the
    reads are not in the same orientation.
    """
    if pair.sameorient:
        headseq = pair.head.sequence
    else:
        headseq = kevlar.revcom(pair.head.sequence)
    ksize = len(pair.head.ikmers[0].sequence)

    arrow = '--(overlap={:d}, offset={:d}, sameorient={})-->'.format(
        pair.overlap, pair.offset, pair.sameorient)
    summary = '[kevlar::overlap] DEBUG: shared interesting k-mer '
    summary += '{:s} {:s} {:s}'.format(pair.tail.name, arrow, pair.head.name)

    border = '≠' * 80
    rows = [
        border,
        summary,
        '-' * 80,
        pair.tail.sequence,
        '{}{}'.format(' ' * position, '|' * ksize),
        '{}{}'.format(' ' * pair.offset, headseq),
        border,
        '',  # yields the blank line that closes the frame
    ]
    print(*rows, sep='\n', file=outstream)
Ejemplo n.º 6
0
def check_kmer_freq_in_read_pair(read1, read2, minkmer, debugstream=None):
    """
    Check interesting k-mer frequency in each read.

    When calculating offset between a pair of reads, do not use any interesting
    k-mers that occur multiple times in either read.

    - `read1`, `read2`: reads whose `ikmers` are searched for the k-mer
    - `minkmer`: the shared k-mer; its reverse complement is also accepted
    - `debugstream`: optional stream that receives a message when the k-mer
      is rejected for occurring more than once

    Returns the matching k-mer from each read as a `(kmer1, kmer2)` tuple, or
    `(None, None)` if the k-mer occurs multiple times in either read. Both
    reads are asserted to contain the k-mer at least once.
    """
    maxkmer = kevlar.revcom(minkmer)
    matches1 = [
        k for k in read1.ikmers
        if kevlar.same_seq(k.sequence, minkmer, maxkmer)
    ]
    matches2 = [
        k for k in read2.ikmers
        if kevlar.same_seq(k.sequence, minkmer, maxkmer)
    ]
    nmatches1 = len(matches1)
    nmatches2 = len(matches2)
    # Both reads must carry the k-mer; the second read was previously never
    # checked, which deferred the failure to an IndexError further down.
    assert nmatches1 > 0 and nmatches2 > 0, (nmatches1, nmatches2)
    if nmatches1 > 1 or nmatches2 > 1:
        if debugstream:
            message = (
                'stubbornly refusing to calculate offset bewteen {:s} and '
                '{:s}; interesting k-mer {:s} occurs multiple times'.format(
                    read1.name, read2.name, minkmer))
            print('[kevlar::overlap] INFO', message, file=debugstream)
        return None, None

    kmer1 = matches1[0]
    kmer2 = matches2[0]
    return kmer1, kmer2
Ejemplo n.º 7
0
def determine_relative_orientation(read1, read2, kmer1, kmer2):
    """
    Determine the relative orientation of a pair of overlapping reads.

    The shared k-mer's sequence tells whether the reads are in the same
    orientation, and its position in each read decides which read is the
    "tail" (k-mer further from the start) and which is the "head".

    Returns a tuple `(tail, head, offset, sameorient, tailpos)`, where
    `offset` is the difference between the k-mer positions in tail and head
    and `tailpos` is the k-mer position in the tail read.
    """
    ksize = len(kmer1.sequence)
    pos1, pos2 = kmer1.offset, kmer2.offset
    sameorient = kmer1.sequence == kmer2.sequence
    if not sameorient:
        assert kmer1.sequence == kevlar.revcom(kmer2.sequence)
        # Position of the k-mer as seen from the opposite strand of read2.
        pos2 = len(read2.sequence) - (kmer2.offset + ksize)

    # With equal positions, the longer read is treated as the tail.
    read1contained = (
        pos1 == pos2 and len(read2.sequence) > len(read1.sequence)
    )
    if pos2 > pos1 or read1contained:
        return read2, read1, pos2 - pos1, sameorient, pos2
    return read1, read2, pos1 - pos2, sameorient, pos1
Ejemplo n.º 8
0
def merge_pair(pair):
    """
    Assemble a pair of overlapping reads.

    Given a pair of compatible overlapping reads, collapse and merge them into
    a single sequence. If the (orientation-corrected) head sequence is fully
    contained in the tail, the tail sequence is returned unchanged. An
    assertion fails if the overlapping portions of the reads do not agree.
    """
    left = pair.tail.sequence
    right = pair.head.sequence
    shift = pair.offset
    if pair.sameorient is False:
        right = kevlar.revcom(pair.head.sequence)
    if right in pair.tail.sequence:
        return pair.tail.sequence
    if pair.swapped:
        # The roles are exchanged, so the offset is adjusted by the length
        # difference between the two sequences.
        shift += len(right) - len(left)
        left, right = right, left

    cut = len(left) - shift
    overlapping = left[shift:shift + pair.overlap]
    assert overlapping == right[:cut], \
        'error: attempted to assemble incompatible reads'

    return left + right[cut:]
Ejemplo n.º 9
0
 def __iter__(self):
     """
     Iterate over the stored contigs in sorted order.

     Each item is a tuple of the contig, its reverse complement, the k-mers
     stored for that contig, and the set of all reads associated with any of
     those k-mers.
     """
     for contig in sorted(self.contigs):
         contigrc = kevlar.revcom(contig)
         contigkmers = self.contigs[contig]
         # Pool the reads of every k-mer belonging to this contig.
         supporting = set().union(*(self.kmers[k] for k in contigkmers))
         yield contig, contigrc, contigkmers, supporting
Ejemplo n.º 10
0
 def __init__(self, read, kmerseq):
     """
     Pair a read with one of its k-mers.

     The k-mer annotation is looked up in the read's `ikmers` mapping by
     sequence. `num_occurrences` counts how often the given k-mer sequence
     or its reverse complement appears in the read sequence.
     """
     self.read = read
     self.kmer = read.ikmers[kmerseq]
     if self.kmer:
         self.kmerseq = self.read.ikmerseq(self.kmer)
     else:
         self.kmerseq = None
     forward = self.read.sequence.count(kmerseq)
     reverse = self.read.sequence.count(kevlar.revcom(kmerseq))
     self.num_occurrences = forward + reverse
Ejemplo n.º 11
0
def n_ikmers_present(record, window):
    """
    Count the annotated k-mers of `record` that occur in `window`.

    A k-mer counts once if either its sequence or the reverse complement of
    its sequence is contained in `window`.
    """
    def present(ikmer):
        seq = record.ikmerseq(ikmer)
        return seq in window or kevlar.revcom(seq) in window

    return sum(1 for ikmer in record.annotations if present(ikmer))
Ejemplo n.º 12
0
def validate_read_overlap(tail, head, offset, sameorient, minkmer, swapped):
    """
    Verify that the overlap between two reads is identical.

    - `tail`, `head`: the two reads; `head` is reverse complemented first
      when `sameorient` is false
    - `offset`: position of the head relative to the tail
    - `minkmer`: the shared k-mer, used only in the diagnostic message
    - `swapped`: selects which ends of the reads are compared when the head
      is not contained within the tail

    Returns the length of the overlap if the overlapping segments of the two
    reads are identical, or `None` if they differ. If the two segments have
    different lengths a diagnostic is printed to stderr and an assertion
    fails.
    """
    headseq = head.sequence if sameorient else kevlar.revcom(head.sequence)
    seg2offset = len(head.sequence) - len(tail.sequence) + offset
    if offset + len(headseq) <= len(tail.sequence):
        # The head lies entirely within the tail.
        segment1 = tail.sequence[offset:offset + len(headseq)]
        segment2 = headseq
        seg2offset = None
    elif swapped:
        segment1 = tail.sequence[:-offset]
        segment2 = headseq[seg2offset:]
    else:
        segment1 = tail.sequence[offset:]
        segment2 = headseq[:-seg2offset]

    overlap1 = len(segment1)
    overlap2 = len(segment2)
    if overlap1 != overlap2:  # pragma: no cover
        maxkmer = kevlar.revcom(minkmer)
        print(
            '[kevlar::overlap] ERROR '
            'tail="{tail}" head="{head}" offset={offset} altoffset={altoffset}'
            ' tailoverlap={overlap} headoverlap={headover} tailolvp={tailseq}'
            ' headolvp={headseq} kmer={minkmer},{maxkmer} tailseq={tailread}'
            ' headseq={headread}'.format(
                tail=tail.name,
                # Report the head read's own name (was the tail's name).
                head=head.name,
                offset=offset,
                altoffset=seg2offset,
                overlap=overlap1,
                headover=overlap2,
                tailseq=segment1,
                headseq=segment2,
                minkmer=minkmer,
                maxkmer=maxkmer,
                tailread=tail.sequence,
                headread=head.sequence,
            ),
            file=sys.stderr)
    assert overlap1 == overlap2
    if segment1 != segment2:
        return None
    return overlap1
Ejemplo n.º 13
0
def test_assemble_main(capsys):
    """
    Run the `assemble` entry point on `var1.reads.augfastq`.

    The captured standard output must contain the expected contig in either
    orientation.
    """
    infile = data_file('var1.reads.augfastq')
    args = kevlar.cli.parser().parse_args(['assemble', infile])
    kevlar.assemble.main(args)
    out, _ = capsys.readouterr()
    contig = ''.join([
        'GTCCTTGAGTCCATTAGAGACGGCTTCCGCCGTAGGCCCACTTCCTTAAAGTCGAGACTTCTA',
        'AAAACCGGGGTGTAACTCTTTTATTACAAAGCGACTATCCACCTGTAAGGACAGTGATA',
    ])
    for label, text in (('DEBUG', contig), ('DEBUG', out)):
        print(label, text)
    assert contig in out or kevlar.revcom(contig) in out
Ejemplo n.º 14
0
def test_validate():
    """
    Load `collect.alpha.txt` into a ReadSet, validate it, and check results.

    Checks the read set's validation summary, size and discard count, then
    confirms every remaining k-mer (in either orientation) is one of the
    expected k-mers and is not the rejected k-mer.
    """
    filelist = kevlar.tests.data_glob('collect.alpha.txt')
    readset = ReadSet(19, 5e3)
    for record in kevlar.seqio.afxstream(filelist):
        readset.add(record)
    readset.validate()

    assert readset.valid == (4, 32)
    assert len(readset) == 9
    assert readset.discarded == 1

    rejected = ['CAGGCCAGGGATCGCCGTG']
    expected = [
        'AGGGGCGTGACTTAATAAG', 'GGGCGTGACTTAATAAGGT',
        'TAGGGGCGTGACTTAATAA', 'GGGGCGTGACTTAATAAGG',
    ]
    for record in readset:
        for ikmer in record.ikmers:
            fwd = ikmer.sequence
            assert fwd not in rejected and kevlar.revcom(fwd) not in rejected
            assert fwd in expected or kevlar.revcom(fwd) in expected
Ejemplo n.º 15
0
def test_validate_with_mask():
    """
    Validate the `collect.beta.?.txt` reads with one k-mer masked out.

    After validation, the masked k-mer must not remain among the read set's
    k-mers in either orientation.
    """
    masked = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(masked)

    filelist = kevlar.tests.data_glob('collect.beta.?.txt')
    readset, countgraph = kevlar.filter.load_input(filelist, 19, 5e3)
    kevlar.filter.validate_and_print(readset, countgraph, mask)
    assert readset.valid == (3, 24)
    for record in readset:
        for ikmer in record.ikmers:
            fwd = ikmer.sequence
            assert fwd != masked
            assert kevlar.revcom(fwd) != masked
Ejemplo n.º 16
0
def test_validate():
    """
    Load and validate `collect.alpha.txt` through the filter module.

    Checks the read set's validation summary, size and discard count, then
    confirms every remaining k-mer (in either orientation) is one of the
    expected k-mers and is not the rejected k-mer.
    """
    filelist = kevlar.tests.data_glob('collect.alpha.txt')
    readset, countgraph = kevlar.filter.load_input(filelist, 19, 5e3)
    kevlar.filter.validate_and_print(readset, countgraph)

    assert readset.valid == (4, 32)
    assert len(readset) == 9
    assert readset.discarded == 1

    rejected = ['CAGGCCAGGGATCGCCGTG']
    expected = [
        'AGGGGCGTGACTTAATAAG',
        'GGGCGTGACTTAATAAGGT',
        'TAGGGGCGTGACTTAATAA',
        'GGGGCGTGACTTAATAAGG',
    ]
    for record in readset:
        for ikmer in record.ikmers:
            fwd = ikmer.sequence
            assert fwd not in rejected and kevlar.revcom(fwd) not in rejected
            assert fwd in expected or kevlar.revcom(fwd) in expected
Ejemplo n.º 17
0
def augment(augseqstream, nakedseqstream, collapsemates=False, upint=10000):
    """
    Augment an unannotated stream of sequences.

    - `augseqstream`: a stream of sequences annotated with k-mers of interest
    - `nakedseqstream`: a stream of unannotated sequences, to be augmented with
      k-mers of interest from `augseqstream`
    - `collapsemates`: if true, every output record receives the sorted list
      of all mate sequences seen in `augseqstream`; otherwise a record only
      receives the mate stored under its own name, if any
    - `upint`: a progress message is printed every `upint` annotated records

    This is a generator: it yields one new record per record in
    `nakedseqstream`, annotated with every k-mer (in either orientation)
    that was seen in `augseqstream`. If `augseqstream` supplies no k-mers at
    all, the records are yielded without annotations.
    """
    ksize = None
    ikmers = dict()
    mateseqs = dict()
    for n, record in enumerate(augseqstream):
        if n > 0 and n % upint == 0:
            print('[kevlar::augment] processed',
                  n,
                  'input reads',
                  file=sys.stderr)
        for ikmer in record.annotations:
            seq = record.ikmerseq(ikmer)
            # Store both orientations so a plain substring lookup suffices.
            ikmers[seq] = ikmer.abund
            ikmers[kevlar.revcom(seq)] = ikmer.abund
            ksize = ikmer.ksize
        assert len(record.mates) in (0, 1)
        if len(record.mates) == 1:
            mateseqs[record.name] = record.mates[0]
    print('[kevlar::augment] done loading input', file=sys.stderr)

    # The collapsed mate list is identical for every record: sort it once
    # here and hand each record its own copy.
    allmates = sorted(mateseqs.values()) if collapsemates else None

    for record in nakedseqstream:
        qual = getattr(record, 'quality', None)
        if collapsemates:
            mates = list(allmates)
        elif record.name in mateseqs:
            mates = [mateseqs[record.name]]
        else:
            mates = list()
        newrecord = kevlar.sequence.Record(
            name=record.name,
            sequence=record.sequence,
            quality=qual,
            mates=mates,
        )
        # Without any k-mers of interest the k-mer size is unknown and there
        # is nothing to annotate; previously this raised a TypeError.
        if ksize is not None:
            numkmers = len(record.sequence) - ksize + 1
            for offset in range(numkmers):
                kmer = record.sequence[offset:offset + ksize]
                if kmer in ikmers:
                    newrecord.annotate(kmer, offset, ikmers[kmer])
        yield newrecord
Ejemplo n.º 18
0
def test_validate_with_mask():
    """
    Validate a ReadSet built from `collect.beta.?.txt` with one k-mer masked.

    After validation, the masked k-mer must not remain among the read set's
    k-mers in either orientation.
    """
    masked = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(masked)

    filelist = kevlar.tests.data_glob('collect.beta.?.txt')
    readset = ReadSet(19, 5e3)
    for record in kevlar.seqio.afxstream(filelist):
        readset.add(record)
    readset.validate(mask=mask)
    assert readset.valid == (3, 24)
    for record in readset:
        for ikmer in record.ikmers:
            fwd = ikmer.sequence
            assert fwd != masked
            assert kevlar.revcom(fwd) != masked
Ejemplo n.º 19
0
 def collapse(self):
     """
     Fold contigs that are contained in another contig into that contig.

     Contigs are visited from longest to shortest. A contig that is a
     substring (in either orientation) of an already-retained contig has its
     k-mers merged into that contig's entry and is then removed.
     """
     retained = set()
     longest_first = sorted(self.contigs, key=len, reverse=True)
     for contig in longest_first:
         contigrc = kevlar.revcom(contig)
         for parent in retained:
             if contig in parent or contigrc in parent:
                 absorbed = self.contigs[contig]
                 self.contigs[parent] = self.contigs[parent].union(absorbed)
                 del self.contigs[contig]
                 break
         else:
             # No retained contig contains this one, so it is kept.
             retained.add(contig)
Ejemplo n.º 20
0
def call(targetlist,
         querylist,
         match=1,
         mismatch=2,
         gapopen=5,
         gapextend=0,
         ksize=31):
    """
    Wrap the `kevlar call` procedure as a generator function.

    Input is the following.
    - an iterable containing one or more target sequences from the reference
      genome, stored as khmer or screed sequence records
    - an iterable containing one or more contigs assembled by kevlar, stored as
      khmer or screed sequence records
    - alignment match score (integer)
    - alignment mismatch penalty (integer)
    - alignment gap open penalty (integer)
    - alignment gap extension penalty (integer)
    - k-mer size, passed through to `make_call`

    Queries are processed from longest to shortest. Each query is aligned
    against every target (in order of target name) in both orientations, and
    the highest-scoring target is kept; on ties the first target wins. If the
    best alignment is to the reverse strand, the query record's sequence is
    replaced in place by its reverse complement.

    The function yields every item produced by `make_call` for each query and
    its best target.
    """
    queries = sorted(querylist, reverse=True, key=len)
    if not queries:
        return
    # Sort the targets once, up front. Sorting inside the query loop repeated
    # the work for every query and, when `targetlist` is a one-shot iterator,
    # left nothing to align against after the first query.
    targets = sorted(targetlist, key=lambda record: record.name)
    for query in queries:
        bestcigar = None
        bestscore = None
        besttarget = None
        bestorientation = None
        for target in targets:
            cigar, score, strand = align_both_strands(target.sequence,
                                                      query.sequence, match,
                                                      mismatch, gapopen,
                                                      gapextend)
            if bestscore is None or score > bestscore:
                bestscore = score
                bestcigar = cigar
                besttarget = target
                bestorientation = strand

        if bestorientation == -1:
            query.sequence = kevlar.revcom(query.sequence)
        for varcall in make_call(besttarget, query, bestcigar, ksize):
            yield varcall
Ejemplo n.º 21
0
def align_both_strands(targetseq,
                       queryseq,
                       match=1,
                       mismatch=2,
                       gapopen=5,
                       gapextend=0):
    """
    Align a query to a target in both orientations and keep the better one.

    Returns a tuple `(cigar, score, strand)`, where `strand` is 1 for the
    query as given and -1 for its reverse complement. The reverse complement
    is chosen only if its score is strictly higher.
    """
    fwdcigar, fwdscore = kevlar.align(targetseq, queryseq, match, mismatch,
                                      gapopen, gapextend)
    revcigar, revscore = kevlar.align(targetseq, kevlar.revcom(queryseq),
                                      match, mismatch, gapopen, gapextend)

    if revscore > fwdscore:
        return revcigar, revscore, -1
    return fwdcigar, fwdscore, 1
Ejemplo n.º 22
0
def main(args):
    """
    Re-emit augmented records for the reads found in a Fastq file.

    All augmented records from `args.augfastq` are loaded and indexed by
    name. For every read in `args.fastq`, the augmented record of the same
    name is written to `args.out`. When the read is shorter than the stored
    record, only k-mers whose sequence (or its reverse complement) still
    occurs in the read are kept, and the record is skipped entirely if none
    remain. Note that the stored augmented record is what gets written, not
    the read parsed from `args.fastq`.
    """
    instream = kevlar.open(args.augfastq, 'r')
    reads = {
        record.name: record
        for record in kevlar.parse_augmented_fastx(instream)
    }

    reader = khmer.ReadParser(args.fastq)
    outstream = kevlar.open(args.out, 'w')
    for read in reader:
        augrecord = reads[read.name]
        if len(read.sequence) < len(augrecord.sequence):
            survivors = [
                kmer for kmer in augrecord.ikmers
                if kmer.sequence in read.sequence or
                kevlar.revcom(kmer.sequence) in read.sequence
            ]
            if not survivors:
                continue
            augrecord.ikmers = survivors
        kevlar.print_augmented_fastx(augrecord, outstream)
Ejemplo n.º 23
0
 def varseq(self):
     """
     Return the contig sequence in the orientation given by `self.strand`.

     The sequence is returned as is for strand 1 and reverse complemented
     otherwise; the strand is asserted to be either -1 or 1.
     """
     assert self.strand in (-1, 1)
     if self.strand == 1:
         return self.contig.sequence
     return kevlar.revcom(self.contig.sequence)
Ejemplo n.º 24
0
 def ikmers(self):
     """
     Yield the sequence of every k-mer annotated on the contig.

     For each annotation, the k-mer sequence is yielded first, followed by
     its reverse complement.
     """
     for annotation in self.contig.annotations:
         forward = self.contig.ikmerseq(annotation)
         yield forward
         yield kevlar.revcom(forward)
Ejemplo n.º 25
0
def test_assumptions(kmer):
    """
    Check that a k-mer and its reverse complement hash identically.

    Both the table's `hash` value and the first entry of `get_kmer_hashes`
    must be the same for the two orientations.
    """
    table = Counttable(27, 1e5, 2)
    revkmer = kevlar.revcom(kmer)
    fwdhash = table.hash(kmer)
    revhash = table.hash(revkmer)
    assert fwdhash == revhash
    fwdfirst = table.get_kmer_hashes(kmer)[0]
    revfirst = table.get_kmer_hashes(revkmer)[0]
    assert fwdfirst == revfirst