Example #1
    def do_write(self, outfp):
        outq = self.outqueue
        while self.worker_count > 0 or not outq.empty():
            try:
                g = outq.get(True, 1)
            except queue.Empty:
                continue

            for name, seq, qual in g.seqlist:
                if qual:
                    record = screed.Record(name=name,
                                           sequence=seq,
                                           quality=qual)
                else:
                    record = screed.Record(name=name, sequence=seq)
                write_record(record, outfp)

        if self.verbose:
            print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
                  (self.n_processed, self.n_written,
                   self.n_processed - self.n_written),
                  file=sys.stderr)
            print("processed %d bp / wrote %d bp / removed %d bp" %
                  (self.bp_processed, self.bp_written,
                   self.bp_processed - self.bp_written),
                  file=sys.stderr)
            discarded = self.bp_processed - self.bp_written
            f = float(discarded) / float(self.bp_processed) * 100
            print("discarded %.1f%%" % f, file=sys.stderr)
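The quality check in the writer above follows the usual FASTQ/FASTA convention: records that carry a quality string are written as FASTQ, all others as FASTA. Below is a minimal sketch of that dispatch using only screed; the emit() helper is hypothetical and merely stands in for write_record(), whose implementation is not shown in this excerpt.

import sys

import screed


def emit(record, outfp=sys.stdout):
    # Hypothetical stand-in for write_record(): FASTQ when the record has a
    # quality string, FASTA otherwise.
    if hasattr(record, 'quality') and record.quality:
        outfp.write('@{0}\n{1}\n+\n{2}\n'.format(
            record.name, record.sequence, record.quality))
    else:
        outfp.write('>{0}\n{1}\n'.format(record.name, record.sequence))


emit(screed.Record(name='r1', sequence='ACGT', quality='IIII'))  # FASTQ
emit(screed.Record(name='r2', sequence='ACGT'))                  # FASTA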
Example #2
def trim_record(countgraph,
                record,
                cutoff,
                variable_coverage=False,
                normalize_to=None):
    name = record.name
    seq = record.sequence
    seqN = record.cleaned_seq

    if variable_coverage:  # only trim when the sequence has high enough coverage
        if not countgraph.median_at_least(seqN, normalize_to):
            return record, False  # return unmodified

    _, trim_at = countgraph.trim_on_abundance(seqN, cutoff)

    # too short? eliminate read
    if trim_at < countgraph.ksize():
        return None, True

    # would we trim? if not, return unmodified.
    if trim_at == len(seq):
        return record, False

    # construct new record
    trim_seq = seq[:trim_at]
    if hasattr(record, 'quality'):
        trim_qual = record.quality[:trim_at]
        trim_rec = screed.Record(name=name,
                                 sequence=trim_seq,
                                 quality=trim_qual)
    else:
        trim_rec = screed.Record(name=name, sequence=trim_seq)

    return trim_rec, True
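trim_record relies on two countgraph queries: median_at_least() decides whether a read has enough coverage to be trimmed at all, and trim_on_abundance() finds the first position whose k-mer count drops below the cutoff. A minimal sketch of those two calls on a toy khmer Countgraph (k-mer size and table sizes chosen arbitrarily for illustration):

import khmer

K = 5
cg = khmer.Countgraph(K, 1e5, 4)    # toy table; sizes are illustrative only

# Load one sequence a few times so its k-mers become "high abundance".
for _ in range(5):
    cg.consume('ACGTACGTACGT')

read = 'ACGTACGTACGTTTTTT'          # the trailing k-mers were never counted
trimmed, trim_at = cg.trim_on_abundance(read, 2)
print(trimmed, trim_at)             # read is cut where abundance falls below 2
print(cg.median_at_least(read, 2))  # the variable-coverage check used above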
Example #3
class Test_BrokenPairedReader(object):
    stream = [screed.Record(name='seq1/1', sequence='A' * 5),
              screed.Record(name='seq1/2', sequence='A' * 4),
              screed.Record(name='seq2/1', sequence='A' * 5),
              screed.Record(name='seq3/1', sequence='A' * 3),
              screed.Record(name='seq3/2', sequence='A' * 5)]

    def testDefault(self):
        x, n, m = gather(self.stream, min_length=1)

        expected = [('seq1/1', 'seq1/2'),
                    ('seq2/1', None),
                    ('seq3/1', 'seq3/2')]
        assert x == expected, x
        assert m == 3
        assert n == 3, n

    def testMinLength(self):
        x, n, m = gather(self.stream, min_length=3)

        expected = [('seq1/1', 'seq1/2'),
                    ('seq2/1', None),
                    ('seq3/1', 'seq3/2')]
        assert x == expected, x
        assert m == 3
        assert n == 3, n

    def testMinLength_2(self):
        x, n, m = gather(self.stream, min_length=4)

        expected = [('seq1/1', 'seq1/2'),
                    ('seq2/1', None),
                    ('seq3/2', None)]
        assert x == expected, x
        assert m == 3
        assert n == 3, n

    def testForceSingle(self):
        x, n, m = gather(self.stream, force_single=True)

        expected = [('seq1/1', None),
                    ('seq1/2', None),
                    ('seq2/1', None),
                    ('seq3/1', None),
                    ('seq3/2', None)]
        assert x == expected, x
        assert m == 5
        assert n == 4, n

    def testForceSingleAndMinLength(self):
        x, n, m = gather(self.stream, min_length=5, force_single=True)

        expected = [('seq1/1', None),
                    ('seq2/1', None),
                    ('seq3/2', None)]
        assert x == expected, x
        assert m == 3, m
        assert n == 2, n
Example #4
def test_check_is_pair_4b():
    read1 = screed.Record(name='seq/1', sequence='AAA')
    read2 = screed.Record(name='seq/2', quality='###', sequence='AAA')

    try:
        check_is_pair(read1, read2)
        assert False                    # check_is_pair should fail here.
    except ValueError:
        pass
Example #5
def test_BrokenPairedReader_OnPairs_4():
    stream = [screed.Record(name='seq1/1', sequence='A' * 3),  # too short
              screed.Record(name='seq1/2', sequence='A' * 4),
              screed.Record(name='seq3/1', sequence='A' * 4),
              screed.Record(name='seq3/2', sequence='A' * 5)]

    x, n, m = gather(stream, min_length=4, require_paired=True)

    expected = [('seq3/1', 'seq3/2')]
    assert x == expected, x
    assert m == 1
    assert n == 0, n
Example #6
def test_paired_2thread_more_seq():
    class TSP_TestPairedProcess(ThreadedSequenceProcessor):
        # Override do_process to verify that paired ends arrive together in the same group.

        def do_process(self):
            inq = self.inqueue
            outq = self.outqueue

            while not self.done or not inq.empty():
                try:
                    g = inq.get(True, 1)
                except queue.Empty:
                    continue

                if len(g.seqlist) == 2:
                    first_rec = g.seqlist[0]
                    second_rec = g.seqlist[1]

                    assert first_rec['name'][:-1] == second_rec['name'][:-1]
                    assert first_rec['name'][-1] == '1'
                    assert second_rec['name'][-1] == '2'

                keep = []
                for record in g.seqlist:
                    name, sequence = self.process_fn(record)
                    if name:
                        keep.append((name, sequence, None))

                self.outqueue.put(SequenceGroup(0, keep))

            # end of thread; exit, decrement worker count.
            self.worker_count -= 1

    #

    tsp = TSP_TestPairedProcess(idem, 1, 1, verbose=False)

    input = [
        screed.Record(name='b/1', sequence='AAA'),
        screed.Record(name='a/1', sequence='AAA'),
        screed.Record(name='a/2', sequence='TTT'),
        screed.Record(name='c/2', sequence='AAA'),
    ]
    outfp = StringIO()

    tsp.start(input, outfp)

    x = load_records_d(outfp)
    assert len(x) == 4, x
    assert x['a/1'] == 'AAA'
    assert x['a/2'] == 'TTT'
    assert x['b/1'] == 'AAA'
    assert x['c/2'] == 'AAA'
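The process functions used throughout these tests (idem, every_other) are not included in this excerpt. As the do_process() loop above shows, a process function maps a record to a (name, sequence) pair, and a falsy name causes the read to be dropped. A hypothetical identity function in that style, consistent with how idem is used here:

def idem(record):
    # Keep every record unchanged; returning (None, None) would discard it.
    return record['name'], record['sequence']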
Example #7
def test_BrokenPairedReader_OnPairs_2():
    stream = [screed.Record(name='seq1/1', sequence='A' * 5),
              screed.Record(name='seq1/2', sequence='A' * 4),
              screed.Record(name='seq3/1', sequence='A' * 5),   # switched
              screed.Record(name='seq3/2', sequence='A' * 3)]   # wrt previous

    x, n, m = gather(stream, min_length=4, require_paired=True)

    expected = [('seq1/1', 'seq1/2')]
    assert x == expected, x
    assert m == 1
    assert n == 0, n
Example #8
def test_odd():
    tsp = ThreadedSequenceProcessor(every_other, 1, 1, verbose=False)

    inseqs = [
        screed.Record(name='a', sequence='AAA'),
        screed.Record(name='b', sequence='TTT'),
    ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_d(outfp)
    assert len(x) == 1, x
    assert x['b'] == 'TTT'
Example #9
def test_basic_fastq_like():
    tsp = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)

    inseqs = [
        screed.Record(name='a', sequence='AAA', quality='###'),
        screed.Record(name='b', sequence='TTT', quality='###'),
    ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_fastq(outfp)
    for i in x:
        assert i['quality'] == '###'
Example #10
def test_basic_2thread():
    tsp = ThreadedSequenceProcessor(idem, 2, 1, verbose=False)

    inseqs = [
        screed.Record(name='a', sequence='AAA'),
        screed.Record(name='b', sequence='TTT'),
    ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_d(outfp)
    assert len(x) == 2, x
    assert x['a'] == 'AAA'
    assert x['b'] == 'TTT'
Example #11
def merge_and_reannotate(pair, newname):
    """
    Assemble a pair of overlapping reads and resolve their interesting k-mers.

    When a pair of compatible reads is merged, the offsets of the interesting
    k-mers from one of the reads must be recomputed relative to the merged
    contig.
    """
    contig = merge_pair(pair)
    newrecord = screed.Record(name=newname,
                              sequence=contig,
                              ikmers=pair.tail.ikmers)
    ksize = len(pair.tail.ikmers[0].sequence)
    if pair.sameorient:
        minoffset2keep = len(pair.tail.sequence) - pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset > minoffset2keep]
        for k in keepers:
            ikmer = kevlar.KmerOfInterest(k.sequence, k.offset + pair.offset,
                                          k.abund)
            newrecord.ikmers.append(ikmer)
    else:
        maxoffset2keep = pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset < maxoffset2keep]
        for k in keepers:
            ikmer = kevlar.KmerOfInterest(
                kevlar.revcom(k.sequence),
                len(pair.head.sequence) - k.offset - ksize + pair.offset,
                k.abund,
            )
            newrecord.ikmers.append(ikmer)

    return newrecord
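To make the same-orientation offset arithmetic concrete, here it is worked through with plain integers (all values invented for illustration; no kevlar objects involved):

# Toy numbers: a 50 bp tail read, head read starting 30 bp into it, k = 17.
tail_len, offset, ksize = 50, 30, 17

# Head k-mers starting beyond this boundary extend past the end of the tail
# read, so they are not already annotated on the merged contig ...
minoffset2keep = tail_len - offset - ksize      # 50 - 30 - 17 = 3

# ... and each kept k-mer shifts by `offset` in merged-contig coordinates.
head_kmer_offset = 10                           # offset within the head read
merged_offset = head_kmer_offset + offset       # 10 + 30 = 40
print(minoffset2keep, merged_offset)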
Example #12
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)

    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        outfp = open(outfile, 'w')

        paired_iter = broken_paired_reader(ReadParser(infile), min_length=K,
                                           force_single=True)
        for n, is_pair, read1, read2 in paired_iter:
            name = read1.name
            seq = read1.sequence
            if 'N' in seq:
                # Skip reads with ambiguous bases rather than aborting main().
                continue

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                write_record(screed.Record(name=name, sequence=trim_seq), outfp)
Example #13
def test_ikmer_abund_after_recalc():
    """
    Ensure interesting k-mer abundances are correct after recalculation.

    The interesting k-mer has an advertised abundance of 28, but a true
    abundance (in `counts`) of 10. The readset "validate" function should check
    and correct this.
    """
    read = screed.Record(
        name='read1',
        sequence='AAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGT',
        ikmers=[KmerOfInterest('CATTGTCCTCGGGACTC', 13, [28, 0, 0])],
    )

    counts = khmer.Counttable(17, 1e5, 4)
    seq = 'TTCGTTCCCGAAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGTTCCGTCCTTCA'
    for _ in range(10):
        counts.consume(seq)

    rs = ReadSet()
    rs.add(read)
    assert read.ikmers[0].abund[0] == 28

    rs.validate(counts, minabund=8)
    assert rs.valid == (1, 1)
    assert read.ikmers[0].abund[0] == 10
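The corrected abundance of 10 comes directly from the Counttable: the interesting k-mer occurs once in the consumed sequence, and the sequence is consumed ten times. The counting step in isolation, using the same k-mer and sequence as the test above:

import khmer

kmer = 'CATTGTCCTCGGGACTC'
counts = khmer.Counttable(17, 1e5, 4)
seq = 'TTCGTTCCCGAAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGTTCCGTCCTTCA'
for _ in range(10):
    counts.consume(seq)
print(counts.get(kmer))   # -> 10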
Example #14
def assemble_fml_asm(readstream, logstream=sys.stderr):
    reads = [r for r in readstream]
    assembler = kevlar.assembly.fml_asm(reads)
    for n, contig in enumerate(assembler, 1):
        name = 'contig{:d}'.format(n)
        record = screed.Record(name=name, sequence=contig)
        yield record
Example #15
def record2a():
    return screed.Record(
        name='read2',
        sequence='ACGCAAAGCTATTTACGCAA',
        ikmers=[
            KmerOfInterest('CGCAA', 1, [15, 0, 0]),
            KmerOfInterest('CGCAA', 15, [15, 0, 0]),
        ],
    )
Example #16
def record6():
    return screed.Record(
        name='read6',
        sequence='TCACTGTCAAGAGAGGCCTACGGATTCGGTTACTG',
        ikmers=[
            KmerOfInterest('CTGTCAA', 3, [12, 0, 0]),
            KmerOfInterest('TGTCAAG', 4, [13, 0, 0]),
        ],
    )
Example #17
def record5():
    return screed.Record(
        name='read5',
        sequence='CTCTTCCGGCAGTCACTGTCAAGAGAGGGTGAACT',
        ikmers=[
            KmerOfInterest('CTGTCAA', 15, [12, 0, 0]),
            KmerOfInterest('TGTCAAG', 16, [13, 0, 0]),
        ],
    )
Example #18
def record4():
    # similar to record2 but with a single nucleotide mismatch
    return screed.Record(
        name='read4',
        sequence='ACGCAATGCTATTTAAAACC',
        ikmers=[
            KmerOfInterest('CGCAA', 1, [15, 0, 0]),
            KmerOfInterest('AAAAC', 14, [19, 1, 0]),
        ],
    )
Example #19
def record3():
    # reverse complement of record2
    return screed.Record(
        name='read3',
        sequence='GGTTTTAAATAGCTTTGCGT',
        ikmers=[
            KmerOfInterest('GTTTT', 1, [19, 1, 0]),
            KmerOfInterest('TTGCG', 14, [15, 0, 0]),
        ],
    )
Example #20
def record10():
    return screed.Record(
        name='read10',
        sequence=('CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCT'),
        ikmers=[
            KmerOfInterest('TCCCCACCCGGATACTT', 4, [28, 0, 0]),
            KmerOfInterest('CCCCACCCGGATACTTG', 5, [26, 0, 0]),
            KmerOfInterest('CCCGGATACTTGAAGCA', 10, [21, 0, 0]),
        ],
    )
Example #21
def test_BrokenPairedReader_lowercase():
    stream = [screed.Record(name='seq1/1', sequence='acgtn'),
              screed.Record(name='seq1/2', sequence='AcGtN'),
              screed.Record(name='seq1/2', sequence='aCgTn')]

    results = []
    for num, is_pair, read1, read2 in broken_paired_reader(stream):
        results.append((read1, read2))

    a, b = results[0]
    assert a.sequence == 'acgtn'
    assert a.cleaned_seq == 'ACGTA'
    assert b.sequence == 'AcGtN'
    assert b.cleaned_seq == 'ACGTA'

    c, d = results[1]
    assert c.sequence == 'aCgTn'
    assert c.cleaned_seq == 'ACGTA'
    assert d is None
Example #22
def assemble_jca(readstream,
                 memory,
                 maxfpr=0.01,
                 collapse=True,
                 kmers_to_ignore=set(),
                 logstream=sys.stderr):
    print('[kevlar::assemble::jca] loading reads', file=logstream)
    countgraph = None
    variants = kevlar.VariantSet()
    for record in readstream:
        for kmer in record.ikmers:
            variants.add_kmer(kmer.sequence, record.name)
            if countgraph is None:
                ksize = len(kmer.sequence)
                countgraph = khmer.Countgraph(ksize, memory / 4, 4)
        countgraph.consume(record.sequence)
    fpr = kevlar.sketch.estimate_fpr(countgraph)
    msg = '[kevlar::assemble::jca]    done loading reads'
    msg += ', {:d} distinct k-mers stored'.format(countgraph.n_unique_kmers())
    msg += '; estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        msg += ' (FPR too high, bailing out!!!)'
        raise kevlar.sketch.KevlarUnsuitableFPRError(msg)
    print(msg, file=logstream)

    asm = khmer.JunctionCountAssembler(countgraph)
    for kmer in variants.kmers:
        if kmer in kmers_to_ignore:
            continue
        contigs = asm.assemble(kmer)
        for contig in contigs:
            if hasattr(contig, 'decode'):
                contig = contig.decode()
            if contig == '':
                print('    WARNING: no assembly found for k-mer',
                      kmer,
                      file=logstream)
                continue
            variants.add_contig(contig, kmer)

    print('    {:d} linear paths'.format(variants.ncontigs), file=logstream)

    if collapse:
        print('[kevlar::assemble::jca] Collapsing contigs', file=logstream)
        variants.collapse()
        print('    {:d} collapsed contigs'.format(variants.ncontigs),
              file=logstream)

    for n, contigdata in enumerate(variants, 1):
        contig, contigrc, kmers, reads = contigdata
        contigname = 'contig{:d}:length={:d}:nkmers={:d}:nreads={:d}'.format(
            n, len(contig), len(kmers), len(reads))
        contig = screed.Record(name=contigname, sequence=contig)
        yield contig
Example #23
def record9():
    return screed.Record(
        name='read9',
        sequence=('AGCAAGGCGCTCGCGTCAACGAAGTGAGCTCCCGTGGTCTTGAGTTATCG'
                  'CCTCACATAC'),
        ikmers=[
            KmerOfInterest('AGCAAGGCGCTCGCGTC', 0, [25, 0, 0]),
            KmerOfInterest('GCAAGGCGCTCGCGTCA', 1, [39, 0, 0]),
            KmerOfInterest('GTTATCGCCTCACATAC', 42, [15, 1, 1]),
            KmerOfInterest('AGTTATCGCCTCACATA', 43, [15, 1, 0]),
        ],
    )
Example #24
def record8():
    return screed.Record(
        name='read8',
        sequence=('GTATGTGAGGCGATAACTCAAGACCACGGGAGCTCACTTCGTTGACGCGA'
                  'GCGCCTTGCT'),
        ikmers=[
            KmerOfInterest('GTATGTGAGGCGATAAC', 0, [15, 1, 0]),
            KmerOfInterest('TATGTGAGGCGATAACT', 1, [15, 1, 1]),
            KmerOfInterest('TGACGCGAGCGCCTTGC', 42, [39, 0, 0]),
            KmerOfInterest('GACGCGAGCGCCTTGCT', 43, [25, 0, 0]),
        ],
    )
Example #25
def picorecord2():
    return screed.Record(
        name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
        sequence=('TTACATTTATTCGTTTGTGCAGGCTGAGACCTCACTTCCAACTGTAATCCAAAAGCTTA'
                  'GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATA'),
        ikmers=[
            KmerOfInterest('TTTTTTGTTTCCCAAAGTAAGGCTG', 64, [19, 0, 0]),
            KmerOfInterest('TTTTTGTTTCCCAAAGTAAGGCTGA', 65, [18, 1, 0]),
            KmerOfInterest('TTTTGTTTCCCAAAGTAAGGCTGAG', 66, [18, 1, 0]),
            KmerOfInterest('TTTGTTTCCCAAAGTAAGGCTGAGT', 67, [18, 0, 0]),
            KmerOfInterest('TTGTTTCCCAAAGTAAGGCTGAGTG', 68, [17, 0, 0]),
        ],
    )
Example #26
def test_variant_mapping():
    contig = screed.Record(
        name='contig1',
        sequence='CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCGGTCTGCCCATGAGGCTGGGCCTGAGCCCC'
    )
    cutout = kevlar.reference.ReferenceCutout(
        defline='chr1_10000-10060',
        sequence='CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCAGTCTGCCCATGAGGCTGGGCCTGAGCCCC'
    )
    mapping = VariantMapping(contig, cutout, score=1e6, cigar='60M')

    assert mapping.seqid == 'chr1'
    assert mapping.interval == ('chr1', 10000, 10060)
Example #27
def picorecord1():
    return screed.Record(
        name='seq1_901350_901788_1:0:0_0:0:0_21ca1/2',
        sequence=('GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATATTTTCTCATAGTTTTGAC'
                  'AAAAACAAAGGAATCCTTAGTTATTAAACTCGGGAGTTTGA'),
        ikmers=[
            KmerOfInterest('TTTTTTGTTTCCCAAAGTAAGGCTG', 5, [19, 0, 0]),
            KmerOfInterest('TTTTTGTTTCCCAAAGTAAGGCTGA', 6, [18, 1, 0]),
            KmerOfInterest('TTTTGTTTCCCAAAGTAAGGCTGAG', 7, [18, 1, 0]),
            KmerOfInterest('TTTGTTTCCCAAAGTAAGGCTGAGT', 8, [18, 0, 0]),
            KmerOfInterest('TTGTTTCCCAAAGTAAGGCTGAGTG', 9, [17, 0, 0]),
        ],
    )
Example #28
def parse_augmented_fastx(instream):
    """
    Read augmented Fast[q|a] records into memory.

    The parsed records will have .name, .sequence, and .quality defined (unless
    the input is augmented FASTA), as well as a list of interesting k-mers in
    .ikmers. See
    http://kevlar.readthedocs.io/en/latest/formats.html#augmented-sequences for
    more information.
    """
    record = None
    for line in instream:
        if line.startswith(('@', '>')):
            if record is not None:
                yield record
            readid = line[1:].strip()
            seq = next(instream).strip()
            if line.startswith('@'):
                _ = next(instream)
                qual = next(instream).strip()
                record = screed.Record(name=readid,
                                       sequence=seq,
                                       quality=qual,
                                       ikmers=list())
            else:
                record = screed.Record(name=readid,
                                       sequence=seq,
                                       ikmers=list())
        elif line.endswith('#\n'):
            offset = len(line) - len(line.lstrip())
            line = line.strip()[:-1]
            abundances = re.split(r'\s+', line)
            kmer = abundances.pop(0)
            abundances = [int(a) for a in abundances]
            ikmer = kevlar.KmerOfInterest(sequence=kmer,
                                          offset=offset,
                                          abund=abundances)
            record.ikmers.append(ikmer)
    if record is not None:
        yield record
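For reference, an augmented FASTA record is an ordinary record followed by k-mer lines that are indented to the k-mer's offset and terminated with '#'. A small round trip through the parser, with invented abundance values, assuming it is importable as kevlar.parse_augmented_fastx:

from io import StringIO

import kevlar

data = ('>read1\n'
        'ACGTACGTACGTACGT\n'
        '    ACGTA 12 0 0#\n')   # 4 leading spaces -> k-mer offset 4

record = next(kevlar.parse_augmented_fastx(StringIO(data)))
print(record.name, record.sequence)
ik = record.ikmers[0]
print(ik.sequence, ik.offset, ik.abund)   # ACGTA 4 [12, 0, 0]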
Example #29
def picorecord3():
    return screed.Record(
        name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
        sequence=('TATTGTTCACTCAGCCTTACTTTGGGAAACAAAAAAAAAACTAAGCTTTTGGATTACAG'
                  'TTGGAAGTGAGGTCTCAGCCTGCACAAACGAATAAATGTAA'),
        ikmers=[
            KmerOfInterest('CAGCCTTACTTTGGGAAACAAAAAA', 11, [17, 0, 0]),
            KmerOfInterest('TCAGCCTTACTTTGGGAAACAAAAA', 10, [18, 0, 0]),
            KmerOfInterest('CTCAGCCTTACTTTGGGAAACAAAA', 9, [18, 1, 0]),
            KmerOfInterest('ACTCAGCCTTACTTTGGGAAACAAA', 8, [18, 1, 0]),
            KmerOfInterest('CACTCAGCCTTACTTTGGGAAACAA', 7, [19, 0, 0]),
        ],
    )
Example #30
def record7():
    return screed.Record(
        name='read7',
        sequence=('CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCTCAAGGTATGTGAGGC'
                  'GATAACTCAA'),
        ikmers=[
            KmerOfInterest('TCCCCACCCGGATACTT', 4, [28, 0, 0]),
            KmerOfInterest('CCCCACCCGGATACTTG', 5, [26, 0, 0]),
            KmerOfInterest('CCCGGATACTTGAAGCA', 10, [21, 0, 0]),
            KmerOfInterest('GGTATGTGAGGCGATAA', 38, [14, 0, 0]),
            KmerOfInterest('GTATGTGAGGCGATAAC', 39, [15, 1, 0]),
            KmerOfInterest('TATGTGAGGCGATAACT', 40, [15, 1, 1]),
        ],
    )