Exemple #1
0
def filter(readstream,
           mask=None,
           minabund=5,
           ksize=31,
           memory=1e6,
           maxfpr=0.001,
           logstream=sys.stderr):
    """Load reads, validate their k-mers, and yield the resulting records.

    Every record from `readstream` is added to an `AnnotatedReadSet` built
    with `ksize` and `memory`. The value returned by `summarize_readset` is
    treated as a false positive rate and checked against `maxfpr` before
    validation proceeds. Progress and timing messages go to `logstream`.

    This is a generator: nothing is loaded until the first item is requested.

    Raises:
        KevlarUnsuitableFPRError: if the reported FPR exceeds `maxfpr`.
    """
    timer = kevlar.Timer()
    timer.start('recalc')
    print('[kevlar::filter] Loading input; recalculate k-mer abundances,',
          'de-duplicate reads and merge k-mers',
          file=logstream)
    readset = kevlar.seqio.AnnotatedReadSet(ksize, memory)
    for record in readstream:
        readset.add(record)
    fpr = summarize_readset(readset, logstream)
    if fpr > maxfpr:
        raise KevlarUnsuitableFPRError('FPR too high, bailing out!!!')
    elapsed = timer.stop('recalc')
    print('[kevlar::filter] Input loaded in {:.2f} sec'.format(elapsed),
          file=logstream)

    timer.start('validate')
    print('[kevlar::filter] Validate k-mers and print reads', file=logstream)
    readset.validate(mask=mask, minabund=minabund)
    # Pre-bind the counter: if the read set yields nothing, the loop target
    # would otherwise never be assigned and the summary call would fail.
    n = 0
    for n, record in enumerate(readset, 1):
        yield record
    summarize_validate(readset, n, logstream)
    elapsed = timer.stop('validate')
    print('[kevlar::filter] k-mers validated and reads printed',
          'in {:.2f} sec'.format(elapsed),
          file=logstream)
Exemple #2
0
def main(args):
    """Command-line driver: load one sample's sequence file into a sketch.

    `args` is expected to carry the attributes read below (an argparse
    namespace, presumably). `--num-bands` and `--band` must be given
    together or not at all. Total runtime is reported to `args.logfile`.

    Raises:
        ValueError: if exactly one of `num_bands` / `band` is None.
    """
    numbands_given = args.num_bands is not None
    band_given = args.band is not None
    if numbands_given != band_given:
        raise ValueError('Must specify --num-bands and --band together')

    # A falsy band (None or 0) means "no band"; otherwise shift down by one.
    myband = None
    if args.band:
        myband = args.band - 1

    # Replace the mask argument with the loaded sketch.
    if args.mask:
        args.mask = kevlar.sketch.load(args.mask)

    stopwatch = kevlar.Timer()
    stopwatch.start()

    counter_size = args.counter_size
    options = dict(
        count=counter_size > 1,
        smallcount=counter_size == 4,
        mask=args.mask,
        consume_masked=args.count_masked,
        numbands=args.num_bands,
        band=myband,
        numthreads=args.threads,
        outfile=args.counttable,
        logfile=args.logfile,
    )
    sketch = load_sample_seqfile(
        args.seqfile, args.ksize, args.memory, args.max_fpr, **options
    )

    runtime = stopwatch.stop()
    report = 'Total time: {:.2f} seconds'.format(runtime)
    print('[kevlar::count]', report, file=args.logfile)
Exemple #3
0
def main(args):
    """Command-line driver: filter augmented reads and write the survivors.

    Loads the mask via `load_mask`, streams reads from `args.augfastq`
    through `filter`, and prints each yielded record to `args.out` in
    augmented Fastx format. Total runtime is reported to `args.logfile`.
    """
    stopwatch = kevlar.Timer()
    stopwatch.start()

    mask = load_mask(
        args.mask, args.ksize, args.mask_memory,
        maxfpr=args.mask_max_fpr,
        savefile=args.save_mask,
        logstream=args.logfile,
    )
    reads = kevlar.seqio.afxstream(args.augfastq)
    outstream = kevlar.open(args.out, 'w')

    settings = dict(
        minabund=args.min_abund,
        ksize=args.ksize,
        memory=args.abund_memory,
        maxfpr=args.abund_max_fpr,
        logstream=args.logfile,
    )
    for validated in filter(reads, mask, **settings):
        kevlar.print_augmented_fastx(validated, outstream)

    runtime = stopwatch.stop()
    report = 'Total time: {:.2f} seconds'.format(runtime)
    print('[kevlar::filter]', report, file=args.logfile)
Exemple #4
0
def main(args):
    """Command-line driver: find novel k-mers in case reads.

    Loads control and case samples via `load_samples`, then streams all case
    input files through `novel` and prints each yielded read to `args.out`
    in augmented Fastx format. Each stage is timed and reported to
    `args.logfile`.

    Raises:
        ValueError: if only one of `num_bands` / `band` is truthy.
    """
    clock = kevlar.Timer()
    clock.start()
    numbands_missing = not args.num_bands
    band_missing = not args.band
    if numbands_missing != band_missing:
        raise ValueError('Must specify --num-bands and --band together')
    myband = None if not args.band else args.band - 1

    # Stage 1: load control samples, then case samples.
    clock.start('loadall')
    print('[kevlar::novel] Loading control samples', file=args.logfile)
    clock.start('loadctrl')
    controls = load_samples(
        args.control_counts, args.control, args.ksize, args.memory,
        args.max_fpr, args.num_bands, myband, args.threads, args.logfile,
    )
    ctrl_secs = clock.stop('loadctrl')
    report = 'Control samples loaded in {:.2f} sec'.format(ctrl_secs)
    print('[kevlar::novel]', report, file=args.logfile)

    print('[kevlar::novel] Loading case samples', file=args.logfile)
    clock.start('loadcases')
    cases = load_samples(
        args.case_counts, args.case, args.ksize, args.memory,
        args.max_fpr, args.num_bands, myband, args.threads, args.logfile,
    )
    case_secs = clock.stop('loadcases')
    print('[kevlar::novel] Case samples loaded in {:.2f} sec'.format(case_secs),
          file=args.logfile)
    load_secs = clock.stop('loadall')
    print('[kevlar::novel] All samples loaded in {:.2f} sec'.format(load_secs),
          file=args.logfile)

    # Stage 2: scan the case reads.
    clock.start('iter')
    ncases = len(args.case)
    report = 'Iterating over reads from {:d} case sample(s)'.format(ncases)
    print('[kevlar::novel]', report, file=args.logfile)
    outstream = kevlar.open(args.out, 'w')

    # Flatten the per-sample file lists into a single list of input files.
    infiles = []
    for filelist in args.case:
        infiles.extend(filelist)
    caserecords = kevlar.multi_file_iter_screed(infiles)

    settings = dict(
        ksize=args.ksize,
        abundscreen=args.abund_screen,
        casemin=args.case_min,
        ctrlmax=args.ctrl_max,
        numbands=args.num_bands,
        band=myband,
        skipuntil=args.skip_until,
        updateint=args.upint,
        logstream=args.logfile,
    )
    for augmented_read in novel(caserecords, cases, controls, **settings):
        kevlar.print_augmented_fastx(augmented_read, outstream)

    iter_secs = clock.stop('iter')
    report = 'Iterated over all case reads in {:.2f} seconds'.format(iter_secs)
    print('[kevlar::novel]', report, file=args.logfile)

    runtime = clock.stop()
    report = 'Total time: {:.2f} seconds'.format(runtime)
    print('[kevlar::novel]', report, file=args.logfile)
Exemple #5
0
def filter(readfile, mask=None, memory=1e6, maxfpr=0.01, casemin=6, ctrlmax=1):
    """Filter an augmented Fastx file in two passes, yielding reads.

    `readfile` is opened and parsed twice: once for `first_pass`, whose
    result is checked by `check_fpr` against `maxfpr`, and once for
    `second_pass`, whose yielded reads are passed through to the caller.
    Total runtime is logged via `kevlar.plog` once the generator is
    exhausted.
    """
    stopwatch = kevlar.Timer()
    stopwatch.start()

    def fresh_reader():
        # Each pass needs its own parser over a newly opened handle.
        return kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))

    counts = first_pass(fresh_reader(), mask, memory, stopwatch)
    check_fpr(counts, maxfpr)

    survivors = second_pass(fresh_reader(), counts, casemin, ctrlmax,
                            stopwatch)
    for read in survivors:
        yield read

    runtime = stopwatch.stop()
    report = 'Total time: {:.2f} seconds'.format(runtime)
    kevlar.plog('[kevlar::filter]', report)
Exemple #6
0
def main(args):
    """Command-line driver: load control samples, then case samples.

    Both groups are loaded with `kevlar.counting.load_samples` using the
    same settings. Only the case call may receive a mask: the first case
    outfile, when `args.mem_frac` is truthy. Timings are reported to
    `args.logfile`.

    Raises:
        ValueError: if exactly one of `num_bands` / `band` is None.
    """
    numbands_given = args.num_bands is not None
    band_given = args.band is not None
    if numbands_given != band_given:
        raise ValueError('Must specify --num-bands and --band together')
    myband = None if not args.band else args.band - 1

    clock = kevlar.Timer()
    clock.start()

    # Keyword settings common to the control and case loading calls.
    shared = dict(
        memfraction=args.mem_frac,
        maxfpr=args.max_fpr,
        maxabund=args.ctrl_max,
        numbands=args.num_bands,
        band=myband,
        numthreads=args.threads,
        logfile=args.logfile,
    )

    clock.start('loadctrl')
    print('[kevlar::count] Loading control samples', file=args.logfile)
    ctrl_outfiles, ctrl_infiles = split_infiles_outfiles(args.control)
    controls = kevlar.counting.load_samples(
        ctrl_infiles, args.ksize, args.memory,
        outfiles=ctrl_outfiles, mask=None, **shared
    )
    ctrl_secs = clock.stop('loadctrl')
    report = '{:d} samples loaded in {:.2f} sec'.format(len(controls),
                                                        ctrl_secs)
    print('[kevlar::count]', report, file=args.logfile)

    print('[kevlar::count] Loading case samples', file=args.logfile)
    clock.start('loadcase')
    case_outfiles, case_infiles = split_infiles_outfiles(args.case)
    casemask = None
    if args.mem_frac:
        casemask = case_outfiles[0]
    cases = kevlar.counting.load_samples(
        case_infiles, args.ksize, args.memory,
        outfiles=case_outfiles, mask=casemask, **shared
    )
    case_secs = clock.stop('loadcase')
    report = '{:d} sample(s) loaded in {:.2f} sec'.format(len(cases),
                                                          case_secs)
    print('[kevlar::count]', report, file=args.logfile)

    runtime = clock.stop()
    report = 'Total time: {:.2f} seconds'.format(runtime)
    print('[kevlar::count]', report, file=args.logfile)
Exemple #7
0
 def __init__(self,
              message,
              interval=10,
              breaks=None,
              usetimer=False):
     """Set up the indicator's message, counter, and update schedule.

     Args:
         message: text stored on the instance as `self.message`.
         interval: initial update interval; also the first `nextupdate`.
         breaks: iterable of break points; defaults to
             `[100, 1000, 10000]`. It is copied, so neither the caller's
             list nor a shared default is aliased by this instance.
         usetimer: if true, create and start a `kevlar.Timer`.
     """
     self.message = message
     self.counter = 0
     self.interval = interval
     self.nextupdate = interval
     # A list literal in the signature would be created once and shared by
     # every instance, so the default is built here instead.
     self.breaks = [100, 1000, 10000] if breaks is None else list(breaks)
     self.timer = None
     if usetimer:
         self.timer = kevlar.Timer()
         self.timer.start()
Exemple #8
0
def partition(readstream,
              strict=False,
              minabund=None,
              maxabund=None,
              dedup=True,
              gmlfile=None,
              logstream=sys.stderr):
    """Group reads into partitions of a read graph, yielding one at a time.

    The reads are loaded into a `kevlar.ReadGraph`, edges are populated in
    strict or relaxed mode, and each partition reported by the graph is
    yielded as a list of records. If `gmlfile` is given the graph is also
    exported with `kevlar.to_gml`. Stage timings go to `logstream`.
    """
    clock = kevlar.Timer()
    clock.start()

    # Stage 1: load the reads into the graph.
    clock.start('loadreads')
    print('[kevlar::partition] Loading reads', file=logstream)
    graph = kevlar.ReadGraph()
    graph.load(readstream, minabund=minabund, maxabund=maxabund)
    load_secs = clock.stop('loadreads')
    print('[kevlar::partition]',
          'Reads loaded in {:.2f} sec'.format(load_secs),
          file=logstream)

    # Stage 2: connect the reads.
    clock.start('buildgraph')
    if strict:
        mode = 'strict'
    else:
        mode = 'relaxed'
    report = 'Building read graph in {:s} mode'.format(mode)
    print('[kevlar::partition]', report, file=logstream)
    graph.populate_edges(strict=strict)
    build_secs = clock.stop('buildgraph')
    print('[kevlar::partition]',
          'Graph built in {:.2f} sec'.format(build_secs),
          file=logstream)

    if gmlfile:  # pragma: no cover
        kevlar.to_gml(graph, gmlfile, logstream)

    # Stage 3: emit one list of records per partition.
    clock.start('partition')
    print('[kevlar::partition] Partition readgraph', file=logstream)
    for part in graph.partitions(dedup, minabund, maxabund, abundfilt=True):
        readnames = list(part)
        yield [graph.get_record(readname) for readname in readnames]
    part_secs = clock.stop('partition')
    print('[kevlar::partition]',
          'Partitioning done in {:.2f} sec'.format(part_secs),
          file=logstream)

    runtime = clock.stop()
    report = 'Total time: {:.2f} seconds'.format(runtime)
    print('[kevlar::partition]', report, file=logstream)
Exemple #9
0
def main(args):
    """Command-line driver: optional mask load, input load, then validation.

    Runs three timed stages and reports each to `args.logfile`: loading the
    mask (only when `args.mask` is truthy), loading the augmented input with
    `load_input`, and `validate_and_print`, which receives the output
    stream(s) opened here.
    """
    clock = kevlar.Timer()
    clock.start()

    # Stage 1 (optional): load the mask.
    mask = None
    if args.mask:
        clock.start('loadmask')
        print('[kevlar::filter] Loading mask from',
              args.mask,
              file=args.logfile)
        mask = load_mask(
            args.mask, args.ksize, args.mask_memory,
            maxfpr=args.mask_max_fpr,
            savefile=args.save_mask,
            logfile=args.logfile,
        )
        mask_secs = clock.stop('loadmask')
        print('[kevlar::filter]',
              'Mask loaded in {:.2f} sec'.format(mask_secs),
              file=args.logfile)

    # Stage 2: load the reads.
    clock.start('recalc')
    print('[kevlar::filter] Loading input; recalculate k-mer abundances,',
          'de-duplicate reads and merge k-mers',
          file=args.logfile)
    readset, countgraph = load_input(
        args.augfastq, args.ksize, args.abund_memory, args.abund_max_fpr,
        args.logfile,
    )
    load_secs = clock.stop('recalc')
    print('[kevlar::filter] Input loaded in {:.2f} sec'.format(load_secs),
          file=args.logfile)

    # Stage 3: validate and write output.
    clock.start('validate')
    print('[kevlar::filter] Validate k-mers and print reads',
          file=args.logfile)
    outstream = kevlar.open(args.out, 'w')
    augstream = None
    if args.aug_out:
        augstream = kevlar.open(args.aug_out, 'w')
    validate_and_print(
        readset, countgraph, mask, args.min_abund, outstream, augstream,
        args.logfile,
    )
    validate_secs = clock.stop('validate')
    print('[kevlar::filter] k-mers validated and reads printed',
          'in {:.2f} sec'.format(validate_secs),
          file=args.logfile)

    runtime = clock.stop()
    report = 'Total time: {:.2f} seconds'.format(runtime)
    print('[kevlar::filter]', report, file=args.logfile)
Exemple #10
0
def main(args):
    """Command-line driver: simulate a trio from a reference genome.

    Loads the genome from `args.genome`, opens one Fasta output per sample
    (`<prefix>-proband.fasta`, `<prefix>-mother.fasta`,
    `<prefix>-father.fasta`), and iterates the `gentrio` generator. Each
    yielded variant is written to `args.vcf` when that option is set.
    Progress messages are written to stderr.

    All output streams opened here, including the VCF stream, are closed
    even if mutation fails part-way.
    """
    timer = kevlar.Timer()
    timer.start()

    timer.start('loadgenome')
    print('[kevlar::gentrio] Loading genome...', end='', file=sys.stderr)
    seqfile = kevlar.open(args.genome, 'r')
    genomeseqs = kevlar.seqio.parse_seq_dict(seqfile)
    elapsed = timer.stop('loadgenome')
    print('done! ({:.3f} seconds elapsed)'.format(elapsed), file=sys.stderr)

    samples = ('proband', 'mother', 'father')
    outfiles = ['{:s}-{:s}.fasta'.format(args.prefix, s) for s in samples]
    outstreams = [kevlar.open(outfile, 'w') for outfile in outfiles]

    vcfout = None
    try:
        if args.vcf:
            vcfout = kevlar.open(args.vcf, 'w')
            kevlar.vcf_header(vcfout, source='kevlar::gentrio',
                              infoheader=True)
        weights = weights_str_to_dict(args.weights)
        mutator = gentrio(genomeseqs,
                          outstreams,
                          ninh=args.inherited,
                          ndenovo=args.de_novo,
                          weights=weights,
                          seed=args.seed,
                          logstream=args.logfile)

        timer.start('mutate')
        print('[kevlar::gentrio] Begin generating and applying mutations:',
              file=sys.stderr)
        for variant in mutator:
            if vcfout:
                print(variant.vcf, file=vcfout)
        elapsed = timer.stop('mutate')
        print('[kevlar::gentrio] Done applying mutations! ',
              end='',
              file=sys.stderr)
        print('({:.3f} seconds elapsed)'.format(elapsed), file=sys.stderr)
    finally:
        for outstream in outstreams:
            outstream.close()
        # Close the VCF stream too, so buffered variants reach disk. The
        # identity checks guard against closing a standard stream, in case
        # kevlar.open can hand one back (not visible from here).
        if (vcfout is not None and vcfout is not sys.stdout
                and vcfout is not sys.stderr):
            vcfout.close()

    elapsed = timer.stop()
    print('[kevlar::gentrio] Trio simulation complete; ', file=sys.stderr)
    print(' total runtime: {:.3f} seconds'.format(elapsed), file=sys.stderr)
Exemple #11
0
def load_mask(maskfiles,
              ksize,
              memory,
              maxfpr=0.001,
              savefile=None,
              logstream=sys.stderr):
    """Build or load the k-mer mask used for filtering.

    Returns None when `maskfiles` is None. A single file ending in `.nt` or
    `.nodetable` is loaded directly as a saved sketch; otherwise every file
    is consumed into a new `khmer.Nodetable`. The estimated false positive
    rate is reported and checked against `maxfpr`, and the mask is saved to
    `savefile` if one is given.

    Raises:
        KevlarUnsuitableFPRError: if the estimated FPR exceeds `maxfpr`.
    """
    if maskfiles is None:
        return None

    clock = kevlar.Timer()
    clock.start('loadmask')
    print('[kevlar::filter] Loading mask from', maskfiles, file=logstream)

    is_saved_table = (
        len(maskfiles) == 1
        and maskfiles[0].endswith(('.nt', '.nodetable'))
    )
    if is_saved_table:
        mask = kevlar.sketch.load(maskfiles[0])
        summary = '    nodetable loaded'
    else:
        buckets = memory * khmer._buckets_per_byte['nodegraph'] / 4
        mask = khmer.Nodetable(ksize, buckets, 4)
        seqtotal = 0
        kmertotal = 0
        for maskfile in maskfiles:
            numreads, numkmers = mask.consume_seqfile(maskfile)
            seqtotal += numreads
            kmertotal += numkmers
        summary = '    {:d} sequences and {:d} k-mers consumed'.format(
            seqtotal, kmertotal
        )

    fpr = kevlar.sketch.estimate_fpr(mask)
    summary += '; estimated false positive rate is {:1.3f}'.format(fpr)
    print(summary, file=logstream)
    if fpr > maxfpr:
        raise KevlarUnsuitableFPRError('FPR too high, bailing out!!!')

    if savefile:
        mask.save(savefile)
        print('    nodetable saved to "{:s}"'.format(savefile),
              file=logstream)

    load_secs = clock.stop('loadmask')
    print('[kevlar::filter]',
          'Mask loaded in {:.2f} sec'.format(load_secs),
          file=logstream)
    return mask
Exemple #12
0
def partition(readstream, strict=False, minabund=None, maxabund=None,
              dedup=True, gmlfile=None):
    """Group reads into numbered partitions of a read graph.

    The reads are loaded into a `kevlar.ReadGraph`, edges are populated in
    strict or relaxed mode, and each partition is yielded as a tuple
    `(n, reads)`, where `n` counts partitions from 1. Every read in a
    partition has ` kvcc=<n>` appended to its name. If `gmlfile` is given
    the graph is also exported with `kevlar.to_gml`. Stage timings are
    logged via `kevlar.plog`.
    """
    timer = kevlar.Timer()
    timer.start()

    timer.start('loadreads')
    kevlar.plog('[kevlar::partition] Loading reads')

    graph = kevlar.ReadGraph()
    graph.load(readstream, minabund=minabund, maxabund=maxabund)
    elapsed = timer.stop('loadreads')
    message = 'Reads loaded in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    timer.start('buildgraph')
    mode = 'strict' if strict else 'relaxed'
    message = 'Building read graph in {:s} mode'.format(mode)
    kevlar.plog('[kevlar::partition]', message)
    graph.populate_edges(strict=strict)
    elapsed = timer.stop('buildgraph')
    message = 'Graph built in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    if gmlfile:  # pragma: no cover
        # This function has no `logstream` parameter, so the name that used
        # to be passed here was undefined. Pass stderr explicitly to keep
        # the three-argument call.
        import sys
        kevlar.to_gml(graph, gmlfile, sys.stderr)

    timer.start('partition')
    kevlar.plog('[kevlar::partition] Partition readgraph')
    part_iter = graph.partitions(dedup, minabund, maxabund, abundfilt=True)
    for n, part in enumerate(part_iter, 1):
        reads = [graph.get_record(readname) for readname in list(part)]
        for read in reads:
            # Tag each read with the number of its partition.
            read.name += ' kvcc={:d}'.format(n)
        yield n, reads
    elapsed = timer.stop('partition')
    message = 'Partitioning done in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    kevlar.plog('[kevlar::partition]', message)
Exemple #13
0
def main(args):
    """Command-line driver: load several samples and save one sketch each.

    `load_samples` is iterated in step with `args.outfiles`, and each sketch
    it produces is saved to the matching output file. Total runtime is
    reported to `args.logfile`.

    Raises:
        ValueError: if exactly one of `num_bands` / `band` is None, or if
            the number of outfiles differs from the number of samples.
    """
    if (args.num_bands is None) is not (args.band is None):
        raise ValueError('Must specify --num-bands and --band together')
    # A falsy band means "no band"; otherwise shift the value down by one.
    myband = args.band - 1 if args.band else None
    if len(args.outfiles) != len(args.sample):
        message = 'number of outfiles must match number of declared samples'
        raise ValueError(message)

    timer = kevlar.Timer()
    timer.start()

    # Pass the adjusted band: the value computed above was previously
    # discarded and the raw command-line value forwarded instead.
    loader = load_samples(
        args.sample, args.ksize, args.memory, memfraction=args.memfrac,
        maxfpr=args.max_fpr, maxabund=args.max_abund, numbands=args.num_bands,
        band=myband, numthreads=args.threads, logfile=args.logfile
    )
    for sketch, outfile in zip(loader, args.outfiles):
        sketch.save(outfile)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    print('[kevlar::effcount]', message, file=args.logfile)
Exemple #14
0
def novel(casestream,
          casecounts,
          controlcounts,
          ksize=31,
          abundscreen=None,
          casemin=5,
          ctrlmax=0,
          numbands=None,
          band=None,
          skipuntil=None,
          updateint=10000,
          logstream=sys.stderr):
    """Yield case reads annotated with their "interesting" k-mers.

    Reads (with optional mates) come from `kevlar.paired_reader`. Reads
    shorter than `ksize` or containing characters other than A/C/G/T are
    skipped. Each remaining k-mer is classified by `kmer_is_interesting`:
    a "discard" verdict drops the whole read, and interesting k-mers are
    annotated on a copy of the record. Only reads with at least one
    annotation are yielded. A summary is printed to `logstream` at the end.

    Banding: when `numbands` is set, only k-mers whose hash satisfies
    `hash & (numbands - 1) == band` are examined. `band` is 0-based, in the
    range 0..numbands-1, as the error message below states.
    NOTE(review): the bit mask presumably assumes `numbands` is a power of
    two - confirm.

    If `skipuntil` is set, reads are skipped up to and including the read
    with that name.

    Raises:
        ValueError: if only one of `numbands` / `band` is given, or if
            `band` lies outside 0..numbands-1.
    """
    numbands_unset = not numbands
    band_unset = not band and band != 0
    if numbands_unset is not band_unset:
        raise ValueError('Must specify `numbands` and `band` together')

    # Enforce both ends of the documented range, not just the lower one.
    if not band_unset and not 0 <= band < numbands:
        maxband = numbands - 1
        message = '`band` must be a value between 0 and {:d}'.format(maxband)
        message += ' (`numbands` - 1), inclusive'
        raise ValueError(message)

    timer = kevlar.Timer()
    timer.start()
    nkmers = 0
    nreads = 0
    nextupdate = updateint
    unique_kmers = set()
    for n, record, mate in kevlar.paired_reader(casestream):
        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                message = 'Found read {:s}'.format(skipuntil)
                message += ' (skipped {:d} reads)'.format(n)
                print('[kevlar::novel]', message, file=logstream)
                skipuntil = False
            continue
        if n >= nextupdate:
            nextupdate += updateint
            elapsed = timer.probe()
            msg = '    processed {} reads'.format(n)
            msg += ' in {:.2f} seconds...'.format(elapsed)
            print(msg, file=logstream)
        if len(record.sequence) < ksize:
            continue
        if re.search('[^ACGT]', record.sequence):
            # This check should be temporary; hopefully khmer will handle
            # this soon.
            continue

        discard_read = False
        irecord = None
        for i, kmer in enumerate(casecounts[0].get_kmers(record.sequence)):
            if numbands:
                khash = casecounts[0].hash(kmer)
                # `band` is already 0-based; subtracting one here made band
                # 0 match nothing and the highest band unreachable.
                if khash & (numbands - 1) != band:
                    continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer,
                casecounts,
                controlcounts,
                case_min=casemin,
                ctrl_max=ctrlmax,
                screen_thresh=abundscreen,
            )
            if discard:
                discard_read = True
                break
            if not interesting:
                continue
            if irecord is None:
                # Copy lazily, only once the read has something to annotate.
                irecord = kevlar.sequence.copy_record(record)
            abund = tuple(caseabund + ctrlabund)
            irecord.annotate(kmer, i, abund)
            minkmer = kevlar.revcommin(kmer)
            unique_kmers.add(minkmer)

        if discard_read or irecord is None:
            continue

        nreads += 1
        nkmers += len(irecord.annotations)
        if mate:
            irecord.add_mate(mate.sequence)
        yield irecord

    elapsed = timer.stop()
    message = 'Found {:d} instances'.format(nkmers)
    message += ' of {:d} unique novel kmers'.format(len(unique_kmers))
    message += ' in {:d} reads'.format(nreads)
    message += ' in {:.2f} seconds'.format(elapsed)
    print('[kevlar::novel]', message, file=logstream)
Exemple #15
0
def novel(casestream,
          casecounts,
          controlcounts,
          ksize=31,
          abundscreen=None,
          casemin=5,
          ctrlmax=0,
          numbands=None,
          band=None,
          skipuntil=None):
    """Yield case reads annotated with their "interesting" k-mers.

    Reads shorter than `ksize` or containing characters other than A/C/G/T
    are skipped. Each remaining k-mer is classified by
    `kmer_is_interesting`: a "discard" verdict drops the whole read, and
    interesting k-mers are annotated on a copy of the record. Only reads
    with at least one annotation are yielded. Progress is reported through
    a `kevlar.ProgressIndicator` and a summary is logged via `kevlar.plog`.

    Banding: when `numbands` is set, only k-mers whose hash satisfies
    `hash & (numbands - 1) == band` are examined. `band` is 0-based, in the
    range 0..numbands-1, as the error message below states.
    NOTE(review): the bit mask presumably assumes `numbands` is a power of
    two - confirm.

    If `skipuntil` is set, reads are skipped up to and including the read
    with that name.

    Raises:
        ValueError: if only one of `numbands` / `band` is given, or if
            `band` lies outside 0..numbands-1.
    """
    numbands_unset = not numbands
    band_unset = not band and band != 0
    if numbands_unset is not band_unset:
        raise ValueError('Must specify `numbands` and `band` together')

    # Enforce both ends of the documented range, not just the lower one.
    if not band_unset and not 0 <= band < numbands:
        maxband = numbands - 1
        message = '`band` must be a value between 0 and {:d}'.format(maxband)
        message += ' (`numbands` - 1), inclusive'
        raise ValueError(message)

    timer = kevlar.Timer()
    timer.start()
    nkmers = 0
    nreads = 0
    update_message = '[kevlar::novel]     processed {counter} reads'
    skip_message = None
    if skipuntil:
        msg = '; skipping reads in search of {read}'.format(read=skipuntil)
        skip_message = update_message + msg
    first_message = skip_message if skipuntil else update_message
    progress_indicator = kevlar.ProgressIndicator(
        first_message,
        interval=1e6,
        breaks=[1e7, 1e8, 1e9],
        usetimer=True,
    )
    unique_kmers = set()
    for n, record in enumerate(casestream, 1):
        progress_indicator.update()
        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                message = 'Found read {:s}'.format(skipuntil)
                message += ' (skipped {:d} reads)'.format(n)
                kevlar.plog('[kevlar::novel]', message)
                skipuntil = False
                # Switch back to the plain progress message once found.
                progress_indicator.message = update_message
            continue

        if len(record.sequence) < ksize:
            continue
        if re.search('[^ACGT]', record.sequence):
            # This check should be temporary; hopefully khmer will handle
            # this soon.
            continue

        discard_read = False
        irecord = None
        for i, kmer in enumerate(casecounts[0].get_kmers(record.sequence)):
            if numbands:
                khash = casecounts[0].hash(kmer)
                # `band` is already 0-based; subtracting one here made band
                # 0 match nothing and the highest band unreachable.
                if khash & (numbands - 1) != band:
                    continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer,
                casecounts,
                controlcounts,
                case_min=casemin,
                ctrl_max=ctrlmax,
                screen_thresh=abundscreen,
            )
            if discard:
                discard_read = True
                break
            if not interesting:
                continue
            if irecord is None:
                # Copy lazily, only once the read has something to annotate.
                irecord = kevlar.sequence.copy_record(record)
            abund = tuple(caseabund + ctrlabund)
            irecord.annotate(kmer, i, abund)
            minkmer = kevlar.revcommin(kmer)
            unique_kmers.add(minkmer)

        if discard_read or irecord is None:
            continue

        nreads += 1
        nkmers += len(irecord.annotations)
        yield irecord

    elapsed = timer.stop()
    message = 'Found {:d} instances'.format(nkmers)
    message += ' of {:d} unique novel kmers'.format(len(unique_kmers))
    message += ' in {:d} reads'.format(nreads)
    message += ' in {:.2f} seconds'.format(elapsed)
    kevlar.plog('[kevlar::novel]', message)