Example #1
def main():
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
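        # do abundance lookups on an 'N'->'A' copy; khmer k-mers cannot contain 'N'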
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Example #2
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

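            # trim at the first k-mer ABOVE the cutoff, removing
            # high-abundance (likely repetitive) regions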
            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                return name, trim_seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

        tsp.start(verbose_fasta_iter(infile), outfp)
Example #3
def main():
    info("filter-abund-single.py", ["counting"])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print "consuming input, round 1 --", args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

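    # wait for all loader threads to finish before trimming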
    for _ in threads:
        _.join()

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print "filtering", args.datafile
    outfile = os.path.basename(args.datafile) + ".abundfilt"
    outfp = open(outfile, "w")

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print "output in", outfile

    if args.savetable:
        print "Saving k-mer counting table filename", args.savetable
        print "...saving to", args.savetable
        htable.save(args.savetable)
Example #4
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]
        
    print 'file with ht: %s' % counting_ht
    print '-- settings:'
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    for infile in infiles:
        print 'filtering', infile
        outfile = infile + '.abundfilt'

        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

            trim_seq, trim_at = ht.trim_on_abundance(seq, 2)

            if trim_at >= K:
                return name, trim_seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

        tsp.start(verbose_fasta_iter(infile), outfp)
Example #5
def main():
    htfile = sys.argv[1]
    infiles = sys.argv[2:]

    print 'loading hashbits'
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)

        if trim_at >= ht.ksize():
            return name, trim_seq

        return None, None

    for filename in infiles:
        outpath = os.path.basename(filename) + '.sodd'
        outfp = open(outpath, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(filename), outfp)
Example #6
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage."
    )
    parser.add_argument(
        "--max-error-region",
        "-M",
        dest="max_error_region",
        default=DEFAULT_MAX_ERROR_REGION,
        type=int,
        help="Max length of error region allowed",
    )
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record["name"]
        seq = record["sequence"]

        seq = seq.replace("N", "A")

        grXreAlign, reXgrAlign = aligner.align(seq)

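        # a non-empty read-vs-graph alignment means the aligner found a path;
        # use the gap-stripped graph sequence as the corrected read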
        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace("-", "")
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".corr"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Example #7
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="cutoff", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance."
    )

    parser.add_argument("-V", "--variable-coverage", action="store_true", dest="variable_coverage", default=False)
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="base variable-coverage cutoff on this median k-mer abundance",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".abundfilt"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
def test_basic_fastq_like():
    tsp = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)

    inseqs = [screed.Record(name='a', sequence='AAA', quality='###'),
              screed.Record(name='b', sequence='TTT', quality='###'), ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_fastq(outfp)
    for i in x:
        assert i['quality'] == '###'
Example #9
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region', '-M', dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION, type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #10
def test_odd():
    tsp = ThreadedSequenceProcessor(every_other, 1, 1, verbose=False)

    inseqs = [dict(name='a', sequence='AAA'),
              dict(name='b', sequence='TTT'), ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_d(outfp)
    assert len(x) == 1, x
    assert x['b'] == 'TTT'
Example #11
def test_basic_fastq_like():
    tsp = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)

    inseqs = [dict(name='a', sequence='AAA', accuracy='###'),
              dict(name='b', sequence='TTT', accuracy='###'), ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_fastq(outfp)
    for i in x:
        assert i['accuracy'] == '###'
Example #12
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_table, args.force)
    infiles = args.input_filename
    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading counting table:', args.input_table,
          file=sys.stderr)
    htable = khmer.load_counting_hash(args.input_table)
    ksize = htable.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Example #13
def main():
    repfile = sys.argv[1]
    infile = sys.argv[1]
    if len(sys.argv) >= 3:
        infile = sys.argv[2]

    outfile = os.path.basename(infile) + '.loess'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    print 'file with representative artifacts: %s' % repfile
    print 'input file to degree filter: %s' % infile
    print 'filtering to output:', outfile
    print '-- settings:'
    print 'K', K
    print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
    print 'N HASHTABLES %d' % N_HT
    print 'N THREADS', WORKER_THREADS
    print 'RADIUS', RADIUS
    print 'MAX DENSITY', MAX_VOLUME / RADIUS
    print '--'

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    outfp = open(outfile, 'w')

    print 'eating', repfile
    ht.consume_fasta(repfile)

    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_density_explosion(seq, RADIUS,
                                                         MAX_VOLUME)

#        if trim_at >= K:
#            return name, trim_seq

        if trim_at == len(seq):
            return name, seq

        return None, None

    tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

    ###

    tsp.start(verbose_fasta_iter(infile), outfp)
Example #14
def test_basic_2thread():
    tsp = ThreadedSequenceProcessor(idem, 2, 1, verbose=False)

    inseqs = [dict(name='a', sequence='AAA'),
              dict(name='b', sequence='TTT'), ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_d(outfp)
    assert len(x) == 2, x
    assert x['a'] == 'AAA'
    assert x['b'] == 'TTT'
Example #15
def test_basic():
    tsp = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)

    inseqs = [screed.Record(name='a', sequence='AAA'),
              screed.Record(name='b', sequence='TTT'), ]
    outfp = StringIO()

    tsp.start(inseqs, outfp)

    x = load_records_d(outfp)
    assert len(x) == 2, x
    assert x['a'] == 'AAA'
    assert x['b'] == 'TTT'
Example #16
def main():
    repfile = sys.argv[1]
    infile = sys.argv[1]
    if len(sys.argv) >= 3:
        infile = sys.argv[2]

    outfile = os.path.basename(infile) + ".loess"
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    print "file with representative artifacts: %s" % repfile
    print "input file to degree filter: %s" % infile
    print "filtering to output:", outfile
    print "-- settings:"
    print "K", K
    print "HASHTABLE SIZE %g" % HASHTABLE_SIZE
    print "N HASHTABLES %d" % N_HT
    print "N THREADS", WORKER_THREADS
    print "RADIUS", RADIUS
    print "MAX DENSITY", MAX_VOLUME / RADIUS
    print "--"

    print "making hashtable"
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    outfp = open(outfile, "w")

    print "eating", repfile
    ht.consume_fasta(repfile)

    def process_fn(record, ht=ht):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME)

        #        if trim_at >= K:
        #            return name, trim_seq

        if trim_at == len(seq):
            return name, seq

        return None, None

    tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

    ###

    tsp.start(verbose_fasta_iter(infile), outfp)
Example #17
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print >>sys.stderr, "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
Example #18
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-o', '--outputpath', dest='outputpath', default='.')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames
    outpath = args.outputpath

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None


    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.join(outpath, os.path.basename(infile) + '.abundfilt')
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #19
def main():
    stoptags = sys.argv[1]
    infile = sys.argv[2]

    outfile = os.path.basename(infile) + '.stopkeep'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    print 'file with stop tags: %s' % stoptags
    print 'input file to filter: %s' % infile
    print 'filtering to output:', outfile
    print '-- settings:'
    print 'K', K
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.new_hashbits(K, 1, 1)

    ht.load_stop_tags(stoptags)

    outfp = open(outfile, 'w')

    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_stoptags(seq)

        if trim_at < K:
            return name, seq

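        # invert the usual trim: keep the portion from the stop-tag trim
        # point onward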
        seq = seq[trim_at:]
        if seq:
            return name, seq

        return None, None

    tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

    ###

    tsp.start(verbose_fasta_iter(infile), outfp)
Example #20
def main():
    repfile = sys.argv[1]
    infile = sys.argv[2]

    outfile = os.path.basename(infile) + '.fno255'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    print 'file to count from: %s' % repfile
    print 'input file to filter: %s' % infile
    print 'filtering to output:', outfile
    print '-- settings:'
    print 'K', K
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    print 'consuming input', repfile
    ht.consume_fasta(repfile)

    outfp = open(outfile, 'w')

    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if len(seq) < K:
            return None, None

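        # 8-bit counters saturate at 255; drop reads containing any
        # saturated k-mer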
        if ht.get_max_count(seq) >= 255:
            return None, None

        return name, seq

    tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

    ###

    tsp.start(verbose_fastq_iter(infile), outfp)
Example #21
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-k', default=DEFAULT_K, type=int, help='k-mer size',
                        dest='ksize')
    parser.add_argument('stoptags_file')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    K = args.ksize

    stoptags = args.stoptags_file
    infiles = args.input_filenames

    print 'loading stop tags, with K', K
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_stoptags(seq)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        # note: os.path.dirname() is empty for bare filenames, so build
        # the output path from infile directly
        outfile = infile + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #22
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at reads above this median abundance.")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _, _ = ht.get_median_count(seq)

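        # keep only reads whose median k-mer abundance reaches the cutoff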
        if med >= args.cutoff:
            return name, seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.himed'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #23
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print 'file with ht: %s' % counting_ht
    print '-- settings:'
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.ham1filt'

        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

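            # trim just before the first k-mer whose hamming-distance-1
            # neighborhood count exceeds 2000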
            for pos in range(len(seq) - K):
                kmer = seq[pos:pos + K]
                if ht.max_hamming1_count(kmer) > 2000:
                    trim_at = pos + K - 1
                    seq = seq[:trim_at]
                    break

            if len(seq) >= K:
                return name, seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

        tsp.start(verbose_fasta_iter(infile), outfp)
Example #24
def main():
    info("filter-stoptags.py", ["graph"])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print "loading stop tags, with K", args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

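        # cut the read at the first k-mer marked with a stop tag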
        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".stopfilt"

        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Example #25
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
Example #26
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

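        # keep with probability ~ coverage/median (always, when median <=
        # coverage): a randomized diginorm-style downsampling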
        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #27
def main():
    parser = build_counting_multifile_args()
    parser.add_argument("--coverage", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".medfilt"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Example #28
def main():
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record.name
        seq = record.sequence
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Example #29
def main():
    print '-- settings:'
    print 'K', K
    print 'N THREADS', WORKER_THREADS
    print '--'

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    for filename in sys.argv[1:]:
        print 'consuming input', filename
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if len(seq) < K:
            return None, None

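        # require every k-mer in the read to have been seen at least twice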
        if ht.get_min_count(seq) < 2:
            return None, None

        return name, seq

    for filename in sys.argv[1:]:
        print '***', filename
        outfile = os.path.basename(filename) + '.f2'
        if os.path.exists(outfile):
            print 'SKIPPING', outfile, ' -- already exists'
            continue

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(filename), outfp)
Example #30
def main():
    print "-- settings:"
    print "K", K
    print "N THREADS", WORKER_THREADS
    print "--"

    print "making hashtable"
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    for filename in sys.argv[1:]:
        print "consuming input", filename
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        if len(seq) < K:
            return None, None

        if ht.get_min_count(seq) < 2:
            return None, None

        return name, seq

    for filename in sys.argv[1:]:
        print "***", filename
        outfile = os.path.basename(filename) + ".f2"
        if os.path.exists(outfile):
            print "SKIPPING", outfile, " -- already exists"
            continue

        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)
        tsp.start(verbose_fasta_iter(filename), outfp)
Example #31
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")

    parser.add_argument('-V',
                        '--variable-coverage',
                        action='store_true',
                        dest='variable_coverage',
                        default=False)
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base variable-coverage cutoff on this median'
                        ' k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.abundfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Example #32
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)
    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()),
              file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename',
              args.savetable,
              file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)
    print('wrote to:', outfile, file=sys.stderr)
Example #33
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads,
                                        verbose=not args.quiet)
        tsp.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
Example #34
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Example #35
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_file_status(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to:', outfile
Example #36
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = graph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)