Esempio n. 1
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region',
                        '-M',
                        dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION,
                        type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Esempio n. 2
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage."
    )
    parser.add_argument(
        "--max-error-region",
        "-M",
        dest="max_error_region",
        default=DEFAULT_MAX_ERROR_REGION,
        type=int,
        help="Max length of error region allowed",
    )
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record["name"]
        seq = record["sequence"]

        seq = seq.replace("N", "A")

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace("-", "")
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".corr"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Esempio n. 3
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="cutoff", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance."
    )

    parser.add_argument("-V", "--variable-coverage", action="store_true", dest="variable_coverage", default=False)
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="base variable-coverage cutoff on this median k-mer abundance",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".abundfilt"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Esempio n. 4
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    xxxfp = None

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)
        pct = dev / avg * 100

        xxxfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        if random.randint(1, med) > args.coverage or pct > 100:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w')
        outfile = os.path.basename(infile) + '.medpctfilt'
        outfp = open(outfile, 'w')

        for n, record in enumerate(screed.open(infile)):
            if n % 100000 == 0:
                print '...', n

            name, seq = process_fn(record)
            if name and seq:
                print >> outfp, '>%s\n%s' % (name, seq)

        print 'output in', outfile
Esempio n. 5
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region', '-M', dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION, type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    xxxfp = None

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)
        pct = dev / avg * 100

        xxxfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        if random.randint(1, med) > args.coverage or pct > 100:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
       print 'filtering', infile
       xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w')
       outfile = os.path.basename(infile) + '.medpctfilt'
       outfp = open(outfile, 'w')

       for n, record in enumerate(screed.open(infile)):
           if n % 100000 == 0:
               print '...', n

           name, seq = process_fn(record)
           if name and seq:
               print >>outfp, '>%s\n%s' % (name, seq)

       print 'output in', outfile
Esempio n. 7
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-o', '--outputpath', dest='outputpath', default='.')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames
    outpath = args.outputpath

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = outpath + '/' + os.path.basename(infile) + '.abundfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Esempio n. 8
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-o', '--outputpath', dest='outputpath', default='.')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames
    outpath = args.outputpath    

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None


    ### the filtering loop
    for infile in infiles:
       print 'filtering', infile
       outfile = outpath + '/' + os.path.basename(infile) + '.abundfilt'
       outfp = open(outfile, 'w')

       tsp = ThreadedSequenceProcessor(process_fn)
       tsp.start(verbose_loader(infile), outfp)

       print 'output in', outfile
Esempio n. 9
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at reads above this median abundance.")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _, _ = ht.get_median_count(seq)

        if med >= args.cutoff:
            return name, seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.himed'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Esempio n. 10
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Esempio n. 11
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at reads above this median abundance.")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _, _ = ht.get_median_count(seq)

        if med >= args.cutoff:
            return name, seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.himed'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Esempio n. 12
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Esempio n. 13
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument("--coverage", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".medfilt"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Esempio n. 14
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")

    parser.add_argument('-V',
                        '--variable-coverage',
                        action='store_true',
                        dest='variable_coverage',
                        default=False)
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base variable-coverage cutoff on this median'
                        ' k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.abundfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile