Example #1
0
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):      # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >>sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension
                                                            ), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g,
                                ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)

                    if hasattr(record, 'accuracy'):
                        outfp.write('@{name}\n{seq}+{accuracy}\n'.format(
                            name=record.name,
                            seq=record.sequence,
                            accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name,
                            seq=record.sequence))

            except IOError as e:
                print >>sys.stderr, '!! ERROR !!', e
                print >>sys.stderr, '...error splitting input. exiting...'

    except IOError as e:
        print >>sys.stderr, '!! ERROR: !!', e
        print >>sys.stderr, '...error consuming \
                            {i}. exiting...'.format(i=input_fastp)

    print >>sys.stderr, 'done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...'.format(t=ht.n_tags(), l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, '!! ERROR: !!', error
            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                        ** {sec}s ({sect}s total)' \
                                        .format(n=_, nc=n_labeled,
                                                no=n_orphaned,
                                                sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    if hasattr(record, 'accuracy'):
                        seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >>sys.stderr, '! WARNING: Sweep finished with errors !'
        print >>sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >>sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >>sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >>sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >>sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
Example #2
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        if args.single_output_filename != '':
            output_name = args.single_output_filename
            outfp = open(args.single_output_filename, 'a')
        else:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           outfp, htable, args,
                                                           report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'

                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                      .format(inp=input_filename, kept=total - discarded,
                              total=total, perc=int(100. - discarded /
                                                    float(total) * 100.))
                print 'output in', output_name

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        if not args.force:
            sys.exit(1)
Example #3
0
def main():
    info("sweep-reads-buffered.py", ["sweep"])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype="hashbits")

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)
    # Check disk space availability
    check_space(all_input_files)

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, "consuming input sequences..."
        if args.label_by_pid:
            print >>sys.stderr, "...labeling by partition id (pid)"
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, "...labeling by sequence"
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, "...consumed {n} sequences...".format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, "...labeling to create groups of size {s}".format(s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb")
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open("{pref}_base_{g}.fa".format(pref=output_pref, g=g), "wb")
                    if n % 50000 == 0:
                        print >>sys.stderr, "...consumed {n} sequences...".format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence, label)
                    outfp.write(">{name}\n{seq}\n".format(name=record.name, seq=record.sequence))

            except IOError as e:
                print >>sys.stderr, "!! ERROR !!", e
                print >>sys.stderr, "...error splitting input. exiting..."

    except IOError as e:
        print >>sys.stderr, "!! ERROR: !!", e
        print >>sys.stderr, "...error consuming \
                            {i}. exiting...".format(
            i=input_fastp
        )

    print >>sys.stderr, "done consuming input sequence. \
                        added {t} tags and {l} \
                        labels...".format(
        t=ht.n_tags(), l=ht.n_labels()
    )

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, "** sweeping {read_file} for labels...".format(read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, "!! ERROR: !!", error
            print >>sys.stderr, "*** Could not open {fn}, skipping...".format(fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, "\tswept {n} reads [{nc} labeled, \
                                         {no} orphaned] \
                                        ** {sec}s ({sect}s total)".format(
                        n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t
                    )
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, "multi")
                            n_mlabeled += 1
                            label_dict["multi"] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, "orphaned")
                        label_dict["orphaned"] += 1
            print >>sys.stderr, "** End of file {fn}...".format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, "** End of run..."
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >>sys.stderr, "! WARNING: Sweep finished with errors !"
        print >>sys.stderr, "** {writee} reads not written".format(writee=output_buffer.num_write_errors)
        print >>sys.stderr, "** {filee} errors opening files".format(filee=output_buffer.num_file_errors)

    print >>sys.stderr, "swept {n_reads} for labels...".format(n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, "...with {nc} labeled and {no} orphaned".format(nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, "...and {nmc} multilabeled".format(nmc=n_mlabeled)

    print >>sys.stderr, "** outputting label number distribution..."
    fn = os.path.join(outdir, "{pref}.dist.txt".format(pref=output_pref))
    with open(fn, "wb") as outfp:
        for nc in label_number_dist:
            outfp.write("{nc}\n".format(nc=nc))

    fn = os.path.join(outdir, "{pref}.counts.csv".format(pref=output_pref))
    print >>sys.stderr, "** outputting label read counts..."
    with open(fn, "wb") as outfp:
        for k in label_dict:
            outfp.write("{l},{c}\n".format(l=k, c=label_dict[k]))