コード例 #1
0
    def test_assemble_left_double_fork(self, left_double_fork_structure):
        """Assemble a left double fork, seeding from the end of the contig.

        Without labels a single path is expected, equal to the contig from
        the HDN position onward. After labeling the contig and the branch
        across their high degree nodes, both full sequences are expected.
        """
        graph, contig, _L, HDN, _R, branch = left_double_fork_structure
        labeller = khmer.GraphLabels(graph)
        assembler = khmer.SimpleLabeledAssembler(labeller)
        seed = contig[-K:]

        # Unlabeled pass: one path, from the HDN through the end.
        unlabeled = assembler.assemble(seed)
        assert len(unlabeled) == 1
        assert unlabeled[0] == contig[HDN.pos:]

        # Label contig (1) and branch (2) across every high degree node.
        junctions = graph.find_high_degree_nodes(contig)
        junctions += graph.find_high_degree_nodes(branch)
        print(list(junctions))
        for label, sequence in enumerate((contig, branch), start=1):
            labeller.label_across_high_degree_nodes(sequence, junctions, label)
        print(labeller.get_tag_labels(list(junctions)[0]))

        # Labeled pass: both full-length sequences must come back.
        labeled = assembler.assemble(seed)
        assert len(labeled) == 2

        for expected in (contig, branch):
            assert any(utils._equals_rc(found, expected) for found in labeled)
コード例 #2
0
    def test_hash_as_seed(self, linear_structure):
        """Seed the assembler with a k-mer hash instead of a k-mer string."""
        graph, contig = linear_structure
        assembler = khmer.SimpleLabeledAssembler(khmer.GraphLabels(graph))

        # Hash of the first K bases of the contig is used as the seed.
        seed_hash = graph.hash(contig[:K])
        assembled = assembler.assemble(seed_hash)
        assert utils._equals_rc(assembled.pop(), contig)
コード例 #3
0
    def test_assemble_tandem_repeats(self, tandem_repeat_structure):
        """Assemble a single copy of a tandem repeat from its first k-mer."""
        graph, repeat, _tandem_repeats = tandem_repeat_structure
        assembler = khmer.SimpleLabeledAssembler(khmer.GraphLabels(graph))

        assembled = assembler.assemble(repeat[:K])
        assert len(assembled) == 1

        # K-1 k-mers span the junction between the end of the repeat and
        # its beginning, so the path is that much longer than one copy.
        expected_length = len(repeat) + K - 1
        assert len(assembled[0]) == expected_length
コード例 #4
0
    def test_assemble_snp_bubble_single(self, snp_bubble_structure):
        """Assemble the wildtype side of a SNP bubble when only it is labeled."""
        graph, wildtype, _mutant, _HDN_L, _HDN_R = snp_bubble_structure
        labeller = khmer.GraphLabels(graph)
        assembler = khmer.SimpleLabeledAssembler(labeller)

        # The wildtype is expected to cross exactly two high degree nodes.
        junctions = graph.find_high_degree_nodes(wildtype)
        assert len(junctions) == 2
        labeller.label_across_high_degree_nodes(wildtype, junctions, 1)

        assembled = assembler.assemble(wildtype[:K])

        assert len(assembled) == 1
        assert utils._equals_rc(assembled[0], wildtype)
コード例 #5
0
    def test_beginning_to_end_across_tip(self, right_tip_structure):
        """Assemble the whole contig past a tip, guided by a single label."""
        graph, contig, _L, _HDN, _R, _tip = right_tip_structure
        labeller = khmer.GraphLabels(graph)
        assembler = khmer.SimpleLabeledAssembler(labeller)

        # Label the contig across its high degree nodes with label 1
        # (L, HDN, and R will be labeled with 1).
        junctions = graph.find_high_degree_nodes(contig)
        labeller.label_across_high_degree_nodes(contig, junctions, 1)

        assembled = assembler.assemble(contig[:K])
        assert len(assembled) == 1, "there should only be one path"

        only_path = assembled[0]  # @CTB
        assert len(only_path) == len(contig)
        assert utils._equals_rc(only_path, contig)
コード例 #6
0
    def test_assemble_snp_bubble_both(self, snp_bubble_structure):
        """Assemble both sides of a SNP bubble when both are labeled."""
        graph, wildtype, mutant, _HDN_L, _HDN_R = snp_bubble_structure
        labeller = khmer.GraphLabels(graph)
        assembler = khmer.SimpleLabeledAssembler(labeller)

        # Collect the high degree nodes seen along either allele.
        junctions = graph.find_high_degree_nodes(wildtype)
        junctions += graph.find_high_degree_nodes(mutant)
        assert len(junctions) == 2

        # Wildtype gets label 1, mutant gets label 2.
        for label, allele in enumerate((wildtype, mutant), start=1):
            labeller.label_across_high_degree_nodes(allele, junctions, label)

        assembled = assembler.assemble(wildtype[:K])
        assert len(assembled) == 2

        for allele in (wildtype, mutant):
            assert any(utils._contains_rc(allele, found)
                       for found in assembled)
コード例 #7
0
    def test_assemble_right_double_fork(self, right_double_fork_structure):
        """Assemble both arms of a right double fork using two labels."""
        graph, contig, _L, _HDN, _R, branch = right_double_fork_structure
        labeller = khmer.GraphLabels(graph)
        assembler = khmer.SimpleLabeledAssembler(labeller)

        # Label contig (1) and branch (2) across every high degree node.
        junctions = graph.find_high_degree_nodes(contig)
        junctions += graph.find_high_degree_nodes(branch)
        print(list(junctions))
        for label, sequence in enumerate((contig, branch), start=1):
            labeller.label_across_high_degree_nodes(sequence, junctions, label)
        print(labeller.get_tag_labels(list(junctions)[0]))

        assembled = assembler.assemble(contig[:K])
        print('Path lengths', [len(found) for found in assembled])

        assert len(assembled) == 2

        for expected in (contig, branch):
            assert any(utils._equals_rc(found, expected)
                       for found in assembled)
コード例 #8
0
    def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure):
        """Assemble one side of a SNP bubble while a stop filter blocks the other.

        Both alleles are labeled, but a mutant k-mer is added to the stop
        filter; the test expects only the wildtype path to be assembled.
        """
        graph, wildtype, mutant, HDN_L, _HDN_R = snp_bubble_structure
        stop_filter = khmer.Nodegraph(K, 1e5, 4)
        labeller = khmer.GraphLabels(graph)
        assembler = khmer.SimpleLabeledAssembler(labeller,
                                                 stop_filter=stop_filter)

        junctions = graph.find_high_degree_nodes(wildtype)
        junctions += graph.find_high_degree_nodes(mutant)
        assert len(junctions) == 2

        # Wildtype gets label 1, mutant gets label 2.
        for label, allele in enumerate((wildtype, mutant), start=1):
            labeller.label_across_high_degree_nodes(allele, junctions, label)

        # Block the mutant: put the k-mer starting one base past the left
        # HDN position into the stop filter.
        blocked_start = HDN_L.pos + 1
        stop_filter.count(mutant[blocked_start:blocked_start + K])

        assembled = assembler.assemble(wildtype[:K])

        assert len(assembled) == 1
        assert any(utils._equals_rc(found, wildtype) for found in assembled)
コード例 #9
0
    def test_assemble_right_triple_fork(self, right_triple_fork_structure):
        """Assemble all three arms of a right triple fork using three labels."""
        (graph, contig, _L, _HDN, _R,
         top_sequence, bottom_sequence) = right_triple_fork_structure
        labeller = khmer.GraphLabels(graph)
        assembler = khmer.SimpleLabeledAssembler(labeller)
        arms = (contig, top_sequence, bottom_sequence)

        # Gather the high degree nodes found along each arm, in arm order.
        junctions = graph.find_high_degree_nodes(contig)
        junctions += graph.find_high_degree_nodes(top_sequence)
        junctions += graph.find_high_degree_nodes(bottom_sequence)
        print(list(junctions))

        # Labels 1, 2, 3 go to contig, top and bottom respectively.
        for label, arm in enumerate(arms, start=1):
            labeller.label_across_high_degree_nodes(arm, junctions, label)
        print(labeller.get_tag_labels(list(junctions)[0]))

        assembled = assembler.assemble(contig[:K])
        print([len(found) for found in assembled])

        assert len(assembled) == 3

        for arm in arms:
            assert any(utils._equals_rc(found, arm) for found in assembled)
コード例 #10
0
def main():
    """Label a graph with database files, then bin query reads by label.

    Each file in ``args.db`` is consumed into a labeled graph using its
    index in ``args.db`` as the label, and gets a ``.sweep`` output file in
    ``args.outdir`` wrapped in an ``IODeque``. Every read from the
    ``args.query`` files is swept against the graph and appended to the
    queue of each label it hits. Progress and summary counts go to stderr.

    Exits with status 1 if the database files or their outputs cannot be
    set up.
    """
    #info('sweep-files.py', ['sweep'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    # Raise the table size and k-mer size to their configured minimums.
    if args.max_tablesize < MIN_HSIZE:
        args.max_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, graphtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range

    outputs = {}
    # File handles behind the output queues, tracked so that every one of
    # them is closed on the way out, including on early exit.
    output_fps = []

    # Consume the database files and assign each a unique label in the
    # de Bruijn graph; open a file and output queue for each file as well.
    ht = khmer.GraphLabels(K, HT_SIZE, N_HT)
    try:
        try:
            print('consuming and labeling input sequences...',
                  file=sys.stderr)

            for i, dbfile in enumerate(args.db):
                name = args.output_prefix + os.path.basename(dbfile)
                outfp = open(os.path.join(args.outdir, name) + '.sweep', 'wb')
                output_fps.append(outfp)
                outputs[i] = IODeque(args.max_queue_size, outfp)

                for n, record in enumerate(screed.open(dbfile)):
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n),
                              file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            i)

        except (IOError, OSError) as e:
            print('!! ERROR: !!', e, file=sys.stderr)
            print('...error setting up outputs. exiting...', file=sys.stderr)
            # The message promises an exit; sweeping against a partially
            # labeled graph with missing output queues is not meaningful.
            sys.exit(1)

        # NOTE(review): assumes GraphLabels exposes n_tags() directly in the
        # installed khmer version -- confirm.
        print('done consuming input sequence. '
              'added {t} tags and {l} labels...'.format(t=ht.n_tags(),
                                                        l=ht.n_labels()),
              file=sys.stderr)

        n_orphaned = 0
        n_labeled = 0
        n_mlabeled = 0

        # Iterate through all the reads and check for the labels with which
        # they intersect. Queue to the corresponding label when found.
        for read_file in args.query:
            print('** sweeping {read_file} for labels...'.format(
                read_file=read_file),
                  file=sys.stderr)
            try:
                read_fp = screed.open(read_file)
            except (IOError, OSError) as error:
                print('!! ERROR: !!', error, file=sys.stderr)
                print('*** Could not open {fn}, skipping...'.format(
                    fn=read_file), file=sys.stderr)
                continue

            try:
                for n, record in enumerate(read_fp):
                    if n % 50000 == 0 and n > 0:
                        print('\tswept {n} reads [{nc} labeled, {no} orphaned]'
                              .format(n=n, nc=n_labeled, no=n_orphaned),
                              file=sys.stderr)
                    try:
                        labels = ht.sweep_label_neighborhood(record.sequence,
                                                             traversal_range)
                    except ValueError:
                        # sweep_label_neighborhood throws a ValueError when
                        # len(seq) < K. just skip the read and move on.
                        continue

                    if not labels:
                        n_orphaned += 1
                        continue

                    n_labeled += 1
                    if len(labels) > 1:
                        n_mlabeled += 1
                    for label in labels:
                        outputs[label].append(record)

                print('** End of file {fn}...'.format(fn=read_file),
                      file=sys.stderr)
            finally:
                # Close the reader even if sweeping this file failed.
                read_fp.close()

        # gotta output anything left in the buffers at the end!
        print('** End of run...', file=sys.stderr)
        for q in outputs.values():
            q.clear()

        print('swept {n_reads}...'.format(n_reads=n_labeled + n_orphaned),
              file=sys.stderr)
        print('...with {nc} labeled and {no} orphaned'.format(nc=n_labeled,
                                                              no=n_orphaned),
              file=sys.stderr)
        print('...and {nmc} multilabeled'.format(nmc=n_mlabeled),
              file=sys.stderr)
    finally:
        for outfp in output_fps:
            outfp.close()
コード例 #11
0
def main():
    """Label a graph from a partitioned FASTA file and sweep reads against it.

    The graph is labeled from ``args.input_fastp`` in one of three modes:
    by partition id, by sequence index, or in fixed-size groups (each group
    is also written to its own ``<prefix>_base_<g>.<ext>`` file). Reads from
    ``args.input_files`` are then swept against the graph and queued to the
    ``ReadBufferManager`` under their single label, ``'multi'`` or
    ``'orphaned'``. Finally the per-read label-count distribution and the
    per-label read counts are written to ``<prefix>.dist.txt`` and
    ``<prefix>.counts.csv`` in the output directory.

    Exits with status 1 if the labeling input cannot be consumed or split.
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    # NOTE(review): despite its name, MAX_HSIZE is applied as a lower bound
    # on the table size here -- confirm this is intended.
    if args.max_tablesize < MAX_HSIZE:
        args.max_tablesize = MAX_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, graphtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    # Default the output directory to the one holding the labeling input.
    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = next(ix)
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):      # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.GraphLabels(K, HT_SIZE, N_HT)
    try:
        print('consuming input sequences...', file=sys.stderr)
        if args.label_by_pid:
            print('...labeling by partition id (pid)', file=sys.stderr)
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print('...labeling by sequence', file=sys.stderr)
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print('...consumed {n} sequences...'.format(n=n),
                          file=sys.stderr)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print('...labeling to create groups of size {s}'.format(
                s=args.group_size), file=sys.stderr)
            label = -1
            g = 0
            outfp = None
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension),
                             'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    # Every group_size records starts a new label, and every
                    # label after the first gets a new group file.
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            # Close the finished group before opening the
                            # next one so handles do not accumulate.
                            outfp.close()
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g,
                                ext=extension), 'wb')
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n),
                              file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)

                    write_record(record, outfp)

            except (IOError, OSError) as e:
                print('!! ERROR !!', e, file=sys.stderr)
                print('...error splitting input. exiting...', file=sys.stderr)
                # Honor the "exiting" message instead of carrying on.
                sys.exit(1)
            finally:
                if outfp is not None:
                    outfp.close()

    except (IOError, OSError) as e:
        print('!! ERROR: !!', e, file=sys.stderr)
        print('...error consuming {i}. exiting...'.format(i=input_fastp),
              file=sys.stderr)
        # Honor the "exiting" message instead of sweeping a partial graph.
        sys.exit(1)

    print('done consuming input sequence. '
          'added {t} tags and {l} labels...'.format(t=ht.graph.n_tags(),
                                                    l=ht.n_labels()),
          file=sys.stderr)

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start_t = time.perf_counter()
    for read_file in args.input_files:
        print('** sweeping {read_file} for labels...'.format(
            read_file=read_file), file=sys.stderr)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except (IOError, OSError) as error:
            print('!! ERROR: !!', error, file=sys.stderr)
            print('*** Could not open {fn}, skipping...'.format(
                fn=read_file), file=sys.stderr)
            continue

        try:
            for n, record in enumerate(read_fp):
                if n % 50000 == 0:
                    end_t = time.perf_counter()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print('\tswept {n} reads [{nc} labeled, {no} orphaned] '
                          '** {sec}s ({sect}s total)'.format(
                              n=n, nc=n_labeled, no=n_orphaned,
                              sec=batch_t, sect=file_t),
                          file=sys.stderr)
                    start_t = time.perf_counter()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError:
                    # Reads the sweep rejects are skipped and not counted.
                    continue

                if hasattr(record, 'quality'):
                    seq_str = fmt_fastq(name, seq, record.quality, labels)
                else:
                    seq_str = fmt_fasta(name, seq, labels)
                label_number_dist.append(len(labels))

                if not labels:
                    n_orphaned += 1
                    output_buffer.queue(seq_str, 'orphaned')
                    label_dict['orphaned'] += 1
                    continue

                n_labeled += 1
                if len(labels) > 1:
                    output_buffer.queue(seq_str, 'multi')
                    n_mlabeled += 1
                    label_dict['multi'] += 1
                else:
                    output_buffer.queue(seq_str, labels[0])
                    label_dict[labels[0]] += 1

            print('** End of file {fn}...'.format(fn=read_file),
                  file=sys.stderr)
            output_buffer.flush_all()
        finally:
            # Close the reader even if sweeping this file failed.
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print('** End of run...', file=sys.stderr)
    output_buffer.flush_all()

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print('! WARNING: Sweep finished with errors !', file=sys.stderr)
        print('** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors), file=sys.stderr)
        print('** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors), file=sys.stderr)

    print('swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned), file=sys.stderr)
    print('...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned), file=sys.stderr)
    print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr)

    # One line per swept read: the number of labels that read hit.
    print('** outputting label number distribution...', file=sys.stderr)
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'w', encoding='utf-8') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    # One "label,count" row per bucket, including 'multi' and 'orphaned'.
    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print('** outputting label read counts...', file=sys.stderr)
    with open(fn, 'w', encoding='utf-8') as outfp:
        for key, count in label_dict.items():
            outfp.write('{l},{c}\n'.format(l=key, c=count))