Example #1
def test_error_create():
    from khmer import _GraphLabels
    try:
        _GraphLabels(None)
        assert 0, "This should fail."
    except ValueError as err:
        print(str(err))
Example #2
def test_error_create():
    from khmer import _GraphLabels
    try:
        _GraphLabels(None)
        assert 0, "This should fail."
    except ValueError as err:
        print(str(err))
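
Both copies of this test exercise only the failure path (constructing _GraphLabels from None). For contrast, a minimal sketch of a successful construction, assuming the khmer API used in the later examples (the k-size and table size are arbitrary placeholders):

import khmer

graph = khmer.Nodegraph(21, 1e6, 4)   # any khmer graph can back the label store
lh = khmer._GraphLabels(graph)        # succeeds when given a real graph
assert lh.n_labels() == 0             # assumption: a fresh label store is empty
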
Example #3
    def test_assemble_left_double_fork(self, left_double_fork_structure):
        # assemble entire contig + branch points b/c of labels; start from end
        graph, contig, L, HDN, R, branch = left_double_fork_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        # first try without the labels
        paths = asm.assemble(contig[-K:])

        assert len(paths) == 1
        # without labels, should get the beginning of the HDN through the end
        assert paths[0] == contig[HDN.pos:]

        # now add labels and check that we get two full length paths
        hdn = graph.find_high_degree_nodes(contig)
        hdn += graph.find_high_degree_nodes(branch)
        print(list(hdn))
        lh.label_across_high_degree_nodes(contig, hdn, 1)
        lh.label_across_high_degree_nodes(branch, hdn, 2)
        print(lh.get_tag_labels(list(hdn)[0]))

        paths = asm.assemble(contig[-K:])

        assert len(paths) == 2

        assert any(utils._equals_rc(path, contig) for path in paths)
        assert any(utils._equals_rc(path, branch) for path in paths)
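
The test above is the core label-then-assemble workflow that recurs throughout these examples. A condensed sketch of that pattern, assuming the same khmer API; the function name and K value are placeholders, and the two input sequences are assumed to share a high-degree branch point so that the labels matter:

import khmer

K = 21

def assemble_with_labels(contig, branch):
    # build a presence/absence graph containing both sequences
    graph = khmer.Nodegraph(K, 1e6, 4)
    graph.consume(contig)
    graph.consume(branch)

    lh = khmer._GraphLabels(graph)
    asm = khmer.SimpleLabeledAssembler(lh)

    # find the branch points (high-degree nodes) touched by either sequence
    hdn = graph.find_high_degree_nodes(contig)
    hdn += graph.find_high_degree_nodes(branch)

    # record which sequence spans each branch point
    lh.label_across_high_degree_nodes(contig, hdn, 1)
    lh.label_across_high_degree_nodes(branch, hdn, 2)

    # the labels let the assembler continue through the branch points, so
    # each labeled sequence should come back as its own full-length path
    return asm.assemble(contig[-K:])
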
Example #4
def main():
    p = argparse.ArgumentParser()
    p.add_argument('assembly')
    p.add_argument('readfiles', nargs='+')
    p.add_argument('-o', '--output', default=None)
    p.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    p.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
                   type=float)
    args = p.parse_args()

    ng = khmer.Nodegraph(args.ksize, args.tablesize, 4)

    # first, consume & tag the reads

    for readfile in args.readfiles:
        print('loading & tagging reads from:', readfile)
        ng.consume_seqfile_and_tag(readfile)

    ## next, consume & label the assembly

    print('loading & tagging assembly from:', args.assembly)
    lh = khmer._GraphLabels(ng)
    lh.consume_seqfile_and_tag_with_labels(args.assembly)

    if args.output:
        outfp = open(args.output, 'w')

    ## finally, walk across the reads & find those with no labels
        
    n = 0
    m = 0

    for readfile in args.readfiles:
        print('loading reads from:', readfile)
        if not args.output:
            outfile = os.path.basename(readfile) + '.leftover2'
            outfp = open(outfile, 'w')
            print('writing to:', outfile, file=sys.stderr)

        for record in screed.open(readfile):
            if n % 100000 == 0 and n:
                print('...', readfile, n, m, file=sys.stderr)
            x = ng.get_tags_and_positions(record.sequence)

            do_extract = False
            for (pos, tag) in x:
                if not lh.get_tag_labels(tag):
                    do_extract = True
                    break

            if do_extract:
                khmer.utils.write_record(record, outfp)
                m += 1

            n += 1

        if not args.output:
            outfp.close()

    print('%d left out of assembly, of %d reads' % (m, n), file=sys.stderr)
Example #5
    def test_assemble_left_double_fork(self, left_double_fork_structure):
        # assemble entire contig + branch points b/c of labels; start from end
        graph, contig, L, HDN, R, branch = left_double_fork_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        # first try without the labels
        paths = asm.assemble(contig[-K:])

        assert len(paths) == 1
        # without labels, should get the beginning of the HDN through the end
        assert paths[0] == contig[HDN.pos :]

        # now add labels and check that we get two full length paths
        hdn = graph.find_high_degree_nodes(contig)
        hdn += graph.find_high_degree_nodes(branch)
        print(list(hdn))
        lh.label_across_high_degree_nodes(contig, hdn, 1)
        lh.label_across_high_degree_nodes(branch, hdn, 2)
        print(lh.get_tag_labels(list(hdn)[0]))

        paths = asm.assemble(contig[-K:])

        assert len(paths) == 2

        assert any(utils._equals_rc(path, contig) for path in paths)
        assert any(utils._equals_rc(path, branch) for path in paths)
Example #6
def main():
    p = argparse.ArgumentParser()
    p.add_argument('assembly')
    p.add_argument('readfiles', nargs='+')
    p.add_argument('-o', '--output', default=None)
    p.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    p.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
                   type=float)
    args = p.parse_args()

    ng = khmer.Nodegraph(args.ksize, args.tablesize, 4)

    # first, consume & tag the reads

    for readfile in args.readfiles:
        print('loading & tagging reads from:', readfile)
        ng.consume_seqfile_and_tag(readfile)

    ## next, consume & label the assembly

    print('loading & tagging assembly from:', args.assembly)
    lh = khmer._GraphLabels(ng)
    lh.consume_seqfile_and_tag_with_labels(args.assembly)

    if args.output:
        outfp = open(args.output, 'w')

    ## finally, walk across the reads & find those with no labels
        
    n = 0
    m = 0

    for readfile in args.readfiles:
        print('loading reads from:', readfile)
        if not args.output:
            outfile = os.path.basename(readfile) + '.leftover2'
            outfp = open(outfile, 'w')
            print('writing to:', outfile, file=sys.stderr)

        for record in screed.open(readfile):
            if n % 100000 == 0 and n:
                print('...', readfile, n, m, file=sys.stderr)
            x = ng.get_tags_and_positions(record.sequence)

            do_extract = False
            for (pos, tag) in x:
                if not lh.get_tag_labels(tag):
                    do_extract = True
                    break

            if do_extract:
                khmer.utils.write_record(record, outfp)
                m += 1

            n += 1

        if not args.output:
            outfp.close()

    print('%d left out of assembly, of %d reads' % (m, n), file=sys.stderr)
Example #7
def test_consume_seqfile_and_tag_with_labels(Graphtype):
    infile = utils.get_test_data('valid-read-testing.fq')

    # read this in consume_and_tag
    graph = Graphtype(15, PRIMES_1m)
    x = _GraphLabels(graph)
    x.consume_seqfile_and_tag_with_labels(infile)

    assert x.n_labels() == 9
Example #8
def test_consume_partitioned_seqfile_and_label(Graphtype):
    infile = utils.get_test_data('valid-read-testing.fq')

    # read this in consume_and_tag
    graph = Graphtype(15, PRIMES_1m)
    x = _GraphLabels(graph)
    x.consume_partitioned_fasta_and_tag_with_labels(infile)

    assert x.n_labels() == 9
Example #9
def test_consume_partitioned_seqfile_and_label(Graphtype):
    infile = utils.get_test_data('valid-read-testing.fq')

    # read this in consume_and_tag
    graph = Graphtype(15, *params_1m)
    x = _GraphLabels(graph)
    x.consume_partitioned_fasta_and_tag_with_labels(infile)

    assert x.n_labels() == 9
Example #10
    def test_assemble_tandem_repeats(self, tandem_repeat_structure):
        # assemble one copy of a tandem repeat
        graph, repeat, tandem_repeats = tandem_repeat_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)
        paths = asm.assemble(repeat[:K])

        assert len(paths) == 1
        # There are K-1 k-mers spanning the junction between
        # the beginning and end of the repeat
        assert len(paths[0]) == len(repeat) + K - 1
Example #11
    def test_assemble_tandem_repeats(self, tandem_repeat_structure):
        # assemble one copy of a tandem repeat
        graph, repeat, tandem_repeats = tandem_repeat_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)
        paths = asm.assemble(repeat[:K])

        assert len(paths) == 1
        # There are K-1 k-mers spanning the junction between
        # the beginning and end of the repeat
        assert len(paths[0]) == len(repeat) + K - 1
Example #12
    def test_assemble_snp_bubble_single(self, snp_bubble_structure):
        # assemble entire contig + one of two paths through a bubble
        graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(wildtype)
        assert len(hdn) == 2
        lh.label_across_high_degree_nodes(wildtype, hdn, 1)

        paths = asm.assemble(wildtype[:K])

        assert len(paths) == 1
        assert utils._equals_rc(paths[0], wildtype)
Example #13
    def test_assemble_snp_bubble_single(self, snp_bubble_structure):
        # assemble entire contig + one of two paths through a bubble
        graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(wildtype)
        assert len(hdn) == 2
        lh.label_across_high_degree_nodes(wildtype, hdn, 1)

        paths = asm.assemble(wildtype[:K])

        assert len(paths) == 1
        assert utils._equals_rc(paths[0], wildtype)
Example #14
    def test_beginning_to_end_across_tip(self, right_tip_structure):
        # assemble entire contig, ignoring branch point b/c of labels
        graph, contig, L, HDN, R, tip = right_tip_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)
        hdn = graph.find_high_degree_nodes(contig)
        # L, HDN, and R will be labeled with 1
        lh.label_across_high_degree_nodes(contig, hdn, 1)

        path = asm.assemble(contig[:K])

        assert len(path) == 1, "there should only be one path"
        path = path[0]  # @CTB

        assert len(path) == len(contig)
        assert utils._equals_rc(path, contig)
Example #15
    def test_beginning_to_end_across_tip(self, right_tip_structure):
        # assemble entire contig, ignoring branch point b/c of labels
        graph, contig, L, HDN, R, tip = right_tip_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)
        hdn = graph.find_high_degree_nodes(contig)
        # L, HDN, and R will be labeled with 1
        lh.label_across_high_degree_nodes(contig, hdn, 1)

        path = asm.assemble(contig[:K])

        assert len(path) == 1, "there should only be one path"
        path = path[0]  # @CTB

        assert len(path) == len(contig)
        assert utils._equals_rc(path, contig)
Example #16
def main():
    p = argparse.ArgumentParser()
    p.add_argument('fastq_files', nargs='+')
    args = p.parse_args()

    cg = khmer.Countgraph(K, 1e8, 4)

    kept = 0
    hdn = khmer.HashSet(K)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < K:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < K:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
            elif cov == 30:
                contigs = lh.assemble_labeled_path(record.sequence[:K])
                for contig in contigs:
                    for t in translate(contig):
                        for o in extract_orfs(t):
                            if hash(o) not in output:
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
Example #17
def main():
    p = argparse.ArgumentParser()
    p.add_argument('fastq_files', nargs='+')
    args = p.parse_args()

    cg = khmer.Countgraph(K, 1e8, 4)

    kept = 0
    hdn = khmer.HashSet(K)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < K:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < K:
                    continue
                
                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
            elif cov == 30:
                contigs = lh.assemble_labeled_path(record.sequence[:K])
                for contig in contigs:
                    for t in translate(contig):
                        for o in extract_orfs(t):
                            if hash(o) not in output:
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
Example #18
    def test_assemble_snp_bubble_both(self, snp_bubble_structure):
        # assemble entire contig + both paths
        graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(wildtype)
        hdn += graph.find_high_degree_nodes(mutant)
        assert len(hdn) == 2
        lh.label_across_high_degree_nodes(wildtype, hdn, 1)
        lh.label_across_high_degree_nodes(mutant, hdn, 2)

        paths = asm.assemble(wildtype[:K])

        assert len(paths) == 2

        assert any(utils._contains_rc(wildtype, path) for path in paths)
        assert any(utils._contains_rc(mutant, path) for path in paths)
Example #19
    def test_assemble_snp_bubble_both(self, snp_bubble_structure):
        # assemble entire contig + both paths
        graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(wildtype)
        hdn += graph.find_high_degree_nodes(mutant)
        assert len(hdn) == 2
        lh.label_across_high_degree_nodes(wildtype, hdn, 1)
        lh.label_across_high_degree_nodes(mutant, hdn, 2)

        paths = asm.assemble(wildtype[:K])

        assert len(paths) == 2

        assert any(utils._contains_rc(wildtype, path) for path in paths)
        assert any(utils._contains_rc(mutant, path) for path in paths)
Example #20
    def test_assemble_right_double_fork(self, right_double_fork_structure):
        # assemble two contigs from a double forked structure
        graph, contig, L, HDN, R, branch = right_double_fork_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(contig)
        hdn += graph.find_high_degree_nodes(branch)
        print(list(hdn))
        lh.label_across_high_degree_nodes(contig, hdn, 1)
        lh.label_across_high_degree_nodes(branch, hdn, 2)
        print(lh.get_tag_labels(list(hdn)[0]))

        paths = asm.assemble(contig[:K])
        print("Path lengths", [len(x) for x in paths])

        assert len(paths) == 2

        assert any(utils._equals_rc(path, contig) for path in paths)
        assert any(utils._equals_rc(path, branch) for path in paths)
Example #21
    def test_assemble_right_double_fork(self, right_double_fork_structure):
        # assemble two contigs from a double forked structure
        graph, contig, L, HDN, R, branch = right_double_fork_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(contig)
        hdn += graph.find_high_degree_nodes(branch)
        print(list(hdn))
        lh.label_across_high_degree_nodes(contig, hdn, 1)
        lh.label_across_high_degree_nodes(branch, hdn, 2)
        print(lh.get_tag_labels(list(hdn)[0]))

        paths = asm.assemble(contig[:K])
        print('Path lengths', [len(x) for x in paths])

        assert len(paths) == 2

        assert any(utils._equals_rc(path, contig) for path in paths)
        assert any(utils._equals_rc(path, branch) for path in paths)
Example #22
    def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure):
        # assemble one side of bubble, blocked with stop_bf,
        # when labels on both branches
        # stop_bf should trip a filter failure, negating the label spanning
        graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure
        stop_bf = khmer.Nodegraph(K, 1e5, 4)
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(wildtype)
        hdn += graph.find_high_degree_nodes(mutant)
        assert len(hdn) == 2
        lh.label_across_high_degree_nodes(wildtype, hdn, 1)
        lh.label_across_high_degree_nodes(mutant, hdn, 2)

        # do the labeling, but block the mutant with stop_bf
        stop_bf.count(mutant[HDN_L.pos + 1:HDN_L.pos + K + 1])
        paths = asm.assemble(wildtype[:K], stop_bf)

        assert len(paths) == 1
        assert any(utils._equals_rc(path, wildtype) for path in paths)
Example #23
    def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure):
        # assemble one side of bubble, blocked with stop_bf,
        # when labels on both branches
        # stop_bf should trip a filter failure, negating the label spanning
        graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure
        stop_bf = khmer.Nodegraph(K, 1e5, 4)
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(wildtype)
        hdn += graph.find_high_degree_nodes(mutant)
        assert len(hdn) == 2
        lh.label_across_high_degree_nodes(wildtype, hdn, 1)
        lh.label_across_high_degree_nodes(mutant, hdn, 2)

        # do the labeling, but block the mutant with stop_bf
        stop_bf.count(mutant[HDN_L.pos + 1 : HDN_L.pos + K + 1])
        paths = asm.assemble(wildtype[:K], stop_bf)

        assert len(paths) == 1
        assert any(utils._equals_rc(path, wildtype) for path in paths)
Example #24
def main():
    p = argparse.ArgumentParser()
    p.add_argument('contig_files', nargs='+')
    args = p.parse_args()

    ng = khmer.Nodegraph(K, 1e8, 4)
    starts = []

    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            ng.consume(record.sequence)
            starts.append(record.sequence[:K])

    hdn = khmer.HashSet(K)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            hdn += ng.find_high_degree_nodes(record.sequence)

    lh = khmer._GraphLabels(ng)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            # label every record; the print above is only a progress message
            lh.label_across_high_degree_nodes(record.sequence, hdn, n)

    counter = 0
    for k in starts:
        contigs = lh.assemble_labeled_path(k)
        if not contigs:
            print('nada...')
        for c in contigs:
            print('>%d\n%s' % (counter, c))
            counter += 1
Example #25
    def test_assemble_right_triple_fork(self, right_triple_fork_structure):
        # assemble three contigs from a triple fork
        (graph, contig, L, HDN, R, top_sequence,
         bottom_sequence) = right_triple_fork_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(contig)
        hdn += graph.find_high_degree_nodes(top_sequence)
        hdn += graph.find_high_degree_nodes(bottom_sequence)
        print(list(hdn))
        lh.label_across_high_degree_nodes(contig, hdn, 1)
        lh.label_across_high_degree_nodes(top_sequence, hdn, 2)
        lh.label_across_high_degree_nodes(bottom_sequence, hdn, 3)
        print(lh.get_tag_labels(list(hdn)[0]))

        paths = asm.assemble(contig[:K])
        print([len(x) for x in paths])

        assert len(paths) == 3

        assert any(utils._equals_rc(path, contig) for path in paths)
        assert any(utils._equals_rc(path, top_sequence) for path in paths)
        assert any(utils._equals_rc(path, bottom_sequence) for path in paths)
Example #26
def main():
    p = argparse.ArgumentParser()
    p.add_argument('contig_files', nargs='+')
    args = p.parse_args()

    ng = khmer.Nodegraph(K, 1e8, 4)
    starts = []

    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            ng.consume(record.sequence)
            starts.append(record.sequence[:K])

    hdn = khmer.HashSet(K)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            hdn += ng.find_high_degree_nodes(record.sequence)

    lh = khmer._GraphLabels(ng)
    for filename in args.contig_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n)
            # label every record; the print above is only a progress message
            lh.label_across_high_degree_nodes(record.sequence, hdn, n)

    counter = 0
    for k in starts:
        contigs = lh.assemble_labeled_path(k)
        if not contigs:
            print('nada...')
        for c in contigs:
            print('>%d\n%s' % (counter, c))
            counter += 1
Example #27
    def test_assemble_right_triple_fork(self, right_triple_fork_structure):
        # assemble three contigs from a triple fork
        (graph, contig, L, HDN, R, top_sequence,
         bottom_sequence) = right_triple_fork_structure
        lh = khmer._GraphLabels(graph)
        asm = khmer.SimpleLabeledAssembler(lh)

        hdn = graph.find_high_degree_nodes(contig)
        hdn += graph.find_high_degree_nodes(top_sequence)
        hdn += graph.find_high_degree_nodes(bottom_sequence)
        print(list(hdn))
        lh.label_across_high_degree_nodes(contig, hdn, 1)
        lh.label_across_high_degree_nodes(top_sequence, hdn, 2)
        lh.label_across_high_degree_nodes(bottom_sequence, hdn, 3)
        print(lh.get_tag_labels(list(hdn)[0]))

        paths = asm.assemble(contig[:K])
        print([len(x) for x in paths])

        assert len(paths) == 3

        assert any(utils._equals_rc(path, contig) for path in paths)
        assert any(utils._equals_rc(path, top_sequence) for path in paths)
        assert any(utils._equals_rc(path, bottom_sequence) for path in paths)
Example #28
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x', '--tablesize', default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    #parser.add_argument('--gml', action='store_true')
    args = parser.parse_args()

    assert args.ksize % 2, "ksize must be odd"
    assert args.output, "you probably want an output file"

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.

    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    n = 0

    # load in all of the input sequences, one file at a time.
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...', seqfile, n)
            graph.consume(record.sequence)

    # complain if too small set of graphs was used.
    fp_rate = khmer.calc_expected_collisions(graph,
                                             args.force, max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            # walk across sequences, find all high degree nodes,
            # name them and cherish them. Don't do this on identical sequences.
            if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
                stop_bf2.consume(record.sequence)
                degree_nodes += graph.find_high_degree_nodes(record.sequence)
    del stop_bf2

    if not len(degree_nodes):
        print('no high degree nodes; exiting.')
        sys.exit(0)

    ####

    lh = khmer._GraphLabels(graph)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            lh.label_across_high_degree_nodes(record.sequence, degree_nodes, n)

    print('num labels:', lh.n_labels())

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree nodes into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path.  also
    # track minhashes while we're at it.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        k_id = pathy.segments_r[k]

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.segments_r[nk]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes, lh)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML
    if args.output:
        import graph_writer

        print('saving to', args.output)
        fp = open(args.output, 'w')
        w = graph_writer.GmlWriter(fp, [], [])

        for k, v in pathy.segments.items():
            w.add_vertex(k, v, [])

        for k, v in pathy.adjacencies.items():
            for edge in v:
                w.add_edge(k, edge, [])
Example #29
def main():
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o',
                   type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)

    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()
    statswriter = csv.DictWriter(args.o,
                                 delimiter=',',
                                 fieldnames=[
                                     'read_n', 'action', 'cov', 'n_hdn',
                                     'contig_n', 'orf_n', 'new'
                                 ])

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
                statswriter.writerow({
                    'read_n': n,
                    'action': 'c',
                    'cov': cov,
                    'n_hdn': None,
                    'contig_n': None,
                    'orf_n': None,
                    'new': None
                })
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < args.ksize:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                statswriter.writerow({
                    'read_n': n,
                    'action': 'l',
                    'cov': cov,
                    'n_hdn': len(hdn),
                    'contig_n': None,
                    'orf_n': None,
                    'new': None
                })
            elif cov == 30:
                contigs = lh.assemble_labeled_path(
                    record.sequence[:args.ksize])
                for contig_n, contig in enumerate(contigs):
                    statswriter.writerow({
                        'read_n': n,
                        'action': 'a',
                        'cov': cov,
                        'n_hdn': None,
                        'contig_n': contig_n,
                        'orf_n': None,
                        'new': None
                    })
                    for t in translate(contig):
                        for orf_n, o in enumerate(extract_orfs(t)):
                            if hash(o) not in output:
                                new = True
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
                            else:
                                new = False
                            statswriter.writerow({
                                'read_n': n,
                                'action': 'a',
                                'cov': cov,
                                'n_hdn': None,
                                'contig_n': contig_n,
                                'orf_n': orf_n,
                                'new': new
                            })
Example #30
def main():
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o', type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)

    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()
    statswriter = csv.DictWriter(args.o, delimiter=',',
                                 fieldnames=['read_n', 'action', 'cov', 'n_hdn',
                                             'contig_n', 'orf_n', 'new'])

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                kept += 1
                cg.consume(record.sequence)
                statswriter.writerow({'read_n': n, 'action': 'c', 'cov': cov,
                                      'n_hdn': None, 'contig_n': None, 
                                      'orf_n': None, 'new': None})
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < args.ksize:
                    continue
                
                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1
                statswriter.writerow({'read_n': n, 'action': 'l', 'cov': cov,
                                      'n_hdn': len(hdn), 'contig_n': None, 
                                      'orf_n': None, 'new': None})
            elif cov == 30:
                contigs = lh.assemble_labeled_path(record.sequence[:args.ksize])
                for contig_n, contig in enumerate(contigs):
                    statswriter.writerow({'read_n': n, 'action': 'a', 'cov': cov,
                                          'n_hdn': None, 'contig_n': contig_n, 
                                          'orf_n': None, 'new': None})
                    for t in translate(contig):
                        for orf_n, o in enumerate(extract_orfs(t)):
                            if hash(o) not in output:
                                new = True
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
                            else:
                                new = False
                            statswriter.writerow({'read_n': n, 'action': 'a', 'cov': cov,
                                                  'n_hdn': None, 'contig_n': contig_n, 
                                                  'orf_n': orf_n, 'new': new})
Example #31
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x',
                        '--tablesize',
                        default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    #parser.add_argument('--gml', action='store_true')
    args = parser.parse_args()

    assert args.ksize % 2, "ksize must be odd"
    assert args.output, "you probably want an output file"

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.

    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    n = 0

    # load in all of the input sequences, one file at a time.
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...', seqfile, n)
            graph.consume(record.sequence)

    # complain if too small set of graphs was used.
    fp_rate = khmer.calc_expected_collisions(graph,
                                             args.force,
                                             max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            # walk across sequences, find all high degree nodes,
            # name them and cherish them. Don't do this on identical sequences.
            if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
                stop_bf2.consume(record.sequence)
                degree_nodes += graph.find_high_degree_nodes(record.sequence)
    del stop_bf2

    if not len(degree_nodes):
        print('no high degree nodes; exiting.')
        sys.exit(0)

    ####

    lh = khmer._GraphLabels(graph)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            lh.label_across_high_degree_nodes(record.sequence, degree_nodes, n)

    print('num labels:', lh.n_labels())

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree nodes into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path.  also
    # track minhashes while we're at it.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        k_id = pathy.segments_r[k]

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.segments_r[nk]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes, lh)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML
    if args.output:
        import graph_writer

        print('saving to', args.output)
        fp = open(args.output, 'w')
        w = graph_writer.GmlWriter(fp, [], [])

        for k, v in pathy.segments.items():
            w.add_vertex(k, v, [])

        for k, v in pathy.adjacencies.items():
            for edge in v:
                w.add_edge(k, edge, [])
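
Finally, the per-read label check used in Examples #4 and #6 can be factored into a small helper; a sketch assuming the same graph and label-store API (the helper name is hypothetical):

def sequence_touches_label(graph, lh, sequence):
    # True if any tag overlapping `sequence` carries at least one label
    for pos, tag in graph.get_tags_and_positions(sequence):
        if lh.get_tag_labels(tag):
            return True
    return False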