import os
import sys

from khmer import Nodegraph
from khmer.kfile import check_input_files, check_space
from khmer.khmer_args import sanitize_help


# get_parser() is defined elsewhere in the original script and is not
# reproduced here.
def main():
    args = sanitize_help(get_parser()).parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    nodegraph = Nodegraph(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for filename in filenames:
        check_input_files(filename, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    nodegraph.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
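
A condensed sketch of the same flow without the argparse plumbing; the
file names here are made up for illustration, and the k-mer size must
match the one used to build the partition map:

from khmer import Nodegraph

# a tiny Nodegraph suffices: only the partition map matters here
nodegraph = Nodegraph(32, 1, 1)
nodegraph.load_partitionmap('example.pmap.merged')
n_parts = nodegraph.output_partitions('reads.fa', 'reads.fa.part')
print('wrote', n_parts, 'partitions')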
Example #2

import khmer_tst_utils as utils
from khmer import Nodegraph


# `Countingtype` is a pytest fixture parametrized over khmer's counting
# table types; `params_1m` and `PRIMES_1m` are module-level fixtures in
# the original test file.
def test_read_cleaning_abundance_distribution(Countingtype):
    infile = utils.get_test_data('valid-read-testing.fq')

    x = Countingtype(15, *params_1m)
    y = Nodegraph(15, 1, 1, primes=PRIMES_1m)

    x.consume_seqfile(infile)

    dist = x.abundance_distribution(infile, y)
    assert dist[1] == 35  # k-mers with non-ACGTN => ignored.
    assert dist[2] == 69
Example #3

# `AnyTabletype` is a pytest fixture parametrized over khmer's k-mer
# table types; see the note on fixtures above.
def test_abund_dist_A(AnyTabletype):
    A_filename = utils.get_test_data('all-A.fa')

    kh = AnyTabletype(4)
    tracking = Nodegraph(4, 1, 1, primes=PRIMES_1m)

    kh.consume_seqfile(A_filename)
    dist = kh.abundance_distribution(A_filename, tracking)

    print(dist[:10])
    assert sum(dist) == 1  # all-A.fa holds a single distinct 4-mer, AAAA
    assert dist[0] == 0
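
Because the tracking Nodegraph deduplicates k-mers, the distribution
counts each distinct k-mer once, binned by its abundance; with only A's
in the input there is exactly one distinct 4-mer (AAAA), hence
sum(dist) == 1. A self-contained sketch of the same semantics, assuming
khmer's Counttable as the counting type (the temporary FASTA content is
made up):

import tempfile

from khmer import Counttable, Nodegraph

with tempfile.NamedTemporaryFile('w', suffix='.fa', delete=False) as fp:
    fp.write('>r1\nAAAAAAAA\n')  # 8 bases => five overlapping AAAA 4-mers
    fasta = fp.name

ct = Counttable(4, 1e6, 4)
ct.consume_seqfile(fasta)
tracking = Nodegraph(4, 1e6, 4)
dist = ct.abundance_distribution(fasta, tracking)
assert sum(dist) == 1  # one distinct k-mer in total
assert dist[5] == 1    # ...observed 8 - 4 + 1 = 5 times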
Example #4

import os
import sys

from khmer import Nodegraph
from khmer.kfile import check_input_files, check_space
from khmer.khmer_args import info, sanitize_help
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader


# get_parser() is defined elsewhere in the original script.
def main():
    info('filter-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        # use a context manager so the output handle is closed per file
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn)
            tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
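
The trimming primitive can also be exercised directly; a minimal sketch
(the stop-tag k-mer and the sequence are made up):

from khmer import Nodegraph

ng = Nodegraph(4, 1e4, 2)
ng.add_stop_tag('TTTT')  # mark a k-mer as a stop tag
trim_seq, trim_at = ng.trim_on_stoptags('ACGGCATTTTAGCA')
# trim_at is the position the sequence was cut at; process_fn above
# keeps a record only when trim_at >= ksize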
Example #5

import khmer_tst_utils as utils
from khmer import GraphLabels, Nodegraph


def test_get_labels_save_load():
    lb_pre = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_seqfile_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old GraphLabels
    del lb_pre

    # create new, load labels & tags
    graph = Nodegraph(20, 1e7, 4)
    lb = GraphLabels.load(savepath, graph)

    labels = list(lb.labels())
    expected = [0, 1, 2, 3]
    # mutual containment in both directions == set equality
    assert set(labels) == set(expected)
Example #6

from khmer import Nodegraph


def test_bad_primes():
    try:
        Nodegraph(6, 1, 1, primes=["a", "b", "c"])
        assert 0, "this should fail"
    except TypeError as e:
        print(str(e))
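
The same check reads more compactly with pytest's raises() context
manager; an equivalent sketch:

import pytest

from khmer import Nodegraph


def test_bad_primes_pytest_style():
    # non-integer primes should raise TypeError, as above
    with pytest.raises(TypeError):
        Nodegraph(6, 1, 1, primes=["a", "b", "c"])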