def main():
    """Annotate sequences with partition IDs from a merged partition map.

    Loads ``<graphbase>.pmap.merged`` into a Nodegraph and writes, for each
    input sequence file, a ``<basename>.part`` file in the current directory.
    """
    args = sanitize_help(get_parser()).parse_args()
    ksize = args.ksize
    filenames = args.input_filenames

    # minimal graph: the partition map supplies the real structure
    nodegraph = Nodegraph(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    # validate all inputs up front, before any expensive work
    check_input_files(partitionmap_file, args.force)
    for seqfile in filenames:
        check_input_files(seqfile, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    nodegraph.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
def test_read_cleaning_abundance_distribution(Countingtype):
    """Abundance distribution over reads that include non-ACGT characters."""
    infile = utils.get_test_data('valid-read-testing.fq')

    counts = Countingtype(15, *params_1m)
    tracking = Nodegraph(15, 1, 1, primes=PRIMES_1m)

    counts.consume_seqfile(infile)
    dist = counts.abundance_distribution(infile, tracking)

    # k-mers with non-ACGTN => ignored.
    assert dist[1] == 35
    assert dist[2] == 69
def test_abund_dist_A(AnyTabletype):
    """A homopolymer input yields exactly one distinct k-mer."""
    infile = utils.get_test_data('all-A.fa')

    table = AnyTabletype(4)
    tracking = Nodegraph(4, 1, 1, primes=PRIMES_1m)

    table.consume_seqfile(infile)
    dist = table.abundance_distribution(infile, tracking)
    print(dist[:10])

    # only one distinct 4-mer (AAAA) exists, and it does appear
    assert sum(dist) == 1
    assert dist[0] == 0
def main():
    """Trim input sequences at stop tags loaded from a .stoptags file.

    For each input file, writes trimmed reads to ``<basename>.stopfilt`` in
    the current directory. Reads containing 'N' or trimmed below k are
    dropped entirely.
    """
    info('filter-stoptags.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    # validate inputs and available disk space before doing any work
    for _ in infiles:
        check_input_files(_, args.force)
    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        # (None, None) signals the threaded processor to drop the read.
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        # keep the read only if at least one full k-mer survives trimming
        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        # BUGFIX: the output handle was previously never closed; use a
        # context manager so buffered output is flushed even on error.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn)
            tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def test_get_labels_save_load():
    """Labels and tags survive a save/load round trip."""
    lb_pre = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_seqfile_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old GraphLabels
    del lb_pre

    # create new, load labels & tags
    graph = Nodegraph(20, 1e7, 4)
    lb = GraphLabels.load(savepath, graph)

    # labels must be exactly {0, 1, 2, 3}: nothing missing, nothing extra
    assert set(lb.labels()) == {0, 1, 2, 3}
def test_bad_primes():
    """Constructing a Nodegraph with non-integer primes raises TypeError."""
    try:
        Nodegraph(6, 1, 1, primes=["a", "b", "c"])
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "this should fail"