def run(description):
    '''Command-line entry point for the split_by_base_count task.

    Builds the argument parser, parses the process command line and
    forwards the parsed values to tasks.split_by_base_count.

    The description argument is accepted but not used here: the parser
    is given its own fixed description text.
    '''
    arg_parser = argparse.ArgumentParser(
        usage='fastaq split_by_base_count [options] <infile> <outprefix> <max_bases>',
        description='Splits a multi sequence file into separate files. Does not split sequences. Puts up to max_bases into each split file. The exception is that any sequence longer than max_bases is put into its own file.',
    )

    # Required positional arguments, in the order shown in the usage line.
    arg_parser.add_argument(
        'infile',
        help='Name of input file to be split',
    )
    arg_parser.add_argument(
        'outprefix',
        help='Name of output file',
    )
    arg_parser.add_argument(
        'max_bases',
        type=int,
        metavar='max_bases',
        help='Max bases in each output split file',
    )

    # Optional cap on sequences per file; stays None when not supplied.
    arg_parser.add_argument(
        '--max_seqs',
        type=int,
        metavar='INT',
        help='Max number of sequences in each output split file [no limit]',
    )

    args = arg_parser.parse_args()
    tasks.split_by_base_count(
        args.infile,
        args.outprefix,
        args.max_bases,
        args.max_seqs,
    )
def test_split_by_base_count(self):
    '''Check that fasta/q files get split by base count correctly

    Three scenarios are exercised, each comparing the files written by
    tasks.split_by_base_count against reference files kept in data_dir:
    several max_bases values, a max_bases value combined with a limit
    on sequences per file, and an input whose reference outputs are
    named for a max_bases of 2.
    '''
    infile = os.path.join(data_dir, 'sequences_test_split_test.fa')
    outprefix = 'tmp.sequences_test_split_test.fa.test'

    # Maps each max_bases value to the suffixes of the files expected.
    length2files = {2: ['1','2','3','4'], 3: ['1','2','3'], 4: ['1', '2', '3'], 6: ['1', '2']}

    for max_bases, suffixes in length2files.items():
        tasks.split_by_base_count(infile, outprefix, max_bases)
        for suffix in suffixes:
            produced = outprefix + '.' + suffix
            reference = infile + '.' + str(max_bases) + '.' + suffix
            self.assertTrue(filecmp.cmp(produced, reference))
            # Remove each output as soon as it has been verified.
            os.unlink(produced)

    # Same input with max_bases 6, now also passing 2 as the fourth
    # argument (the per-file sequence limit); three files are expected.
    tasks.split_by_base_count(infile, outprefix, 6, 2)
    limit_reference_stem = os.path.join(data_dir, 'sequences_test_split_test.fa.6.limit2.')
    for file_number in range(1, 4):
        produced = outprefix + '.' + str(file_number)
        self.assertTrue(filecmp.cmp(produced, limit_reference_stem + str(file_number)))
        os.unlink(produced)

    # check big sequence not broken
    long_infile = os.path.join(data_dir, 'sequences_test_split_test.long.fa')
    tasks.split_by_base_count(long_infile, outprefix, 2)
    long_pairs = [
        (outprefix + '.1', os.path.join(data_dir, 'sequences_test_split_test.long.fa.2.1')),
        (outprefix + '.2', os.path.join(data_dir, 'sequences_test_split_test.long.fa.2.2')),
    ]
    # Verify both outputs before deleting either one.
    for produced, reference in long_pairs:
        self.assertTrue(filecmp.cmp(produced, reference))
    for produced, _ in long_pairs:
        os.unlink(produced)