def run(description):
    '''Command-line entry point for 'fastaq chunker'.

    Parses the command line (argparse reads sys.argv) and hands the
    positional arguments plus the --skip_all_Ns flag to the matching
    splitter in tasks: split_by_fixed_size_onefile when --onefile is
    given, split_by_fixed_size otherwise.

    The description argument is accepted but not used here; the help
    text below is hard-coded.
    '''
    help_text = (
        'Splits a multi sequence file into separate files. '
        'Splits sequences into chunks of a fixed size. '
        'Aims for chunk_size chunks in each file, but allows a little extra, '
        'so chunk can be up to (chunk_size + tolerance), '
        'to prevent tiny chunks made from the ends of sequences'
    )
    usage_text = 'fastaq chunker [options] <infile> <out> <chunk size> <tolerance>'
    parser = argparse.ArgumentParser(description=help_text, usage=usage_text)

    # Positional arguments: registration order defines the command-line order.
    parser.add_argument('infile', help='Name of input file to be split')
    parser.add_argument(
        'out',
        help='Prefix of output file. If --onefile used, then name of single output file',
    )
    parser.add_argument('chunk_size', type=int, help='Size of each chunk')
    parser.add_argument('tolerance', type=int, help='Tolerance allowed in chunk size')

    # Optional switches.
    parser.add_argument(
        '--onefile',
        action='store_true',
        help='Output all the sequences in one file',
    )
    parser.add_argument(
        '--skip_all_Ns',
        action='store_true',
        help='Do not output any sequence that consists of all Ns',
    )

    options = parser.parse_args()

    # Both splitters are called with the same arguments, so pick the
    # function first and call it once.
    splitter = (
        tasks.split_by_fixed_size_onefile
        if options.onefile
        else tasks.split_by_fixed_size
    )
    splitter(
        options.infile,
        options.out,
        options.chunk_size,
        options.tolerance,
        skip_if_all_Ns=options.skip_all_Ns,
    )
def test_split_by_fixed_size_exclude_Ns(self):
    '''Test fasta/q file split by fixed size with skip_if_all_Ns=True'''
    infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
    outprefix = 'tmp.sequences_test_split'
    expected_stem = 'sequences_test_split_fixed_size.fa.split.skip_if_all_Ns.'

    # Outputs checked by this test: chunk files 1..4 plus the coords file.
    suffixes = [str(i) for i in range(1, 5)] + ['coords']
    outputs = [outprefix + '.' + suffix for suffix in suffixes]

    try:
        tasks.split_by_fixed_size(infile, outprefix, 4, 1, skip_if_all_Ns=True)
        for suffix, test in zip(suffixes, outputs):
            correct = os.path.join(data_dir, expected_stem + suffix)
            # shallow=False forces a byte-by-byte comparison; the default
            # may report equality based on os.stat() signatures alone.
            self.assertTrue(filecmp.cmp(test, correct, shallow=False))
    finally:
        # Always remove the temporary outputs, even when an assertion fails,
        # so a failing run does not leave files behind in the working
        # directory. Only unlink files that exist so that cleanup never
        # masks the original failure.
        for path in outputs:
            if os.path.exists(path):
                os.unlink(path)
def test_split_by_fixed_size(self):
    '''Test fasta/q file split by fixed size'''
    infile = os.path.join(data_dir, 'sequences_test_split_fixed_size.fa')
    outprefix = 'tmp.sequences_test_split'
    expected_stem = 'sequences_test_split_fixed_size.fa.split.'

    # Outputs checked by this test: chunk files 1..6 plus the coords file.
    suffixes = [str(i) for i in range(1, 7)] + ['coords']
    outputs = [outprefix + '.' + suffix for suffix in suffixes]

    try:
        tasks.split_by_fixed_size(infile, outprefix, 4, 1)
        for suffix, test in zip(suffixes, outputs):
            correct = os.path.join(data_dir, expected_stem + suffix)
            # shallow=False forces a byte-by-byte comparison; the default
            # may report equality based on os.stat() signatures alone.
            self.assertTrue(filecmp.cmp(test, correct, shallow=False))
    finally:
        # Always remove the temporary outputs, even when an assertion fails,
        # so a failing run does not leave files behind in the working
        # directory. Only unlink files that exist so that cleanup never
        # masks the original failure.
        for path in outputs:
            if os.path.exists(path):
                os.unlink(path)
def run(description):
    '''Parse the 'fastaq chunker' command line and run the requested split.

    Arguments are read from sys.argv by argparse. With --onefile the work
    is done by tasks.split_by_fixed_size_onefile; without it, by
    tasks.split_by_fixed_size. Both receive infile, out, chunk_size,
    tolerance and skip_if_all_Ns (taken from --skip_all_Ns).

    Note: the description parameter is not referenced in this function.
    '''
    parser = argparse.ArgumentParser(
        description='Splits a multi sequence file into separate files. Splits sequences into chunks of a fixed size. Aims for chunk_size chunks in each file, but allows a little extra, so chunk can be up to (chunk_size + tolerance), to prevent tiny chunks made from the ends of sequences',
        usage='fastaq chunker [options] <infile> <out> <chunk size> <tolerance>',
    )

    # (name, keyword settings) pairs, registered in this exact order so the
    # positional order on the command line is infile, out, chunk_size,
    # tolerance.
    argument_table = (
        ('infile', {'help': 'Name of input file to be split'}),
        ('out', {'help': 'Prefix of output file. If --onefile used, then name of single output file'}),
        ('chunk_size', {'type': int, 'help': 'Size of each chunk'}),
        ('tolerance', {'type': int, 'help': 'Tolerance allowed in chunk size'}),
        ('--onefile', {'action': 'store_true', 'help': 'Output all the sequences in one file'}),
        ('--skip_all_Ns', {'action': 'store_true', 'help': 'Do not output any sequence that consists of all Ns'}),
    )
    for arg_name, settings in argument_table:
        parser.add_argument(arg_name, **settings)

    options = parser.parse_args()
    positional = (
        options.infile,
        options.out,
        options.chunk_size,
        options.tolerance,
    )

    if options.onefile:
        tasks.split_by_fixed_size_onefile(
            *positional, skip_if_all_Ns=options.skip_all_Ns
        )
        return

    tasks.split_by_fixed_size(*positional, skip_if_all_Ns=options.skip_all_Ns)