def run(description):
    '''Command-line entry point for ``fastaq split_by_base_count``.

    Builds the argument parser, parses the command line (argparse reads
    ``sys.argv``) and forwards the parsed values, in order, to
    ``tasks.split_by_base_count``.

    description: not used by this function; the parser description is the
        hard-coded text below.
    '''
    parser = argparse.ArgumentParser(
        description='Splits a multi sequence file into separate files. Does not split sequences. Puts up to max_bases into each split file. The exception is that any sequence longer than max_bases is put into its own file.',
        usage='fastaq split_by_base_count [options] <infile> <outprefix> <max_bases>',
    )
    parser.add_argument('infile', help='Name of input file to be split')
    # outprefix is a prefix, not a complete file name: each split file gets a
    # numeric suffix appended, so the help text must say so.
    parser.add_argument(
        'outprefix',
        help='Prefix of output files. Split files are called outprefix.1, outprefix.2, etc',
    )
    parser.add_argument(
        'max_bases',
        type=int,
        help='Max bases in each output split file',
        metavar='max_bases',
    )
    # Optional: when omitted, argparse leaves this as None and None is what
    # gets passed on as the fourth argument.
    parser.add_argument(
        '--max_seqs',
        type=int,
        help='Max number of sequences in each output split file [no limit]',
        metavar='INT',
    )

    options = parser.parse_args()
    tasks.split_by_base_count(
        options.infile,
        options.outprefix,
        options.max_bases,
        options.max_seqs,
    )
Example #2
0
    def test_split_by_base_count(self):
        '''Check that fasta/q files get split by base count correctly'''
        infile = os.path.join(data_dir, 'sequences_test_split_test.fa')
        outprefix = 'tmp.sequences_test_split_test.fa.test'

        # Maps each max_bases value to the suffixes of the output files that
        # are checked for it. The reference file for each output is
        # <infile>.<max_bases>.<suffix>.
        expected_suffixes = {
            2: ['1', '2', '3', '4'],
            3: ['1', '2', '3'],
            4: ['1', '2', '3'],
            6: ['1', '2'],
        }
        for max_bases, suffixes in expected_suffixes.items():
            tasks.split_by_base_count(infile, outprefix, max_bases)
            for suffix in suffixes:
                fname = outprefix + '.' + suffix
                expected = infile + '.' + str(max_bases) + '.' + suffix
                self.assertTrue(filecmp.cmp(fname, expected))
                os.unlink(fname)

        # check that limiting the number of sequences per file works: the
        # fourth argument (2) is the max_seqs cap, and with it three output
        # files are compared instead of the two checked above for max_bases=6
        tasks.split_by_base_count(infile, outprefix, 6, 2)
        limited_prefix = os.path.join(data_dir, 'sequences_test_split_test.fa.6.limit2.')
        for i in range(1, 4):
            test_file = outprefix + '.' + str(i)
            self.assertTrue(filecmp.cmp(test_file, limited_prefix + str(i)))
            os.unlink(test_file)

        # check big sequence not broken
        long_infile = os.path.join(data_dir, 'sequences_test_split_test.long.fa')
        tasks.split_by_base_count(long_infile, outprefix, 2)
        long_suffixes = ('1', '2')
        # Compare both outputs before removing either, as a failed comparison
        # should leave the files in place for inspection.
        for suffix in long_suffixes:
            self.assertTrue(filecmp.cmp(outprefix + '.' + suffix, long_infile + '.2.' + suffix))
        for suffix in long_suffixes:
            os.unlink(outprefix + '.' + suffix)
Example #3
0
    def test_split_by_base_count(self):
        '''Check that fasta/q files get split by base count correctly'''
        infile = os.path.join(data_dir, 'sequences_test_split_test.fa')
        outprefix = 'tmp.sequences_test_split_test.fa.test'

        # Maps each max_bases value to the suffixes of the output files that
        # are checked for it. The reference file for each output is
        # <infile>.<max_bases>.<suffix>.
        expected_suffixes = {
            2: ['1', '2', '3', '4'],
            3: ['1', '2', '3'],
            4: ['1', '2', '3'],
            6: ['1', '2'],
        }
        for max_bases, suffixes in expected_suffixes.items():
            tasks.split_by_base_count(infile, outprefix, max_bases)
            for suffix in suffixes:
                fname = outprefix + '.' + suffix
                expected = infile + '.' + str(max_bases) + '.' + suffix
                self.assertTrue(filecmp.cmp(fname, expected))
                os.unlink(fname)

        # check that limiting the number of sequences per file works: the
        # fourth argument (2) is the max_seqs cap, and with it three output
        # files are compared instead of the two checked above for max_bases=6
        tasks.split_by_base_count(infile, outprefix, 6, 2)
        limited_prefix = os.path.join(data_dir, 'sequences_test_split_test.fa.6.limit2.')
        for i in range(1, 4):
            test_file = outprefix + '.' + str(i)
            self.assertTrue(filecmp.cmp(test_file, limited_prefix + str(i)))
            os.unlink(test_file)

        # check big sequence not broken
        long_infile = os.path.join(data_dir, 'sequences_test_split_test.long.fa')
        tasks.split_by_base_count(long_infile, outprefix, 2)
        long_suffixes = ('1', '2')
        # Compare both outputs before removing either, as a failed comparison
        # should leave the files in place for inspection.
        for suffix in long_suffixes:
            self.assertTrue(filecmp.cmp(outprefix + '.' + suffix, long_infile + '.2.' + suffix))
        for suffix in long_suffixes:
            os.unlink(outprefix + '.' + suffix)