Example #1
    def __init__(self,
                 bams,
                 sample_ids,
                 jar,
                 iter_per_percentage,
                 min_reads,
                 max_reads,
                 step_size,
                 reads_multiplier,
                 random_seed_base,
                 out_dir,
                 name,
                 out_sh=None,
                 submit=True,
                 queue_type='PBS'):
        """Any CamelCase here is directly copied from the STAR inputs for
        complete compatibility
        """
        # Make the directory if it's not there already
        try:
            os.mkdir(out_dir)
        except OSError:
            pass

        downsample_command = 'java -jar {}'.format(jar)

        commands = []
        for bam, sample_id in zip(bams, sample_ids):
            flagstat = 'samtools flagstat {}'.format(bam)
            p = subprocess.Popen(shlex.split(flagstat),
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()
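            # Line 5 (index 4) of older `samtools flagstat` output is the
            # "read1" count; newer samtools versions insert extra lines
            # (secondary/supplementary), which would shift this index.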
            n_read1 = float(out.split('\n')[4].split()[0]) / reads_multiplier

            vmin = min_reads
            vmax = n_read1 if max_reads is None else max_reads

            for reads in np.arange(vmin, vmax, step_size):
                downsample_prob = reads / n_read1
                for i in range(iter_per_percentage):
                    out_bam = '{}_{:.1e}reads_iter{}.bam'.format(
                        sample_id, reads, i)
                    random_seed = random_seed_base + i
                    commands.append(
                        '{} INPUT={} OUTPUT={} RANDOM_SEED={} PROBABILITY={} CREATE_INDEX=true'
                        .format(downsample_command, bam, out_bam, random_seed,
                                downsample_prob))

        sub = Submitter(sh_filename=out_sh,
                        queue_type=queue_type,
                        commands=commands,
                        job_name=name,
                        walltime='1:00:00',
                        nodes=1,
                        ppn=1,
                        queue='home',
                        array=True,
                        max_running=20)
        sub.write_sh(submit=submit)
    def __init__(self, job_name, out_sh, directory='./', queue_type='PBS',
                 submit=False, downsampled=False):
        """
        Given a base folder which has a directory called "miso" where all the
        miso output is, search for bad events in the subfolders and then
        write them to a "nan_events.txt" file for that sample and event type.

        Parameters
        ----------
        job_name : str
            Name of the array job to be submitted
        out_sh : str
            Filename to write all the submitter commands to
        directory : str
            Base directory, which has a "miso" directory there. This assumes
            the following directory structure:
            <directory>/miso/<sample_id>/<event_type>
            Where "<directory>" is the location specified through this
            variable. If you ran your MISO samples using the Yeo Lab
            pipeline, you're fine.

        """
        downsampled = '--downsampled' if downsampled else ''
        commands = ['python concatenate_miso.py --directory {} {}'.format(
            directory, downsampled)]
        sub = Submitter(queue_type=queue_type, job_name=job_name,
                        sh_filename=out_sh,
                        commands=commands,
                        nodes=1, ppn=1, queue='home',
                        array=False,
                        max_running=20, )
        sub.write_sh(submit=submit)
Example #3
    def __init__(self, fasta, kmer_size, job_name='sailfish_index',
                 num_processors=8,
                 out_sh=None, out_dir=None, submit=False):
        if num_processors > 16:
            raise ValueError('At most 16 processors can be specified, '
                             'but you '
                             'asked for {}'.format(num_processors))
        if kmer_size > 31:
            raise ValueError('Maximum kmer size is 31 due to memory '
                             'limitations but "{}" was specified'.format(
                kmer_size))

        if out_dir is None:
            out_dir = '{}_sailfish_index_k{}'.format(fasta,
                                                     kmer_size)

        if out_sh is None:
            out_sh = job_name + '.sh'

        command = 'sailfish index --transcripts {0} --out {1} --kmerSize {2} ' \
                  '--threads {3}'.format(fasta,
                                         out_dir,
                                         kmer_size,
                                         num_processors)

        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=[command], job_name=job_name,
                        nodes=1, ppn=num_processors,
                        queue='home',
                        walltime='0:30:00')
        sub.write_sh(submit=submit)
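For reference, a minimal sketch of the command string this constructor builds, using made-up inputs (`transcripts.fa`, k=25); the file name and parameters below are illustrative, not from the original source:

# Illustrative only: hypothetical inputs, same format string as above.
fasta, kmer_size, num_processors = 'transcripts.fa', 25, 8
out_dir = '{}_sailfish_index_k{}'.format(fasta, kmer_size)
command = 'sailfish index --transcripts {0} --out {1} --kmerSize {2} ' \
          '--threads {3}'.format(fasta, out_dir, kmer_size, num_processors)
# command == 'sailfish index --transcripts transcripts.fa '
#            '--out transcripts.fa_sailfish_index_k25 --kmerSize 25 --threads 8'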
Example #4
    def __init__(self, bams, sample_ids, jar, iter_per_percentage,
                 min_reads, max_reads, step_size, reads_multiplier,
                 random_seed_base, out_dir, name, out_sh=None, submit=True,
                 queue_type='PBS'):
        """Any CamelCase here is directly copied from the STAR inputs for
        complete compatibility
        """
        # Make the directory if it's not there already
        try:
            os.mkdir(out_dir)
        except OSError:
            pass

        downsample_command = 'java -jar {}'.format(jar)

        commands = []
        for bam, sample_id in zip(bams, sample_ids):
            flagstat = 'samtools flagstat {}'.format(bam)
            p = subprocess.Popen(shlex.split(flagstat),
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()
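            # Line 5 (index 4) of older `samtools flagstat` output is the
            # "read1" count; newer samtools versions insert extra lines
            # (secondary/supplementary), which would shift this index.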
            n_read1 = float(out.split('\n')[4].split()[0]) / reads_multiplier

            vmin = min_reads
            vmax = n_read1 if max_reads is None else max_reads

            for reads in np.arange(vmin, vmax, step_size):
                downsample_prob = reads / n_read1
                for i in range(iter_per_percentage):
                    out_bam = '{}_{:.1e}reads_iter{}.bam'.format(sample_id,
                                                                 reads, i)
                    random_seed = random_seed_base + i
                    commands.append(
                        '{} INPUT={} OUTPUT={} RANDOM_SEED={} PROBABILITY={} CREATE_INDEX=true'.format(
                            downsample_command,
                            bam,
                            out_bam,
                            random_seed,
                            downsample_prob))

        sub = Submitter(sh_filename=out_sh, queue_type=queue_type,
                        commands=commands, job_name=name,
                        walltime='1:00:00', nodes=1, ppn=1, queue='home',
                        array=True,
                        max_running=20)
        sub.write_sh(submit=submit)
    def __init__(self,
                 job_name,
                 out_sh,
                 directory='./',
                 queue_type='PBS',
                 submit=False):
        """
        Given a base folder which has a directory called "miso" where all the
        miso output is, search for bad events in the subfolders and then
        write them to a "nan_events.txt" file for that sample and event type.

        Parameters
        ----------
        job_name : str
            Name of the array job to be submitted
        out_sh : str
            Filename to write all the submitter commands to
        directory : str
            Base directory, which has a "miso" directory there. This assumes
            the following directory structure:
            <directory>/miso/<sample_id>/<event_type>
            Where "<directory>" is the location specified through this
            variable. If you ran your MISO samples using the Yeo Lab
            pipeline, you're fine.

        """
        commands = []

        glob_command = '{}/miso/*/*'.format(directory.rstrip('/'))
        for folder in iglob(glob_command):
            command = 'cd %s ; grep -m 1 nan chr*/*.miso > nan_events.txt' % \
                      folder
            commands.append(command)

        sub = Submitter(
            queue_type=queue_type,
            job_name=job_name,
            sh_filename=out_sh,
            commands=commands,
            nodes=1,
            ppn=1,
            queue='home',
            array=True,
            max_running=20,
        )
        sub.write_sh(submit=submit)
Example #6
    def __init__(self,
                 job_name,
                 out_sh,
                 directory='./',
                 queue_type='PBS',
                 submit=False,
                 downsampled=False):
        """
        Given a base folder which has a directory called "miso" where all the
        miso output is, search for bad events in the subfolders and then
        write them to a "nan_events.txt" file for that sample and event type.

        Parameters
        ----------
        job_name : str
            Name of the array job to be submitted
        out_sh : str
            Filename to write all the submitter commands to
        directory : str
            Base directory, which has a "miso" directory there. This assumes
            the following directory structure:
            <directory>/miso/<sample_id>/<event_type>
            Where "<directory>" is the location specified through this
            variable. If you ran your MISO samples using the Yeo Lab
            pipeline, you're fine.

        """
        downsampled = '--downsampled' if downsampled else ''
        commands = [
            'python concatenate_miso.py --directory {} {}'.format(
                directory, downsampled)
        ]
        sub = Submitter(
            queue_type=queue_type,
            job_name=job_name,
            sh_filename=out_sh,
            commands=commands,
            nodes=1,
            ppn=1,
            queue='home',
            array=False,
            max_running=20,
        )
        sub.write_sh(submit=submit)
    def test_pbs(self):
        """Test PBS queue (TSCC)
        """
        job_name = 'test_qtools_submitter_pbs'
        submit_sh = '{}/{}.sh'.format(self.out_dir, job_name)
        sub = Submitter(queue_type='PBS',
                        sh_filename=submit_sh,
                        commands=self.commands,
                        job_name=job_name,
                        nodes=1,
                        ppn=1,
                        queue='home-yeo',
                        walltime='0:01:00')
        job_id = sub.job(submit=False)
        true_result_string = '''#!/bin/bash
#PBS -N test_qtools_submitter_pbs
#PBS -o {0}/test_qtools_submitter_pbs.sh.out
#PBS -e {0}/test_qtools_submitter_pbs.sh.err
#PBS -V
#PBS -l walltime=0:01:00
#PBS -l nodes=1:ppn=1
#PBS -A yeo-group
#PBS -q home-yeo

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR
date
echo testing
'''.format(self.out_dir)
        true_result = true_result_string.split('\n')

        # with open(submit_sh) as f:
        #     for x in f.readlines():
        #         print x,

        for true, test in zip(true_result, open(submit_sh)):
            self.assertEqual(true.strip().split(), test.strip().split())

        # Make sure the job ID is a single (potentially multi-digit) integer
        # But only do this if we're on TSCC or oolite
        if ON_SERVER:
            self.assertRegexpMatches(job_id, '^\d+$')
            subprocess.Popen(["qdel", job_id], stdout=PIPE)
    def test_wait_for_array_pbs(self):
        commands = ['date', 'echo testing PBS']
        job_name = 'test_qtools_submitter_wait_for_pbs'
        submit_sh = '%s/%s.sh' % (tests.get_test_dir(), job_name)
        sub = Submitter(queue_type='PBS',
                        sh_file=submit_sh,
                        command_list=commands,
                        job_name=job_name,
                        wait_for_array=['11111'])
        job_id = sub.write_sh(submit=True,
                              nodes=1,
                              ppn=16,
                              queue='home-yeo',
                              walltime='0:01:00')
        true_result_string = '''#!/bin/sh
#PBS -N test_qtools_submitter_wait_for_pbs
#PBS -o %s/test_qtools_submitter_wait_for_pbs.sh.out
#PBS -e %s/test_qtools_submitter_wait_for_pbs.sh.err
#PBS -V
#PBS -l walltime=0:01:00
#PBS -l nodes=1:ppn=16
#PBS -A yeo-group
#PBS -q home-yeo
#PBS -W depend=afterokarray:11111

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR
date
echo testing PBS
''' % (tests.get_test_dir(), tests.get_test_dir())
        true_result = true_result_string.split('\n')

        # with open(submit_sh) as f:
        #     for x in f.readlines():
        #         print x,

        for true, test in zip(true_result, open(submit_sh)):
            self.assertEqual(true.strip().split(), test.strip().split())

        # Make sure the job ID is a single (potentially multi-digit) integer
        self.assertRegexpMatches(job_id, '^\d+$')
        subprocess.Popen(["qdel", job_id], stdout=PIPE)
Example #9
    def __init__(self, read1, read2, out_dir,
                 index, stranded=False,
                 not_gzipped=False,
                 job_name='sailfish_quant',
                 num_processors=8,
                 out_sh=None, submit=False, queue_name='home'):
        paired_end = read2 is not None
        library_parameters = ['TYPE=PE', 'ORIENTATION=><'] if paired_end \
            else ['TYPE=SE']
        if stranded:
            if paired_end:
                strand = 'STRAND=AS'
            else:
                strand = 'STRAND=A'
        else:
            strand = 'STRAND=U'

        library_parameters.append(strand)
        library_string = ':'.join(library_parameters)
        if not_gzipped:
            read_template = r'{}'
        else:
            read_template = r'<(gunzip -c {})'

        if read2 is not None:
            read1 = '-1 {}'.format(read_template.format(read1))
            read2 = '-2 {}'.format(read_template.format(read2))
            reads = '{} {}'.format(read1, read2)

        else:
            reads = '-r {}'.format(read_template.format(read1))

        command = 'sailfish quant --index {0} -l "{1}" {2} --out {3} --threads ' \
                  '{4}'.format(index, library_string, reads, out_dir,
                               num_processors)

        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=[command], job_name=job_name,
                        nodes=1, ppn=num_processors,
                        queue=queue_name,
                        walltime='0:30:00')
        sub.write_sh(submit=submit)
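As a rough illustration of what the quant command above expands to for a gzipped, stranded, paired-end library (all names below are hypothetical, not from the original source):

# Illustrative only: hypothetical gzipped paired-end stranded inputs.
index, out_dir, num_processors = 'sailfish_index_k25', 'quant_out', 8
library_string = ':'.join(['TYPE=PE', 'ORIENTATION=><', 'STRAND=AS'])
reads = '-1 <(gunzip -c {}) -2 <(gunzip -c {})'.format('r1.fastq.gz',
                                                       'r2.fastq.gz')
command = 'sailfish quant --index {0} -l "{1}" {2} --out {3} --threads ' \
          '{4}'.format(index, library_string, reads, out_dir, num_processors)
# command == 'sailfish quant --index sailfish_index_k25 '
#            '-l "TYPE=PE:ORIENTATION=><:STRAND=AS" '
#            '-1 <(gunzip -c r1.fastq.gz) -2 <(gunzip -c r2.fastq.gz) '
#            '--out quant_out --threads 8'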
    def test_pbs(self):
        """Test PBS queue (TSCC)
        """
        job_name = 'test_qtools_submitter_pbs'
        submit_sh = '{}/{}.sh'.format(self.out_dir, job_name)
        sub = Submitter(queue_type='PBS', sh_filename=submit_sh,
                        commands=self.commands,
                        job_name=job_name, nodes=1, ppn=1,
                        queue='home-yeo', walltime='0:01:00'
        )
        job_id = sub.job(submit=False)
        true_result_string = '''#!/bin/bash
#PBS -N test_qtools_submitter_pbs
#PBS -o {0}/test_qtools_submitter_pbs.sh.out
#PBS -e {0}/test_qtools_submitter_pbs.sh.err
#PBS -V
#PBS -l walltime=0:01:00
#PBS -l nodes=1:ppn=1
#PBS -A yeo-group
#PBS -q home-yeo

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR
date
echo testing
'''.format(self.out_dir)
        true_result = true_result_string.split('\n')

        # with open(submit_sh) as f:
        #     for x in f.readlines():
        #         print x,

        for true, test in zip(true_result, open(submit_sh)):
            self.assertEqual(true.strip().split(), test.strip().split())

        # Make sure the job ID is a single (potentially multi-digit) integer
        # But only do this if we're on TSCC or oolite
        if ON_SERVER:
            self.assertRegexpMatches(job_id, '^\d+$')
            subprocess.Popen(["qdel", job_id],
                             stdout=PIPE)
    def __init__(self, genomeDir, genomeFastaFiles, sjdb,
                 sjdbOverhang, job_name, out_sh=None, submit=True):
        """Any CamelCase here is directly copied from the STAR inputs for
        complete compatibility
        """
        # Make the directory if it's not there already
        try:
            os.mkdir(genomeDir)
        except OSError:
            pass

        commands = []
        commands.append('STAR --runMode genomeGenerate --genomeDir {0} '
                        '--genomeFastaFiles {1} --runThreadN 16 {2} '
                        '--sjdbOverhang {3}'.format(
            genomeDir, genomeFastaFiles, sjdb, sjdbOverhang))

        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=commands,
                        job_name=job_name, nodes=1, ppn=16, queue='home',
                        walltime='4:00:00')
        sub.job(submit=submit)
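A sketch of the STAR command this builds, with hypothetical inputs; note that `sjdb` is passed through as a pre-formed flag string, so the `--sjdbGTFfile genes.gtf` below is only an assumption about how callers fill it:

# Illustrative only: hypothetical genome directory, FASTA, and sjdb argument.
genomeDir, genomeFastaFiles = 'star_index', 'genome.fa'
sjdb, sjdbOverhang = '--sjdbGTFfile genes.gtf', 99
command = ('STAR --runMode genomeGenerate --genomeDir {0} '
           '--genomeFastaFiles {1} --runThreadN 16 {2} '
           '--sjdbOverhang {3}'.format(genomeDir, genomeFastaFiles, sjdb,
                                       sjdbOverhang))
# command == 'STAR --runMode genomeGenerate --genomeDir star_index '
#            '--genomeFastaFiles genome.fa --runThreadN 16 '
#            '--sjdbGTFfile genes.gtf --sjdbOverhang 99'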
    def test_sge(self):
        """Test SGE queue (oolite)
        """
        job_name = 'test_qtools_submitter_sge'
        submit_sh = '{}/{}.sh'.format(self.out_dir, job_name)
        sub = Submitter(queue_type='SGE',
                        sh_filename=submit_sh,
                        commands=self.commands,
                        job_name=job_name,
                        nodes=1,
                        ppn=1,
                        queue='home-yeo',
                        walltime='0:01:00')
        job_id = sub.job(submit=False)
        true_result_string = '''#!/bin/bash
#$ -N test_qtools_submitter_sge
#$ -o {0}/test_qtools_submitter_sge.sh.out
#$ -e {0}/test_qtools_submitter_sge.sh.err
#$ -V
#$ -S /bin/bash
#$ -cwd
#$ -l bigmem
#$ -l h_vmem=16G
date
echo testing
'''.format(self.out_dir)
        true_result = true_result_string.split('\n')

        # with open(submit_sh) as f:
        #     for x in f.readlines():
        #         print x,
        for true, test in zip(true_result, open(submit_sh)):
            self.assertEqual(true.strip().split(), test.strip().split())

        # Make sure the job ID is a single (potentially multi-digit) integer
        # But only do this if we're on TSCC or oolite
        if ON_SERVER:
            self.assertRegexpMatches(job_id, '^\d+$')
            subprocess.Popen(["qdel", job_id], stdout=PIPE)
    def __init__(self, job_name, out_sh, directory='./', queue_type='PBS',
                 submit=False):
        """
        Given a base folder which has a directory called "miso" where all the
        miso output is, search for bad events in the subfolders and then
        write them to a "nan_events.txt" file for that sample and event type.

        Parameters
        ----------
        job_name : str
            Name of the array job to be submitted
        out_sh : str
            Filename to write all the submitter commands to
        directory : str
            Base directory, which has a "miso" directory there. This assumes
            the following directory structure:
            <directory>/miso/<sample_id>/<event_type>
            Where "<directory>" is the location specified through this
            variable. If you ran your MISO samples using the Yeo Lab
            pipeline, you're fine.

        """
        commands = []

        glob_command = '{}/miso/*/*'.format(directory.rstrip('/'))
        for folder in iglob(glob_command):
            command = 'cd %s ; grep -m 1 nan chr*/*.miso > nan_events.txt' % \
                      folder
            commands.append(command)

        sub = Submitter(queue_type=queue_type, job_name=job_name,
                        sh_filename=out_sh,
                        commands=commands,
                        nodes=1, ppn=1, queue='home',
                        array=True,
                        max_running=20,
        )
        sub.write_sh(submit=submit)
    def test_wait_for_array_pbs(self):
        commands = ['date', 'echo testing PBS']
        job_name = 'test_qtools_submitter_wait_for_pbs'
        submit_sh = '%s/%s.sh' % (tests.get_test_dir(), job_name)
        sub = Submitter(queue_type='PBS', sh_file=submit_sh,
                        command_list=commands,
                        job_name=job_name, wait_for_array=['11111'])
        job_id = sub.write_sh(submit=True, nodes=1, ppn=16,
                              queue='home-yeo', walltime='0:01:00')
        true_result_string = '''#!/bin/sh
#PBS -N test_qtools_submitter_wait_for_pbs
#PBS -o %s/test_qtools_submitter_wait_for_pbs.sh.out
#PBS -e %s/test_qtools_submitter_wait_for_pbs.sh.err
#PBS -V
#PBS -l walltime=0:01:00
#PBS -l nodes=1:ppn=16
#PBS -A yeo-group
#PBS -q home-yeo
#PBS -W depend=afterokarray:11111

# Go to the directory from which the script was called
cd $PBS_O_WORKDIR
date
echo testing PBS
''' % (tests.get_test_dir(), tests.get_test_dir())
        true_result = true_result_string.split('\n')

        # with open(submit_sh) as f:
        #     for x in f.readlines():
        #         print x,

        for true, test in zip(true_result, open(submit_sh)):
            self.assertEqual(true.strip().split(), test.strip().split())

        # Make sure the job ID is a single (potentially multi-digit) integer
        self.assertRegexpMatches(job_id, '^\d+$')
        subprocess.Popen(["qdel", job_id],
                                 stdout=PIPE)
    def test_sge(self):
        """Test SGE queue (oolite)
        """
        job_name = 'test_qtools_submitter_sge'
        submit_sh = '{}/{}.sh'.format(self.out_dir, job_name)
        sub = Submitter(queue_type='SGE', sh_filename=submit_sh,
                        commands=self.commands,
                        job_name=job_name, nodes=1, ppn=1,
                        queue='home-yeo', walltime='0:01:00'
        )
        job_id = sub.job(submit=False)
        true_result_string = '''#!/bin/bash
#$ -N test_qtools_submitter_sge
#$ -o {0}/test_qtools_submitter_sge.sh.out
#$ -e {0}/test_qtools_submitter_sge.sh.err
#$ -V
#$ -S /bin/bash
#$ -cwd
#$ -l bigmem
#$ -l h_vmem=16G
date
echo testing
'''.format(self.out_dir)
        true_result = true_result_string.split('\n')

        # with open(submit_sh) as f:
        #     for x in f.readlines():
        #         print x,
        for true, test in zip(true_result, open(submit_sh)):
            self.assertEqual(true.strip().split(), test.strip().split())

        # Make sure the job ID is a single (potentially multi-digit) integer
        # But only do this if we're on TSCC or oolite
        if ON_SERVER:
            self.assertRegexpMatches(job_id, '^\d+$')
            subprocess.Popen(["qdel", job_id],
                             stdout=PIPE)
    def __init__(self, bam, sample_info_file,
                 sample_id, output_sh,
                 genome, walltime,
                 submit=False):
        """
        Parameters
        ----------


        Returns
        -------


        Raises
        ------
        """
        self.sample_info_file = sample_info_file

        if self.sample_info_file is not None:
            sample_info = pd.read_table(self.sample_info_file, header=None)
            self.bams = sample_info[0]
            self.sample_ids = sample_info[1]
            self.sh_files = ['{}.miso.sh'.format(bam) for bam in self.bams]
            self.multiple_samples = True
        else:
            self.sample_ids = [sample_id]
            self.bams = [bam]
            self.sh_files = [output_sh]
            self.multiple_samples = False

        self.genome = genome
        self.walltime = walltime
        self.submit = submit

        all_samples_commands = []

        for bam, sample_id, sh_file in zip(self.bams, self.sample_ids,
                                           self.sh_files):
            self._write_single_sample(bam, sample_id, sh_file)

            sh_command = 'bash {}'.format(sh_file)
            if self.submit and not self.multiple_samples:
                commands = [sh_command]
                sub = Submitter(commands, job_name='miso',
                                sh_filename='{}.qsub.sh'.format(sh_file),
                                ppn=16, walltime=self.walltime)
                sub.job(submit=self.submit)

            if self.multiple_samples:
                all_samples_commands.append(sh_command)

        if self.multiple_samples:
            sub = Submitter(all_samples_commands, job_name='miso',
                            sh_filename='miso.qsub.sh',
                            array=True, ppn=16, walltime=self.walltime)
            sub.job(submit=self.submit)
Example #17
    def __init__(self, bam, sample_info_file,
                 sample_id, output_sh,
                 genome, walltime, nodes=1, ppn=16,
                 submit=False, read_length=None):
        """
        Parameters
        ----------


        Returns
        -------


        Raises
        ------
        """
        self.sample_info_file = sample_info_file

        if self.sample_info_file is not None:
            sample_info = pd.read_table(self.sample_info_file, header=None)
            self.bams = sample_info[0]
            self.sample_ids = sample_info[1]
            self.sh_files = ['{}.miso.sh'.format(bam) for bam in self.bams]
            self.multiple_samples = True
        else:
            self.sample_ids = [sample_id]
            self.bams = [bam]
            self.sh_files = [output_sh]
            self.multiple_samples = False

        self.genome = genome
        self.walltime = walltime
        self.submit = submit

        self.nodes = nodes
        self.ppn = ppn
        self.read_length = read_length

        all_samples_commands = []

        for bam, sample_id, sh_file in zip(self.bams, self.sample_ids,
                                           self.sh_files):
            self._write_single_sample(bam, sample_id, sh_file)

            sh_command = 'bash {}'.format(sh_file)
            if self.submit and not self.multiple_samples:
                commands = [sh_command]
                sub = Submitter(commands, job_name='miso',
                                sh_filename='{}.qsub.sh'.format(sh_file),
                                ppn=self.ppn, nodes=self.nodes,
                                walltime=self.walltime)
                sub.job(submit=self.submit)

            if self.multiple_samples:
                all_samples_commands.append(sh_command)

        if self.multiple_samples:
            sub = Submitter(all_samples_commands, job_name='miso',
                            sh_filename='miso.qsub.sh',
                            array=True,
                            ppn=self.ppn, nodes=self.nodes,
                            walltime=self.walltime)
            sub.job(submit=self.submit)
Example #18
    def summary(self):
        summary_commands = []

        job_name_base = '%s_summary' % (self.job_name_prefix)
        job_name = job_name_base
        submit_sh = '%s/%s.sh' \
            % (self.sh_scripts_dir, job_name_base)

        # all_submit_sh = []
        summary_commands = []

        for bam, sample_id, psi_output_dir, summary_output_dir in \
                zip(self.bams, self.sample_ids, self.psi_output_dirs,
                    self.summary_output_dirs):
            # Okay, now we are ready to write to the submitter script
            summary_commands.append('\n\n# --- %s --- #' % sample_id)

            # add a line of padding and the sample id to the output file
            summary_commands.append('\necho\necho "--- %s ----"' %
                                    sample_id)
            summary_commands.append('date')
            summary_command = 'python %s/run_miso.py --summarize-samples %s ' \
                              '%s >%s/summary.out 2>%s/summary.err' \
                              % (self.miso_scripts_dir, psi_output_dir,
                                 psi_output_dir, psi_output_dir,
                                 psi_output_dir)
            summary_commands.append(summary_command)

            summary_commands.append('# Copy over the summary files AFTERWARD '
                                    'to prevent'
                                    ' overloading the home directory')
            temp_summary_file = '%s/summary/%s.miso_summary' % (
                psi_output_dir, sample_id)
            final_summary_file = '%s/summary/%s.miso_summary' % (
                summary_output_dir, sample_id)
            summary_commands.append('mkdir -p %s/summary' % (
                summary_output_dir))
            summary_commands.append('cp %s %s' % (temp_summary_file,
                                                  final_summary_file))
        
            # Put the submitter script wherever the command was run from
    #        if self.submit_sh_suffix:

    #        else:
    #            job_name = 'miso_%s_summary' % self.event_type
    #         job_name = '%s_%s' % (sample_id, job_name_base)
            # submit_sh = '%s_%s.sh' \
            #             % (submit_sh_base, sample_id)
            # all_submit_sh.append('\n# --- %s --- #\nqsub %s\n' %
            #                      (sample_id, submit_sh))

            # if self.num_cores > 1:
        additional_resources = {'-t': '1-%d%%%d'
                                % (len(self.sample_ids), 8)}
        # else:
        #     additional_resources = None

        # if self.psi_job_id[sample_id] is not None:
        #     sub = Submitter(queue_type='PBS', sh_file=submit_sh,
        #                     command_list=summary_commands,
        #                     job_name=job_name,
        #                     wait_for=[self.psi_job_id[sample_id]],
        #                     # Tell the queue to parallelize this job
        #                          # into a job array
        #                     additional_resources=additional_resources)
        # else:
        sub = Submitter(queue_type='PBS', sh_file=submit_sh,
                        command_list=summary_commands, job_name=job_name,
                        # Tell the queue to parallelize this job
                        # into a job array
                        additional_resources=additional_resources,
                        wait_for_array=self.psi_job_id)

        self.summary_job_id = sub.write_sh(submit=True,
                                           nodes=self.num_cores,
                                           ppn=2,
                                           queue=self.queue,
                                           walltime=self.summary_walltime)

        print self.summary_job_id
        # Save all the qsub commands in one file
        # with open('%s.sh' % submit_sh_base, 'w') as f:
        #     # f.write('#!/bin/bash\n\n')
        #     f.writelines(all_submit_sh)
Example #19
    def psi(self):
        """
        Submit a job to the cluster to compute 'psi' (percent spliced-in)
        scores of the splicing events and bam files provided.
        """

        psi_name = '%s_psi' % self.job_name_prefix
        job_name = psi_name
        submit_sh_base = '%s/%s' % (self.sh_scripts_dir, psi_name)
        submit_sh = '%s.sh' % submit_sh_base

        # all_submit_sh = ['#!/bin/bash\n\n']

        psi_commands = []

        # Make a different submit file for each sample, because MISO doesn't
        # take THAT long on its own for one sample, and that way we won't get
        #  charged. Plus then we can track failures of individual samples
        for bam, sample_id, output_dir in zip(self.bams, self.sample_ids,
                                              self.psi_output_dirs):


            # Establish which files we're working with
            insert_len_file = bam + '.insert_len'
            # bam_dir = os.path.dirname(bam)
            # output_dir = '%s/miso/%s/%s' % (bam_dir, self.event_type,
            #                                 sample_id)

            insert_len_commands, insert_len_arguments = self\
                ._get_psi_insert_len_argument(sample_id, insert_len_file)

            # Okay, now we are ready to write to the submitter script
            psi_commands.append('\n\n# --- %s --- #' % sample_id)

            # Need to **extend** with a list, not append.
            psi_commands.extend(insert_len_commands)

            # add a line of padding and the sample id to the output file
            psi_commands.append('\necho\necho "--- %s ----"' % sample_id)
            psi_commands.append('date')


            # Get the read length. Gonna keep this as bash because samtools
            # and less are very fast
            read_len = '%s_READ_LEN' % sample_id
            psi_commands.append(
                '\n# Assuming that the first read of the bam file is '
                'representative, such that all the reads in the '
                '\n# file are exactly the same length, we can take the first '
                'read from the bam file and measure its length, '
                '\n# and use that for our algorithm')
            psi_commands.append(
                "%s=$(samtools view %s | head -n 1 | cut -f 10 | awk '{ print"
                " length }')" % (read_len, bam))

            # Finally we are ready to write the actual miso command!
            log_filename = 'psi'
            stderr = '%s/%s.err' % (output_dir, log_filename)
            stdout = '%s/%s.out' % (output_dir, log_filename)


            psi_command = 'python %s --run %s %s --output-dir %s ' \
                                  '--read-len $%s %s -p %d %s >' \
                                  ' %s 2> %s' \
                                  % (self.miso, self.event_type_index, bam,
                                     output_dir, read_len,
                                     insert_len_arguments, self.num_processes,
                                     self.extra_miso_arguments, stdout,
                                     stderr)
            psi_commands.append('date')
            psi_commands.append("echo Starting ...... '%s'"
                                    % psi_command)
            psi_commands.append(psi_command)

        # Put the submitter script wherever the command was run from
#        if self.submit_sh_suffix:

#        else:
#            psi_name = 'miso_%s_psi' % (self.event_type)


            # job_name = '%s_%s' % (sample_id, psi_name)

            # submit_sh = '%s_%s.sh' % (submit_sh_base, sample_id)
            # all_submit_sh.append('\n# --- %s --- #\nqsub %s\n' %
            #                          (sample_id, submit_sh))

        # if self.insert_len_job_id is not None:
        #     sub = Submitter(queue_type='PBS', sh_file=submit_sh,
        #                 command_list=psi_commands, job_name=job_name,
        #                 wait_for=[self.insert_len_job_id[sample_id]])
        # else:
        sub = Submitter(queue_type='PBS', sh_file=submit_sh,
                    command_list=psi_commands, job_name=job_name)
        # if self.num_cores == 1:
        #     self.psi_job_is_array = False
        #     self.psi_job_id[sample_id] = sub.write_sh(submit=True,
        #                                    nodes=self.num_cores,
        #                                    ppn=self.num_processes,
        #                                    queue=self.queue,
        #                                    walltime=self.psi_walltime)
        # else:
        self.psi_job_is_array = True
        self.psi_job_id = sub.write_sh(
            submit=True, nodes=self.num_cores, ppn=self.num_processes,
            queue=self.queue, walltime=self.psi_walltime,
            additional_resources={'-t': '1-%d%%%d'
                                        % (len(self.sample_ids), 8)})
        print self.psi_job_id
Example #20
    def insert_len(self):
        """
        For the provided .bam files, checks if there is an insert length file
        associated with it (....bam.insert_len), and if not, adds the command to
        compute its insert length to a list.

        Outputs the job ID of the insert_len script
        """
        # If we are treating these as single-ended reads, don't do anything
        if self.read_type == 'single_end':
            return


        constitutive_exons_dir = '%s/%s_constitutive' % (
            self.base_annotation_dir, self.event_type)

        # Bug: there may be more than one constitutive exons GFF in this
        # folder, and we only grab the first one
        constitutive_exons_gff = glob('%s/*.gff' % constitutive_exons_dir)[0]

        insert_len_name = '%s_insert_len%s' % (self.job_name_prefix,
                                               self.submit_sh_suffix)
        insert_len_sh_base = '%s/%s' % (self.sh_scripts_dir,
                                           insert_len_name)
        all_insert_len_sh = ['#!/bin/bash\n\n']

        for bam, sample_id in zip(self.bams, self.sample_ids):
            # Command-line commands to submit to the cluster
            insert_len_commands = []
            bam_dir = os.path.dirname(bam)
            insert_len_file = bam + '.insert_len'
            try:
                open(insert_len_file)
            except IOError:
                # There is no insert length file, so create it
                insert_len_command = 'python %s/pe_utils.py ' \
                                      '--compute-insert-len %s %s ' \
                                      ' --output-dir %s ' \
                                      '>%s.out 2>%s'\
                                      % (self.miso_scripts_dir, bam,
                                         constitutive_exons_gff, bam_dir,
                                         insert_len_file, insert_len_file)
                insert_len_commands.append('date')
                insert_len_commands.append("echo Starting ... '%s'" %
                                            insert_len_command)
                insert_len_commands.append(insert_len_command)

    #        if self.submit_sh_suffix:

    #        else:
    #            insert_len_name = 'miso_insert_len'

            insert_len_sh = '%s_%s.sh' % (insert_len_sh_base, sample_id)
            all_insert_len_sh.append('\n# --- %s --- #\nqsub %s\n' %
                                     (sample_id, insert_len_sh))

            sub = Submitter(queue_type='PBS', sh_file=insert_len_sh,
                            command_list=insert_len_commands,
                            job_name=insert_len_name)
            self.insert_len_job_id[sample_id] = sub.write_sh(
                submit=True,
                nodes=self.num_cores,
                ppn=self.num_processes,
                queue=self.queue,
                walltime='0:30:00')
Example #21
    print species
except:
    print "usage: submit_parse_oldsplice.py <species>"


assert species is not None and len(species) > 0

for filename in files:

    filenames.append(filename)
    sample = filename.replace(".splices", "").replace(".flip", "_flip")
    samples.append(sample)


from gscripts.qtools import Submitter

sub = Submitter()

cmd = "parse_oldsplice.py --species %s" %species

for filename, sample in zip(filenames, samples):
    cmd += " --sample %s %s " %(filename, sample)

#print cmd


cmd = [cmd]


sub.job(command_list=cmd, array=False, sh_file="parse.sh", job_name="parse", submit=True, queue="home", ppn=1)
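With hypothetical inputs (species `hg19`, two `.splices` files), the single command assembled above would look like this; all names are made up for illustration:

# Illustrative only: hypothetical species and input files.
species = 'hg19'
filenames = ['sampleA.splices', 'sampleB.flip.splices']
samples = [f.replace(".splices", "").replace(".flip", "_flip")
           for f in filenames]
cmd = "parse_oldsplice.py --species %s" % species
for filename, sample in zip(filenames, samples):
    cmd += " --sample %s %s " % (filename, sample)
# cmd == 'parse_oldsplice.py --species hg19 '
#        '--sample sampleA.splices sampleA  --sample sampleB.flip.splices sampleB_flip '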
    def insert_len(self):
        """
        For the provided .bam files, checks if there is an insert length file
        associated with it (....bam.insert_len), and if not, adds the command to
        compute its insert length to a list.

        Outputs the job ID of the insert_len script
        """
        # If we are treating these as single-ended reads, don't do anything
        if self.read_type == 'single_end':
            return

        constitutive_exons_dir = '%s/%s_constitutive' % (
            self.base_annotation_dir, self.event_type)

        # Bug: there may be more than one constitutive exons GFF in this
        # folder, and we only grab the first one
        constitutive_exons_gff = glob('%s/*.gff' % constitutive_exons_dir)[0]

        insert_len_name = '%s_insert_len%s' % (self.job_name_prefix,
                                               self.submit_sh_suffix)
        insert_len_sh_base = '%s/%s' % (self.sh_scripts_dir, insert_len_name)
        all_insert_len_sh = ['#!/bin/bash\n\n']

        for bam, sample_id in zip(self.bams, self.sample_ids):
            # Command-line commands to submit to the cluster
            insert_len_commands = []
            bam_dir = os.path.dirname(bam)
            insert_len_file = bam + '.insert_len'
            try:
                open(insert_len_file)
            except IOError:
                # There is no insert length file, so create it
                insert_len_command = 'python %s/pe_utils.py ' \
                                      '--compute-insert-len %s %s ' \
                                      ' --output-dir %s ' \
                                      '>%s.out 2>%s'\
                                      % (self.miso_scripts_dir, bam,
                                         constitutive_exons_gff, bam_dir,
                                         insert_len_file, insert_len_file)
                insert_len_commands.append('date')
                insert_len_commands.append("echo Starting ... '%s'" %
                                           insert_len_command)
                insert_len_commands.append(insert_len_command)

    #        if self.submit_sh_suffix:

    #        else:
    #            insert_len_name = 'miso_insert_len'

            insert_len_sh = '%s_%s.sh' % (insert_len_sh_base, sample_id)
            all_insert_len_sh.append('\n# --- %s --- #\nqsub %s\n' %
                                     (sample_id, insert_len_sh))

            sub = Submitter(queue_type='PBS',
                            sh_file=insert_len_sh,
                            command_list=insert_len_commands,
                            job_name=insert_len_name)
            self.insert_len_job_id[sample_id] = sub.write_sh(
                submit=True,
                nodes=self.num_cores,
                ppn=self.num_processes,
                queue=self.queue,
                walltime='0:30:00')
    def psi(self):
        """
        Submit a job to the cluster to compute 'psi' (percent spliced-in)
        scores of the splicing events and bam files provided.
        """

        psi_name = '%s_psi' % self.job_name_prefix
        job_name = psi_name
        submit_sh_base = '%s/%s' % (self.sh_scripts_dir, psi_name)
        submit_sh = '%s.sh' % submit_sh_base

        # all_submit_sh = ['#!/bin/bash\n\n']

        psi_commands = []

        # Make a different submit file for each sample, because MISO doesn't
        # take THAT long on its own for one sample, and that way we won't get
        #  charged. Plus then we can track failures of individual samples
        for bam, sample_id, output_dir in zip(self.bams, self.sample_ids,
                                              self.psi_output_dirs):

            # Establish which files we're working with
            insert_len_file = bam + '.insert_len'
            # bam_dir = os.path.dirname(bam)
            # output_dir = '%s/miso/%s/%s' % (bam_dir, self.event_type,
            #                                 sample_id)

            insert_len_commands, insert_len_arguments = self\
                ._get_psi_insert_len_argument(sample_id, insert_len_file)

            # Okay, now we are ready to write to the submitter script
            psi_commands.append('\n\n# --- %s --- #' % sample_id)

            # Need to **extend** with a list, not append.
            psi_commands.extend(insert_len_commands)

            # add a line of padding and the sample id to the output file
            psi_commands.append('\necho\necho "--- %s ----"' % sample_id)
            psi_commands.append('date')

            # Get the read length. Gonna keep this as bash because samtools
            # and less are very fast
            read_len = '%s_READ_LEN' % sample_id
            psi_commands.append(
                '\n# Assuming that the first read of the bam file is '
                'representative, such that all the reads in the '
                '\n# file are exactly the same length, we can take the first '
                'read from the bam file and measure its length, '
                '\n# and use that for our algorithm')
            psi_commands.append(
                "%s=$(samtools view %s | head -n 1 | cut -f 10 | awk '{ print"
                " length }')" % (read_len, bam))

            # Finally we are ready to write the actual miso command!
            log_filename = 'psi'
            stderr = '%s/%s.err' % (output_dir, log_filename)
            stdout = '%s/%s.out' % (output_dir, log_filename)


            psi_command = 'python %s --run %s %s --output-dir %s ' \
                                  '--read-len $%s %s -p %d %s >' \
                                  ' %s 2> %s' \
                                  % (self.miso, self.event_type_index, bam,
                                     output_dir, read_len,
                                     insert_len_arguments, self.num_processes,
                                     self.extra_miso_arguments, stdout,
                                     stderr)
            psi_commands.append('date')
            psi_commands.append("echo Starting ...... '%s'" % psi_command)
            psi_commands.append(psi_command)

        # Put the submitter script wherever the command was run from
#        if self.submit_sh_suffix:

#        else:
#            psi_name = 'miso_%s_psi' % (self.event_type)

# job_name = '%s_%s' % (sample_id, psi_name)

# submit_sh = '%s_%s.sh' % (submit_sh_base, sample_id)
# all_submit_sh.append('\n# --- %s --- #\nqsub %s\n' %
#                          (sample_id, submit_sh))

# if self.insert_len_job_id is not None:
#     sub = Submitter(queue_type='PBS', sh_file=submit_sh,
#                 command_list=psi_commands, job_name=job_name,
#                 wait_for=[self.insert_len_job_id[sample_id]])
# else:
        sub = Submitter(queue_type='PBS',
                        sh_file=submit_sh,
                        command_list=psi_commands,
                        job_name=job_name)
        # if self.num_cores == 1:
        #     self.psi_job_is_array = False
        #     self.psi_job_id[sample_id] = sub.write_sh(submit=True,
        #                                    nodes=self.num_cores,
        #                                    ppn=self.num_processes,
        #                                    queue=self.queue,
        #                                    walltime=self.psi_walltime)
        # else:
        self.psi_job_is_array = True
        self.psi_job_id = sub.write_sh(submit=True,
                                       nodes=self.num_cores,
                                       ppn=self.num_processes,
                                       queue=self.queue,
                                       walltime=self.psi_walltime,
                                       additional_resources={
                                           '-t':
                                           '1-%d%%%d' %
                                           (len(self.sample_ids), 8)
                                       })
        print self.psi_job_id
    def summary(self):
        summary_commands = []

        job_name_base = '%s_summary' % (self.job_name_prefix)
        job_name = job_name_base
        submit_sh = '%s/%s.sh' \
            % (self.sh_scripts_dir, job_name_base)

        # all_submit_sh = []
        summary_commands = []

        for bam, sample_id, psi_output_dir, summary_output_dir in \
                zip(self.bams, self.sample_ids, self.psi_output_dirs,
                    self.summary_output_dirs):
            # Okay, now we are ready to write to the submitter script
            summary_commands.append('\n\n# --- %s --- #' % sample_id)

            # add a line of padding and the sample id to the output file
            summary_commands.append('\necho\necho "--- %s ----"' % sample_id)
            summary_commands.append('date')
            summary_command = 'python %s/run_miso.py --summarize-samples %s ' \
                              '%s >%s/summary.out 2>%s/summary.err' \
                              % (self.miso_scripts_dir, psi_output_dir,
                                 psi_output_dir, psi_output_dir,
                                 psi_output_dir)
            summary_commands.append(summary_command)

            summary_commands.append('# Copy over the summary files AFTERWARD '
                                    'to prevent'
                                    ' overloading the home directory')
            temp_summary_file = '%s/summary/%s.miso_summary' % (psi_output_dir,
                                                                sample_id)
            final_summary_file = '%s/summary/%s.miso_summary' % (
                summary_output_dir, sample_id)
            summary_commands.append('mkdir -p %s/summary' %
                                    (summary_output_dir))
            summary_commands.append('cp %s %s' %
                                    (temp_summary_file, final_summary_file))

            # Put the submitter script wherever the command was run from

    #        if self.submit_sh_suffix:

    #        else:
    #            job_name = 'miso_%s_summary' % self.event_type
    #         job_name = '%s_%s' % (sample_id, job_name_base)
    # submit_sh = '%s_%s.sh' \
    #             % (submit_sh_base, sample_id)
    # all_submit_sh.append('\n# --- %s --- #\nqsub %s\n' %
    #                      (sample_id, submit_sh))

    # if self.num_cores > 1:
        additional_resources = {'-t': '1-%d%%%d' % (len(self.sample_ids), 8)}
        # else:
        #     additional_resources = None

        # if self.psi_job_id[sample_id] is not None:
        #     sub = Submitter(queue_type='PBS', sh_file=submit_sh,
        #                     command_list=summary_commands,
        #                     job_name=job_name,
        #                     wait_for=[self.psi_job_id[sample_id]],
        #                     # Tell the queue to parallelize this job
        #                          # into a job array
        #                     additional_resources=additional_resources)
        # else:
        sub = Submitter(
            queue_type='PBS',
            sh_file=submit_sh,
            command_list=summary_commands,
            job_name=job_name,
            # Tell the queue to parallelize this job
            # into a job array
            additional_resources=additional_resources,
            wait_for_array=self.psi_job_id)

        self.summary_job_id = sub.write_sh(submit=True,
                                           nodes=self.num_cores,
                                           ppn=2,
                                           queue=self.queue,
                                           walltime=self.summary_walltime)

        print self.summary_job_id