Ejemplo n.º 1
0
    def cluster_pcc(self):
        """
        Creates co-expression clusters by submitting one mcl job per genome.

        For every genome a job is submitted with the PCC table in mcl format
        (``pcc_mcl_output``) as ``in`` and the desired cluster file
        (``mcl_cluster_output``) as ``out``. After the jobs have completed the
        submission script is deleted and ``PipelineBase.clean_out_files`` is
        called for the job name.
        """
        script_path, job_id = self.write_submission_script(
            "cluster_pcc_%d", self.mcl_module, self.mcl_cmd, "cluster_pcc_%d.sh")

        for genome in self.genomes:
            pcc_table = self.dp[genome]['pcc_mcl_output']        # PCC table in mcl format
            cluster_file = self.dp[genome]['mcl_cluster_output']  # where the clusters are written

            job_vars = "in=%s,out=%s" % (pcc_table, cluster_file)
            qsub_call = ["qsub"] + self.qsub_mcl
            qsub_call.extend(["-v", job_vars, script_path])
            subprocess.call(qsub_call)

        # block until every submitted job has finished
        wait_for_job(job_id, sleep_time=1)

        # the submission script is no longer needed
        os.remove(script_path)

        # clean up the OUT_ files
        PipelineBase.clean_out_files(job_id)

        print("Done\n\n")
Ejemplo n.º 2
0
    def run_orthofinder(self):
        """
        Runs orthofinder for all genomes.

        The protein fasta file of every genome is copied into the global
        orthofinder output directory as ``<genome>.fasta``. A single job is then
        submitted with that directory passed as ``fasta_dir``. Once the job has
        completed, the submission script is removed and
        ``PipelineBase.clean_out_files`` is called for the job name.
        """
        orthofinder_dir = self.dp['GLOBAL']['orthofinder_output']

        os.makedirs(orthofinder_dir, exist_ok=True)

        filename, jobname = self.write_submission_script(
            "orthofinder_%d", self.python_module + ' ' + self.blast_module +
            ' ' + self.mcl_module, self.orthofinder_cmd, "orthofinder_%d.sh")

        for g in self.genomes:
            source = self.dp[g]['protein_fasta']
            # Compute the destination once so the message and the copy always agree
            destination = os.path.join(orthofinder_dir, g + '.fasta')
            print('copying', source, 'to', destination)
            copy(source, destination)

        command = ["qsub"] + self.qsub_orthofinder + [
            "-v", "fasta_dir=" + orthofinder_dir, filename
        ]
        subprocess.call(command)

        # wait for all jobs to complete
        wait_for_job(jobname)

        # remove the submission script
        os.remove(filename)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
Ejemplo n.º 3
0
    def run_interproscan(self):
        """
        Runs interproscan on the protein sequences of all genomes.

        For each genome the protein fasta file is split into 100 parts named
        ``interpro_in_<n>`` inside a ``tmp`` directory below the genome's
        ``interpro_output`` directory. One job per genome is then submitted with
        the input directory/prefix and output directory/prefix as variables.
        After all jobs have completed, the submission script is removed and
        ``PipelineBase.clean_out_files`` is called for the job name.
        """
        def split_fasta(fasta_path,
                        part_count,
                        target_dir,
                        filenames="proteins_%d.fasta"):
            """
            Splits a fasta file into a number of chunks

            :param fasta_path: input fasta file
            :param part_count: number of parts to split the file into
            :param target_dir: output directory (created when it does not exist)
            :param filenames: template for the filenames, should contain %d for the number
            """
            proteins = Fasta()
            proteins.readfile(fasta_path)

            # strip every '*' character from the sequences before writing them
            for name in proteins.sequences.keys():
                cleaned = proteins.sequences[name].replace('*', '')
                proteins.sequences[name] = cleaned

            per_part = ceil(len(proteins.sequences.keys()) / part_count)

            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            # parts are numbered starting at 1
            for index in range(part_count):
                # NOTE(review): remove_subset presumably takes up to per_part
                # sequences out of the collection - confirm against Fasta
                part = proteins.remove_subset(per_part)
                part.writefile(os.path.join(target_dir, filenames % (index + 1)))

        script_path, job_id = self.write_batch_submission_script(
            "interproscan_%d", self.interproscan_module, self.interproscan_cmd,
            "interproscan_%d.sh")

        for genome in self.genomes:
            scratch_dir = os.path.join(self.dp[genome]['interpro_output'], 'tmp')
            os.makedirs(self.dp[genome]['interpro_output'], exist_ok=True)
            os.makedirs(scratch_dir, exist_ok=True)

            split_fasta(self.dp[genome]['protein_fasta'],
                        100,
                        scratch_dir,
                        filenames="interpro_in_%d")

            job_vars = "in_dir=%s,in_prefix=%s,out_dir=%s,out_prefix=%s" % (
                scratch_dir, "interpro_in_", self.dp[genome]['interpro_output'], "output_")
            qsub_call = ["qsub"] + self.qsub_interproscan
            qsub_call.extend(["-v", job_vars, script_path])
            subprocess.call(qsub_call)

        # block until every submitted job has finished
        wait_for_job(job_id, sleep_time=1)

        os.remove(script_path)
        PipelineBase.clean_out_files(job_id)
Ejemplo n.º 4
0
    def __run_htseq_count_tophat(self, keep_previous=False):
        """
        Submits an htseq-count job for every ``accepted_hits.bam`` found in the alignment output

        Each sub-directory of a genome's ``alignment_output`` that contains an
        ``accepted_hits.bam`` yields one job; its counts are written to
        ``<sub-directory>.htseq`` inside the genome's ``htseq_output`` directory.

        :param keep_previous: when true the accepted_hits.bam files will not be removed after the jobs complete
        """
        script_path, job_id = self.write_submission_script(
            "htseq_count_%d",
            (self.samtools_module + '\t' + self.python_module),
            self.htseq_count_cmd,
            "htseq_count_%d.sh")

        for genome in self.genomes:
            tophat_dir = self.dp[genome]['alignment_output']
            counts_dir = self.dp[genome]['htseq_output']
            os.makedirs(counts_dir, exist_ok=True)

            annotation = self.dp[genome]['gff_file']
            feature_type = self.dp[genome]['gff_feature']
            id_field = self.dp[genome]['gff_id']

            # collect (sample directory name, bam path) for every sample that has a bam file
            samples = []
            for entry in os.listdir(tophat_dir):
                if not os.path.isdir(os.path.join(tophat_dir, entry)):
                    continue
                candidate = os.path.join(tophat_dir, entry, 'accepted_hits.bam')
                if os.path.exists(candidate):
                    samples.append((entry, candidate))

            for sample, bam_path in samples:
                counts_path = os.path.join(counts_dir, sample + '.htseq')
                print(sample, bam_path, counts_path)

                job_vars = "itype=bam,feature=%s,field=%s,bam=%s,gff=%s,out=%s" % (
                    feature_type, id_field, bam_path, annotation, counts_path)
                qsub_call = ["qsub"] + self.qsub_htseq_count
                qsub_call.extend(["-v", job_vars, script_path])
                subprocess.call(qsub_call)

        # block until every submitted job has finished
        wait_for_job(job_id, sleep_time=1)

        # unless keep_previous is set, delete the alignment files
        # NOTE: only the large bam file is removed (for now)
        if not keep_previous:
            for genome in self.genomes:
                tophat_dir = self.dp[genome]['alignment_output']
                sample_dirs = [entry for entry in os.listdir(tophat_dir)
                               if os.path.isdir(os.path.join(tophat_dir, entry))]
                for sample in sample_dirs:
                    bam_path = os.path.join(tophat_dir, sample, 'accepted_hits.bam')
                    if os.path.exists(bam_path):
                        os.remove(bam_path)

        # the submission script is no longer needed
        os.remove(script_path)

        # clean up the OUT_ files
        PipelineBase.clean_out_files(job_id)
Ejemplo n.º 5
0
    def __run_htseq_count_hisat2(self, keep_previous=False):
        """
        Submits an htseq-count job for every sam file found in the alignment output

        Each ``*.sam`` file in a genome's ``alignment_output`` directory yields one
        job; its counts are written to a file with the same base name and the
        ``.htseq`` extension inside the genome's ``htseq_output`` directory.

        :param keep_previous: when true the sam files will not be removed after the jobs complete
        """
        sam_ext = '.sam'

        filename, jobname = self.write_submission_script("htseq_count_%d",
                                                         (self.samtools_module + '\t' + self.python_module),
                                                         self.htseq_count_cmd,
                                                         "htseq_count_%d.sh")
        for g in self.genomes:
            alignment_output = self.dp[g]['alignment_output']
            htseq_output = self.dp[g]['htseq_output']
            os.makedirs(htseq_output, exist_ok=True)

            gff_file = self.dp[g]['gff_file']
            gff_feature = self.dp[g]['gff_feature']
            gff_id = self.dp[g]['gff_id']

            sam_files = [o for o in os.listdir(alignment_output)
                         if o.endswith(sam_ext) and os.path.isfile(os.path.join(alignment_output, o))]

            for sam_file in sam_files:
                # Swap only the trailing extension; str.replace would also rewrite
                # '.sam' inside the name (e.g. 'x.sample.sam')
                htseq_out = os.path.join(htseq_output, sam_file[:-len(sam_ext)] + '.htseq')
                print(sam_file, htseq_out)

                # the sam path is passed through the 'bam' variable, itype tells the job which format it is
                command = ["qsub"] + self.qsub_htseq_count + ["-v", "itype=sam,feature=%s,field=%s,bam=%s,gff=%s,out=%s"
                                                              % (gff_feature, gff_id,
                                                                 os.path.join(alignment_output, sam_file),
                                                                 gff_file, htseq_out),
                                                              filename]
                subprocess.call(command)

        # wait for all jobs to complete
        wait_for_job(jobname, sleep_time=1)

        # remove the sam files when keep_previous is disabled
        if not keep_previous:
            for g in self.genomes:
                alignment_output = self.dp[g]['alignment_output']
                for o in os.listdir(alignment_output):
                    # Build the full path: the listing only contains bare file names
                    sam_path = os.path.join(alignment_output, o)
                    if o.endswith(sam_ext) and os.path.isfile(sam_path):
                        os.remove(sam_path)

        # remove the submission script
        os.remove(filename)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)
Ejemplo n.º 6
0
    def run_pcc(self, matrix_type='tpm'):
        """
        Calculates pcc values on the cluster using the pcc.py script included in RSTrAP.

        For every genome a job is submitted with the selected expression matrix as
        ``in``, ``pcc_output`` as ``out`` and ``pcc_mcl_output`` as ``mcl_out``. The
        parent directories of both output files are created when missing.

        An unknown matrix type terminates the program (SystemExit, status 1)
        before anything is written or submitted.

        :param matrix_type: tpm or rpkm, select the desired matrix
        """
        import sys

        # Validate first so that no submission script is left behind on bad input
        if matrix_type == 'tpm':
            matrix_key = 'exp_matrix_tpm_output'
        elif matrix_type == 'rpkm':
            matrix_key = 'exp_matrix_rpkm_output'
        else:
            print('Matrix type %s unknown, quiting...' % matrix_type, file=sys.stderr)
            # sys.exit instead of the interactive-only quit(); non-zero status signals the failure
            sys.exit(1)

        filename, jobname = self.write_submission_script("pcc_wrapper_%d",
                                                         self.python3_module,
                                                         self.pcc_cmd,
                                                         "pcc_wrapper_%d.sh")

        for g in self.genomes:
            pcc_out = self.dp[g]['pcc_output']
            mcl_out = self.dp[g]['pcc_mcl_output']

            os.makedirs(os.path.dirname(pcc_out), exist_ok=True)
            os.makedirs(os.path.dirname(mcl_out), exist_ok=True)

            htseq_matrix = self.dp[g][matrix_key]

            command = ["qsub"] + self.qsub_pcc + ["-v", "in=%s,out=%s,mcl_out=%s" % (htseq_matrix, pcc_out, mcl_out), filename]
            subprocess.call(command)

        # wait for all jobs to complete
        wait_for_job(jobname, sleep_time=1)

        # remove the submission script
        os.remove(filename)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
Ejemplo n.º 7
0
    def prepare_genome(self):
        """
        Builds the genome index for each genome on the cluster.

        Depending on ``self.use_hisat2`` the HISAT2 or the bowtie module and
        build command are used. For every genome the genome fasta file is copied
        to ``<indexing_output>.fa`` and a job is submitted with the fasta file as
        ``in`` and ``indexing_output`` as ``out``.
        """
        if self.use_hisat2:
            index_module, index_cmd = self.hisat2_module, self.hisat2_build_cmd
        else:
            index_module, index_cmd = self.bowtie_module, self.bowtie_build_cmd

        script_path, job_id = self.write_submission_script(
            "build_index_%d", index_module, index_cmd, "build_index_%d.sh")

        for genome in self.genomes:
            genome_fasta = self.dp[genome]['genome_fasta']
            index_prefix = self.dp[genome]['indexing_output']

            # make sure the directory that will hold the index exists
            os.makedirs(os.path.dirname(index_prefix), exist_ok=True)
            shutil.copy(genome_fasta, index_prefix + '.fa')

            job_vars = "in=" + genome_fasta + ",out=" + index_prefix
            qsub_call = ["qsub"] + self.qsub_indexing
            qsub_call.extend(["-v", job_vars, script_path])

            subprocess.call(qsub_call)

        print("Preparing the genomic fasta file...")

        # block until every submitted job has finished
        wait_for_job(job_id)

        # the submission script is no longer needed
        os.remove(script_path)

        # clean up the OUT_ files
        PipelineBase.clean_out_files(job_id)

        print("Done\n\n")
Ejemplo n.º 8
0
    def run_mcl(self):
        """
        Runs MCL clustering on OrthoFinder output to obtain homologous families (without re-running blast)

        Steps:

        * the first ``Results_*`` entry in the orthofinder output directory is located
          (the program exits with status 1 when there is none)
        * all ``Blast*`` files in its ``WorkingDirectory`` are concatenated into ``full_blast.out``
        * a job running the mcxdeblast command followed by the mcl command is submitted
        * the ids in the mcl output are translated using ``SequenceIDs.txt`` and written to
          ``mcl_families.processed.txt``; ids without a translation become ``!error!``
        """
        orthofinder_dir = self.dp['GLOBAL']['orthofinder_output']

        results_dirs = [entry for entry in os.listdir(orthofinder_dir) if 'Results_' in entry]
        if not results_dirs:
            print('No results found in orthofinder directory!',
                  file=sys.stderr)
            # sys.exit instead of the interactive-only quit(); non-zero status signals the failure
            sys.exit(1)
        orthofinder_results_dir = results_dirs[0]

        # Concatenate OrthoFinder blast files
        working_dir = os.path.join(orthofinder_dir, orthofinder_results_dir,
                                   'WorkingDirectory')
        orthofinder_blast_files = [
            entry for entry in os.listdir(working_dir) if entry.startswith('Blast')
        ]
        full_blast = os.path.join(working_dir, 'full_blast.out')
        full_blast_abc = os.path.join(working_dir, 'full_blast.abc')
        mcl_families_out = os.path.join(orthofinder_dir,
                                        'mcl_families.unprocessed.txt')

        with open(full_blast, 'w') as outfile:
            for fname in orthofinder_blast_files:
                with open(os.path.join(working_dir, fname)) as infile:
                    for line in infile:
                        outfile.write(line)

        filename, jobname = self.write_submission_script(
            "mcl_%d", self.mcl_module,
            self.mcxdeblast_cmd + '\n' + self.mcl_cmd, "mcl_%d.sh")
        # submit job; abc_out of the first command is the input (in) of the second
        command = ["qsub"] + self.qsub_mcxdeblast + \
                  ["-v", "blast_in=" + full_blast +
                   ",abc_out=" + full_blast_abc +
                   ",in=" + full_blast_abc +
                   ",out=" + mcl_families_out, filename]
        subprocess.call(command)

        # wait for all jobs to complete
        wait_for_job(jobname)

        # Map the ids used in the mcl output (first column, trailing ':' stripped)
        # to the gene names (second column)
        id_conversion = {}
        with open(os.path.join(working_dir, 'SequenceIDs.txt')) as infile:
            for line in infile:
                parts = line.strip().split()
                if len(parts) < 2:
                    # blank or incomplete line, nothing to map
                    continue
                id_conversion[parts[0].strip(':')] = parts[1]

        with open(mcl_families_out, 'r') as infile, open(
                os.path.join(orthofinder_dir, 'mcl_families.processed.txt'),
                'w') as outfile:
            for line in infile:
                genes = [
                    id_conversion.get(seq_id, '!error!')
                    for seq_id in line.strip().split()
                ]
                print('\t'.join(genes), file=outfile)

        # remove the submission script
        os.remove(filename)

        # remove OUT_ files
        PipelineBase.clean_out_files(jobname)

        print("Done\n\n")
Ejemplo n.º 9
0
    def trim_fastq(self, overwrite=False):
        """
        Runs Trimmomatic on all fastq files

        Files ending in ``.fq.gz`` or ``.fastq.gz`` in a genome's ``fastq_dir`` are
        processed. A file containing ``_1.`` whose ``_2.`` counterpart is present is
        submitted together with it as a pair; every other file is submitted as a
        single-end job. Output goes to the genome's ``trimmomatic_output`` directory.

        :param overwrite: when true jobs are submitted even if the output file already exists
        """
        def tagged(name, tag):
            """Return *name* with ``.<tag>`` inserted in front of its fastq extension."""
            if name.endswith('.fq.gz'):
                return name.replace('.fq.gz', '.' + tag + '.fq.gz')
            return name.replace('.fastq.gz', '.' + tag + '.fastq.gz')

        script_se, job_id = self.write_submission_script("trimmomatic_%d",
                                                         None,
                                                         self.trimmomatic_se_cmd,
                                                         "trimmomatic_se_%d.sh")
        script_pe, job_id = self.write_submission_script("trimmomatic_%d",
                                                         None,
                                                         self.trimmomatic_pe_cmd,
                                                         "trimmomatic_pe_%d.sh")

        for genome in self.genomes:
            source_dir = self.dp[genome]['fastq_dir']
            target_dir = self.dp[genome]['trimmomatic_output']
            os.makedirs(target_dir, exist_ok=True)

            pending = [entry for entry in os.listdir(source_dir)
                       if entry.endswith('.fq.gz') or entry.endswith('.fastq.gz')]

            # sorted so that a _1 file is always seen before its _2 mate
            pending.sort()

            while pending:
                current = pending.pop(0)
                mate = current.replace('_1.', '_2.') if '_1.' in current else None

                if mate is not None and mate in pending:
                    # paired-end: the mate is handled here, so take it off the list
                    pending.remove(mate)

                    ina = os.path.join(source_dir, current)
                    inb = os.path.join(source_dir, mate)

                    outap = os.path.join(target_dir, tagged(current, 'trimmed.paired'))
                    outau = os.path.join(target_dir, tagged(current, 'trimmed.unpaired'))
                    outbp = os.path.join(target_dir, tagged(mate, 'trimmed.paired'))
                    outbu = os.path.join(target_dir, tagged(mate, 'trimmed.unpaired'))

                    if not overwrite and os.path.exists(outap):
                        print('Found', outap, 'skipping')
                        continue

                    print('Submitting pair %s, %s' % (current, mate))
                    job_vars = "ina=%s,inb=%s,outap=%s,outau=%s,outbp=%s,outbu=%s,jar=%s" % (
                        ina, inb, outap, outau, outbp, outbu, self.trimmomatic_path)
                    subprocess.call(["qsub"] + self.qsub_trimmomatic + ["-v", job_vars, script_pe])
                    continue

                # single-end: either no _1. marker or the mate is missing
                outfile = tagged(current, 'trimmed')
                if not overwrite and os.path.exists(os.path.join(target_dir, outfile)):
                    print('Found', outfile, 'skipping')
                    continue

                print('Submitting single %s' % current)
                job_vars = ("in=" + os.path.join(source_dir, current) +
                            ",out=" + os.path.join(target_dir, outfile) +
                            ",jar=" + self.trimmomatic_path)
                subprocess.call(["qsub"] + self.qsub_trimmomatic + ["-v", job_vars, script_se])

        print('Trimming fastq files...')

        # block until every submitted job has finished
        wait_for_job(job_id, sleep_time=1)

        # the submission scripts are no longer needed
        os.remove(script_se)
        os.remove(script_pe)

        # clean up the OUT_ files
        PipelineBase.clean_out_files(job_id)

        print("Done\n\n")
Ejemplo n.º 10
0
    def __run_hisat2(self, overwrite=False, keep_previous=False):
        """
        Maps the reads from the trimmed fastq files to the indexed genome using HISAT2

        Files in a genome's ``trimmomatic_output`` directory ending in
        ``.paired.fq.gz``/``.paired.fastq.gz`` are treated as paired-end, files
        ending in ``.unpaired.*`` are ignored and everything else is treated as
        single-end. Each job writes a ``.sam`` and a ``.stats`` file to the
        genome's ``alignment_output`` directory.

        :param overwrite: when true jobs are submitted even if the output sam file exists
        :param keep_previous: not used by this method
        """
        script_se, job_id = self.write_submission_script("hisat2_%d",
                                                         self.hisat2_module,
                                                         self.hisat2_se_cmd,
                                                         "hisat2_se_%d.sh")

        script_pe, job_id = self.write_submission_script("hisat2_%d",
                                                         self.hisat2_module,
                                                         self.hisat2_pe_cmd,
                                                         "hisat2_pe_%d.sh")

        print('Mapping reads with HISAT2...')

        for genome in self.genomes:
            sam_dir = self.dp[genome]['alignment_output']
            index_prefix = self.dp[genome]['indexing_output']
            reads_dir = self.dp[genome]['trimmomatic_output']
            os.makedirs(sam_dir, exist_ok=True)

            paired = []
            single = []

            for entry in os.listdir(reads_dir):
                if entry.endswith(('.paired.fq.gz', '.paired.fastq.gz')):
                    paired.append(entry)
                elif not entry.endswith(('.unpaired.fq.gz', '.unpaired.fastq.gz')):
                    single.append(entry)

            # sorted so that _1 files come before _2 files
            paired.sort()
            single.sort()

            for first in paired:
                # only the _1 file of a pair triggers a submission
                if '_1.trimmed.paired.' not in first:
                    continue

                second = first.replace('_1.trimmed.paired.', '_2.trimmed.paired.')
                sample = first.replace('_1.trimmed.paired.fq.gz', '').replace('_1.trimmed.paired.fastq.gz', '')

                output_sam = os.path.join(sam_dir, sample + '.sam')
                output_stats = os.path.join(sam_dir, sample + '.stats')
                forward = os.path.join(reads_dir, first)
                reverse = os.path.join(reads_dir, second)

                if not overwrite and os.path.exists(output_sam):
                    print('Output exists, skipping', first)
                    continue

                print('Submitting pair %s, %s' % (first, second))
                job_vars = "out=%s,genome=%s,forward=%s,reverse=%s,stats=%s" % (
                    output_sam, index_prefix, forward, reverse, output_stats)
                subprocess.call(["qsub"] + self.qsub_tophat + ["-v", job_vars, script_pe])

            for reads in single:
                sample = reads.replace('.trimmed.fq.gz', '').replace('.trimmed.fastq.gz', '')
                output_sam = os.path.join(sam_dir, sample + '.sam')
                output_stats = os.path.join(sam_dir, sample + '.stats')

                if not overwrite and os.path.exists(output_sam):
                    print('Output exists, skipping', reads)
                    continue

                print('Submitting single %s' % reads)
                job_vars = "out=%s,genome=%s,fq=%s,stats=%s" % (
                    output_sam, index_prefix, os.path.join(reads_dir, reads), output_stats)
                subprocess.call(["qsub"] + self.qsub_tophat + ["-v", job_vars, script_se])

        # block until every submitted job has finished
        wait_for_job(job_id, sleep_time=1)

        # the submission scripts are no longer needed
        os.remove(script_se)
        os.remove(script_pe)

        # clean up the OUT_ files
        PipelineBase.clean_out_files(job_id)