def cluster_pcc(self):
    """
    Creates co-expression clusters using mcl.

    A single submission script is written and then submitted with qsub once per genome. For each
    genome the job receives the PCC table in mcl format as ``in`` and the desired cluster path as
    ``out``. Afterwards the submission script and the OUT_ files are cleaned up.
    """
    script_path, job_name = self.write_submission_script("cluster_pcc_%d",
                                                         self.mcl_module,
                                                         self.mcl_cmd,
                                                         "cluster_pcc_%d.sh")

    for genome in self.genomes:
        paths = self.dp[genome]

        # 'pcc_mcl_output' is the PCC table in mcl format, 'mcl_cluster_output' the desired path for the clusters
        job_vars = "in=%s,out=%s" % (paths['pcc_mcl_output'], paths['mcl_cluster_output'])

        subprocess.call(["qsub"] + self.qsub_mcl + ["-v", job_vars, script_path])

    # wait for all jobs to complete
    wait_for_job(job_name, sleep_time=1)

    # remove the submission script
    os.remove(script_path)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)

    print("Done\n\n")
def run_orthofinder(self):
    """
    Runs orthofinder for all genomes.

    The protein fasta file of every genome is copied into the global orthofinder output directory
    as ``<genome>.fasta``; a single job is then submitted with that directory as ``fasta_dir``.
    """
    fasta_dir = self.dp['GLOBAL']['orthofinder_output']
    os.makedirs(fasta_dir, exist_ok=True)

    required_modules = self.python_module + ' ' + self.blast_module + ' ' + self.mcl_module
    script_path, job_name = self.write_submission_script("orthofinder_%d",
                                                         required_modules,
                                                         self.orthofinder_cmd,
                                                         "orthofinder_%d.sh")

    for genome in self.genomes:
        source = self.dp[genome]['protein_fasta']
        # NOTE(review): 'çopying' looks like a typo for 'copying'; kept as-is because it is runtime output
        print('çopying', source, 'to', os.path.join(fasta_dir, genome + '.fasta'))
        copy(source, os.path.join(fasta_dir, genome + '.fasta'))

    # one job handles the whole directory
    job_vars = "fasta_dir=" + fasta_dir
    subprocess.call(["qsub"] + self.qsub_orthofinder + ["-v", job_vars, script_path])

    # wait for all jobs to complete
    wait_for_job(job_name)

    # remove the submission script
    os.remove(script_path)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)

    print("Done\n\n")
def run_interproscan(self):
    """
    Runs interproscan on the protein fasta file of every genome.

    Each protein fasta file is split into 100 parts (written to a ``tmp`` directory inside the
    genome's interpro output directory) and one batch submission is made per genome.
    """

    def split_fasta(file, chunks, output_directory, filenames="proteins_%d.fasta"):
        """
        Splits a fasta file into a number of chunks

        :param file: input fasta file
        :param chunks: number of parts to split the file into
        :param output_directory: output directory
        :param filenames: template for the filenames, should contain %d for the number
        """
        source = Fasta()
        source.readfile(file)

        # strip '*' characters from every sequence before writing
        for seq_id in source.sequences.keys():
            cleaned = source.sequences[seq_id].replace('*', '')
            source.sequences[seq_id] = cleaned

        per_part = ceil(len(source.sequences.keys()) / chunks)

        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        # parts are numbered from 1 up to and including chunks
        for part_number in range(1, chunks + 1):
            part = source.remove_subset(per_part)
            part.writefile(os.path.join(output_directory, filenames % part_number))

    script_path, job_name = self.write_batch_submission_script("interproscan_%d",
                                                               self.interproscan_module,
                                                               self.interproscan_cmd,
                                                               "interproscan_%d.sh")

    for genome in self.genomes:
        split_dir = os.path.join(self.dp[genome]['interpro_output'], 'tmp')

        os.makedirs(self.dp[genome]['interpro_output'], exist_ok=True)
        os.makedirs(split_dir, exist_ok=True)

        split_fasta(self.dp[genome]['protein_fasta'], 100, split_dir, filenames="interpro_in_%d")

        job_vars = "in_dir=%s,in_prefix=%s,out_dir=%s,out_prefix=%s" % (split_dir,
                                                                        "interpro_in_",
                                                                        self.dp[genome]['interpro_output'],
                                                                        "output_")
        subprocess.call(["qsub"] + self.qsub_interproscan + ["-v", job_vars, script_path])

    # wait for all jobs to complete
    wait_for_job(job_name, sleep_time=1)

    # remove the submission script
    os.remove(script_path)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)
def __run_htseq_count_tophat(self, keep_previous=False):
    """
    Based on the gff file and the tophat alignments, counts the number of reads that map to a given gene

    Every sub-directory of the alignment output that contains an ``accepted_hits.bam`` file results
    in one submitted htseq-count job.

    :param keep_previous: when true the accepted_hits.bam files will not be removed after htseq-count completes
    """

    def find_accepted_hits(root):
        """ Returns (directory name, bam path) for each sub-directory of root holding accepted_hits.bam """
        hits = []
        for entry in os.listdir(root):
            if not os.path.isdir(os.path.join(root, entry)):
                continue
            candidate = os.path.join(root, entry, 'accepted_hits.bam')
            if os.path.exists(candidate):
                hits.append((entry, candidate))
        return hits

    script_path, job_name = self.write_submission_script("htseq_count_%d",
                                                         (self.samtools_module + '\t' + self.python_module),
                                                         self.htseq_count_cmd,
                                                         "htseq_count_%d.sh")

    for genome in self.genomes:
        settings = self.dp[genome]
        alignment_dir = settings['alignment_output']
        counts_dir = settings['htseq_output']
        os.makedirs(counts_dir, exist_ok=True)

        annotation = settings['gff_file']
        feature = settings['gff_feature']
        attribute = settings['gff_id']

        for sample, bam_path in find_accepted_hits(alignment_dir):
            counts_path = os.path.join(counts_dir, sample + '.htseq')
            print(sample, bam_path, counts_path)

            job_vars = "itype=bam,feature=%s,field=%s,bam=%s,gff=%s,out=%s" % (feature, attribute, bam_path,
                                                                              annotation, counts_path)
            subprocess.call(["qsub"] + self.qsub_htseq_count + ["-v", job_vars, script_path])

    # wait for all jobs to complete
    wait_for_job(job_name, sleep_time=1)

    # remove all tophat files when keep_previous is disabled
    # NOTE: only the large bam file is removed (for now)
    if not keep_previous:
        for genome in self.genomes:
            for _, bam_path in find_accepted_hits(self.dp[genome]['alignment_output']):
                os.remove(bam_path)

    # remove the submission script
    os.remove(script_path)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)
def __run_htseq_count_hisat2(self, keep_previous=False):
    """
    Based on the gff file and the sam files in the alignment output, counts the number of reads that map
    to a given gene. One htseq-count job is submitted per ``.sam`` file.

    :param keep_previous: when true the sam files will not be removed after htseq-count completes
    """

    def list_sam_files(directory):
        """ Returns the names of all regular files in directory that end with .sam """
        return [entry for entry in os.listdir(directory)
                if entry.endswith('.sam') and os.path.isfile(os.path.join(directory, entry))]

    filename, jobname = self.write_submission_script("htseq_count_%d",
                                                     (self.samtools_module + '\t' + self.python_module),
                                                     self.htseq_count_cmd,
                                                     "htseq_count_%d.sh")

    for g in self.genomes:
        alignment_output = self.dp[g]['alignment_output']
        htseq_output = self.dp[g]['htseq_output']
        os.makedirs(htseq_output, exist_ok=True)

        gff_file = self.dp[g]['gff_file']
        gff_feature = self.dp[g]['gff_feature']
        gff_id = self.dp[g]['gff_id']

        for sam_file in list_sam_files(alignment_output):
            # Only swap the trailing extension; str.replace would also rewrite a '.sam' that occurs
            # earlier in the name (e.g. 'leaf.sample1.sam').
            htseq_out = os.path.join(htseq_output, os.path.splitext(sam_file)[0] + '.htseq')
            print(sam_file, htseq_out)

            command = ["qsub"] + self.qsub_htseq_count + \
                      ["-v", "itype=sam,feature=%s,field=%s,bam=%s,gff=%s,out=%s"
                       % (gff_feature, gff_id, os.path.join(alignment_output, sam_file), gff_file, htseq_out),
                       filename]
            subprocess.call(command)

    # wait for all jobs to complete
    wait_for_job(jobname, sleep_time=1)

    # remove the sam files when keep_previous is disabled
    if not keep_previous:
        for g in self.genomes:
            alignment_output = self.dp[g]['alignment_output']
            # Build full paths here: the previous version collected the boolean results of
            # os.path.isfile() instead of the paths, so no sam file was ever deleted.
            for sam_file in list_sam_files(alignment_output):
                os.remove(os.path.join(alignment_output, sam_file))

    # remove the submission script
    os.remove(filename)

    # remove OUT_ files
    PipelineBase.clean_out_files(jobname)
def run_pcc(self, matrix_type='tpm'):
    """
    Calculates pcc values on the cluster using the pcc.py script included in RSTrAP.

    The matrix type is validated before anything is written, so an unknown type no longer leaves a
    submission script or freshly created directories behind.

    :param matrix_type: tpm or rpkm, select the desired matrix
    :raises SystemExit: with exit status 1 when matrix_type is neither 'tpm' nor 'rpkm'
    """
    # maps the supported matrix types to the key in self.dp[genome] holding the expression matrix
    matrix_keys = {
        'tpm': 'exp_matrix_tpm_output',
        'rpkm': 'exp_matrix_rpkm_output',
    }

    if matrix_type not in matrix_keys:
        print('Matrix type %s unknown, quiting...' % matrix_type)
        # quit() is a helper of the site module meant for interactive use and exits with status 0;
        # raise SystemExit directly and signal failure to the caller.
        raise SystemExit(1)

    matrix_key = matrix_keys[matrix_type]

    filename, jobname = self.write_submission_script("pcc_wrapper_%d",
                                                     self.python3_module,
                                                     self.pcc_cmd,
                                                     "pcc_wrapper_%d.sh")

    for g in self.genomes:
        pcc_out = self.dp[g]['pcc_output']
        mcl_out = self.dp[g]['pcc_mcl_output']

        os.makedirs(os.path.dirname(pcc_out), exist_ok=True)
        os.makedirs(os.path.dirname(mcl_out), exist_ok=True)

        htseq_matrix = self.dp[g][matrix_key]

        command = ["qsub"] + self.qsub_pcc + \
                  ["-v", "in=%s,out=%s,mcl_out=%s" % (htseq_matrix, pcc_out, mcl_out), filename]
        subprocess.call(command)

    # wait for all jobs to complete
    wait_for_job(jobname, sleep_time=1)

    # remove the submission script
    os.remove(filename)

    # remove OUT_ files
    PipelineBase.clean_out_files(jobname)

    print("Done\n\n")
def prepare_genome(self):
    """
    Builds the genome index for each genome on the cluster.

    The hisat2 build command is used when ``self.use_hisat2`` is set, the bowtie build command
    otherwise. The genome fasta file is also copied next to the index as ``<indexing_output>.fa``.
    """
    if self.use_hisat2:
        module, build_cmd = self.hisat2_module, self.hisat2_build_cmd
    else:
        module, build_cmd = self.bowtie_module, self.bowtie_build_cmd

    script_path, job_name = self.write_submission_script("build_index_%d", module, build_cmd, "build_index_%d.sh")

    for genome in self.genomes:
        genome_fasta = self.dp[genome]['genome_fasta']
        index_prefix = self.dp[genome]['indexing_output']

        os.makedirs(os.path.dirname(index_prefix), exist_ok=True)
        shutil.copy(genome_fasta, index_prefix + '.fa')

        job_vars = "in=" + genome_fasta + ",out=" + index_prefix
        subprocess.call(["qsub"] + self.qsub_indexing + ["-v", job_vars, script_path])

    print("Preparing the genomic fasta file...")

    # wait for all jobs to complete
    wait_for_job(job_name)

    # remove the submission script
    os.remove(script_path)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)

    print("Done\n\n")
def run_mcl(self):
    """
    Runs MCL clustering on OrthoFinder output to obtain homologous families (without re-running blast)

    The Blast* files in OrthoFinder's WorkingDirectory are concatenated into ``full_blast.out``, one job
    running the mcxdeblast command followed by the mcl command is submitted, and the resulting families
    are rewritten with the gene names listed in ``SequenceIDs.txt``. Identifiers without an entry in
    that file are written as ``!error!``.
    """
    orthofinder_dir = self.dp['GLOBAL']['orthofinder_output']

    result_dirs = [entry for entry in os.listdir(orthofinder_dir) if 'Results_' in entry]
    if not result_dirs:
        print('No results found in orthofinder directory!', file=sys.stderr)
        quit()

    # only the first matching results directory is used
    working_dir = os.path.join(orthofinder_dir, result_dirs[0], 'WorkingDirectory')

    # Concatenate OrthoFinder blast files
    blast_parts = [entry for entry in os.listdir(working_dir) if entry.startswith('Blast')]

    full_blast = os.path.join(working_dir, 'full_blast.out')
    full_blast_abc = os.path.join(working_dir, 'full_blast.abc')
    mcl_families_out = os.path.join(orthofinder_dir, 'mcl_families.unprocessed.txt')

    with open(full_blast, 'w') as merged:
        for part in blast_parts:
            with open(os.path.join(working_dir, part)) as chunk:
                merged.writelines(chunk)

    script_path, job_name = self.write_submission_script("mcl_%d",
                                                         self.mcl_module,
                                                         self.mcxdeblast_cmd + '\n' + self.mcl_cmd,
                                                         "mcl_%d.sh")

    # submit job; the abc file written by the first command is the input of the second
    job_vars = "blast_in=%s,abc_out=%s,in=%s,out=%s" % (full_blast, full_blast_abc, full_blast_abc, mcl_families_out)
    subprocess.call(["qsub"] + self.qsub_mcxdeblast + ["-v", job_vars, script_path])

    # wait for all jobs to complete
    wait_for_job(job_name)

    # first column of SequenceIDs.txt (without ':') is the internal id, second column the gene name
    gene_by_id = {}
    with open(os.path.join(working_dir, 'SequenceIDs.txt')) as id_table:
        for row in id_table:
            fields = row.strip().split()
            internal_id = fields[0].strip(':')
            gene_by_id[internal_id] = fields[1]

    processed_path = os.path.join(orthofinder_dir, 'mcl_families.processed.txt')
    with open(mcl_families_out, 'r') as raw, open(processed_path, 'w') as processed:
        for family in raw:
            genes = [gene_by_id.get(member, '!error!') for member in family.strip().split()]
            print('\t'.join(genes), file=processed)

    # remove the submission script
    os.remove(script_path)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)

    print("Done\n\n")
def trim_fastq(self, overwrite=False):
    """
    Runs Trimmomatic on all fastq files

    Files ending in .fq.gz or .fastq.gz are considered. A file containing ``_1.`` whose ``_2.``
    counterpart is present is submitted as a pair, every other file is submitted as single-end.

    :param overwrite: when true jobs are submitted even if the trimmed output already exists
    """

    def add_tag(name, tag):
        """ Inserts tag in front of the .fq.gz / .fastq.gz extension of name """
        if name.endswith('.fq.gz'):
            return name.replace('.fq.gz', tag + '.fq.gz')
        return name.replace('.fastq.gz', tag + '.fastq.gz')

    def submit_single(name, source_dir, target_dir):
        """ Submits a single-end job for name unless its output exists and overwrite is off """
        trimmed = add_tag(name, '.trimmed')
        if not overwrite and os.path.exists(os.path.join(target_dir, trimmed)):
            print('Found', trimmed, 'skipping')
            return

        print('Submitting single %s' % name)
        job_vars = "in=" + os.path.join(source_dir, name) \
                   + ",out=" + os.path.join(target_dir, trimmed) \
                   + ",jar=" + self.trimmomatic_path
        subprocess.call(["qsub"] + self.qsub_trimmomatic + ["-v", job_vars, script_se])

    # NOTE(review): both scripts use the job name template "trimmomatic_%d" and only the name returned by
    # the second call is waited on; presumably both calls yield the same name - confirm.
    script_se, job_name = self.write_submission_script("trimmomatic_%d",
                                                       None,
                                                       self.trimmomatic_se_cmd,
                                                       "trimmomatic_se_%d.sh")
    script_pe, job_name = self.write_submission_script("trimmomatic_%d",
                                                       None,
                                                       self.trimmomatic_pe_cmd,
                                                       "trimmomatic_pe_%d.sh")

    for genome in self.genomes:
        source_dir = self.dp[genome]['fastq_dir']
        target_dir = self.dp[genome]['trimmomatic_output']
        os.makedirs(target_dir, exist_ok=True)

        # sort required to make sure _1 files are before _2
        pending = sorted(entry for entry in os.listdir(source_dir)
                         if entry.endswith('.fq.gz') or entry.endswith('.fastq.gz'))

        while pending:
            current = pending.pop(0)

            if '_1.' not in current:
                submit_single(current, source_dir, target_dir)
                continue

            mate = current.replace('_1.', '_2.')
            if mate not in pending:
                # no matching _2 file, treat as single-end
                submit_single(current, source_dir, target_dir)
                continue

            # the mate is handled together with the current file
            pending.remove(mate)

            ina = os.path.join(source_dir, current)
            inb = os.path.join(source_dir, mate)
            outap = os.path.join(target_dir, add_tag(current, '.trimmed.paired'))
            outau = os.path.join(target_dir, add_tag(current, '.trimmed.unpaired'))
            outbp = os.path.join(target_dir, add_tag(mate, '.trimmed.paired'))
            outbu = os.path.join(target_dir, add_tag(mate, '.trimmed.unpaired'))

            if not overwrite and os.path.exists(outap):
                print('Found', outap, 'skipping')
                continue

            print('Submitting pair %s, %s' % (current, mate))
            job_vars = "ina=%s,inb=%s,outap=%s,outau=%s,outbp=%s,outbu=%s,jar=%s" % (ina, inb, outap, outau,
                                                                                   outbp, outbu,
                                                                                   self.trimmomatic_path)
            subprocess.call(["qsub"] + self.qsub_trimmomatic + ["-v", job_vars, script_pe])

    print('Trimming fastq files...')

    # wait for all jobs to complete
    wait_for_job(job_name, sleep_time=1)

    # remove the submission scripts
    os.remove(script_se)
    os.remove(script_pe)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)

    print("Done\n\n")
def __run_hisat2(self, overwrite=False, keep_previous=False):
    """
    Maps the reads from the trimmed fastq files to the indexed genome using HISAT2

    Files ending in .paired.fq.gz / .paired.fastq.gz are mapped as pairs (starting from the
    ``_1.trimmed.paired.`` file), files ending in .unpaired.fq.gz / .unpaired.fastq.gz are ignored and
    every other file in the trimmed directory is mapped as single-end.

    :param overwrite: when true jobs are submitted even if the output sam file exists
    :param keep_previous: not used in the body of this method
    """
    # NOTE(review): both scripts use the job name template "hisat2_%d" and only the name returned by the
    # second call is waited on; presumably both calls yield the same name - confirm.
    script_se, job_name = self.write_submission_script("hisat2_%d",
                                                       self.hisat2_module,
                                                       self.hisat2_se_cmd,
                                                       "hisat2_se_%d.sh")
    script_pe, job_name = self.write_submission_script("hisat2_%d",
                                                       self.hisat2_module,
                                                       self.hisat2_pe_cmd,
                                                       "hisat2_pe_%d.sh")

    print('Mapping reads with HISAT2...')

    for genome in self.genomes:
        sam_dir = self.dp[genome]['alignment_output']
        index_prefix = self.dp[genome]['indexing_output']
        reads_dir = self.dp[genome]['trimmomatic_output']
        os.makedirs(sam_dir, exist_ok=True)

        paired = []
        single = []
        for name in os.listdir(reads_dir):
            if name.endswith('.paired.fq.gz') or name.endswith('.paired.fastq.gz'):
                paired.append(name)
            elif not (name.endswith('.unpaired.fq.gz') or name.endswith('.unpaired.fastq.gz')):
                single.append(name)

        # sort required to make sure _1 files are before _2
        paired.sort()
        single.sort()

        for forward_name in paired:
            # _2 files are picked up through their _1 counterpart
            if '_1.trimmed.paired.' not in forward_name:
                continue

            reverse_name = forward_name.replace('_1.trimmed.paired.', '_2.trimmed.paired.')
            sample = forward_name.replace('_1.trimmed.paired.fq.gz', '').replace('_1.trimmed.paired.fastq.gz', '')

            output_sam = os.path.join(sam_dir, sample + '.sam')
            output_stats = os.path.join(sam_dir, sample + '.stats')
            forward = os.path.join(reads_dir, forward_name)
            reverse = os.path.join(reads_dir, reverse_name)

            if not overwrite and os.path.exists(output_sam):
                print('Output exists, skipping', forward_name)
                continue

            print('Submitting pair %s, %s' % (forward_name, reverse_name))
            job_vars = "out=%s,genome=%s,forward=%s,reverse=%s,stats=%s" % (output_sam, index_prefix,
                                                                           forward, reverse, output_stats)
            # the qsub settings of tophat are used for hisat2 jobs as well
            subprocess.call(["qsub"] + self.qsub_tophat + ["-v", job_vars, script_pe])

        for single_name in single:
            sample = single_name.replace('.trimmed.fq.gz', '').replace('.trimmed.fastq.gz', '')
            output_sam = os.path.join(sam_dir, sample + '.sam')
            output_stats = os.path.join(sam_dir, sample + '.stats')

            if not overwrite and os.path.exists(output_sam):
                print('Output exists, skipping', single_name)
                continue

            print('Submitting single %s' % single_name)
            job_vars = "out=%s,genome=%s,fq=%s,stats=%s" % (output_sam, index_prefix,
                                                           os.path.join(reads_dir, single_name), output_stats)
            subprocess.call(["qsub"] + self.qsub_tophat + ["-v", job_vars, script_se])

    # wait for all jobs to complete
    wait_for_job(job_name, sleep_time=1)

    # remove the submission scripts
    os.remove(script_se)
    os.remove(script_pe)

    # remove OUT_ files
    PipelineBase.clean_out_files(job_name)