def work(self):
    """
    Worker task that aligns a pair of FASTQ files against a genome with
    BWA MEM and then sorts the resulting BAM file.

    Reads the following attributes from self:

    genome_fa : str
        Location of the FASTA file of the genome to align the reads to
    genome_idx : str
        Location of the index files in .tar.gz file prepared by the
        BWA indexer
    fastq_file_1 : str
        Location of the first FASTQ file
    fastq_file_2 : str
        Location of the second (paired end) FASTQ file
    output_bam : str
        Location of the aligned reads in bam format
    """
    # "no-untar" is set as the index archive is assumed to be unpacked
    aligner = bwaAlignerMEMTool({"no-untar": True})
    aligner.bwa_aligner_paired(
        self.genome_fa,
        self.fastq_file_1,
        self.fastq_file_2,
        self.output_bam,
        self.genome_idx,
        {})
    # Sort the freshly written BAM in place
    bamUtilsTask().bam_sort(self.output_bam)
def work(self):
    """
    Worker task that aligns paired end FASTQ reads against a genome with
    Bowtie2 and then sorts the resulting BAM file.

    Reads the following attributes from self:

    genome_fa : str
        Location of the FASTA file of the genome to align the reads to
    genome_idx : str
        Location of the index files in .tar.gz file prepared by the
        Bowtie2 indexer
    fastq_file_1 : str
        Location of the first FASTQ file
    fastq_file_2 : str
        Location of the second (paired end) FASTQ file
    output_bam : str
        Location of the aligned reads in bam format
    """
    # "no-untar" is set as the index archive is assumed to be unpacked
    aligner = bowtie2AlignerTool({"no-untar": True})
    aligner.bowtie2_aligner_paired(
        self.genome_fa,
        self.fastq_file_1,
        self.fastq_file_2,
        self.output_bam,
        self.genome_idx,
        {})
    # Sort the freshly written BAM in place
    bamUtilsTask().bam_sort(self.output_bam)
def run(self, input_files, input_metadata, output_files):  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    """
    Tool for aligning FASTQ reads against a genome using BS-Seeker2.
    In this case it is using Bowtie2 as the underlying aligner.

    Parameters
    ----------
    input_files : dict
        genome : str
            Location of the genome FASTA file
        index : str
            Location of the index files in .tar.gz file
        fastq1 : str
            Location of the FASTQ file
        fastq2 : str, optional
            Location of the paired end FASTQ file
    input_metadata : dict
        Metadata objects matching the input_files
    output_files : dict
        bam : str
            Location for the aligned reads (BAM)
        bai : str
            Location for the matching BAM index

    Returns
    -------
    output_files : dict
        Locations of the output files (empty dict on failure)
    output_metadata : dict
        Matching Metadata objects (empty dict on failure)
    """
    try:
        if "bss_path" in self.configuration:
            bss_path = self.configuration["bss_path"]
        else:
            raise KeyError
        if "aligner_path" in self.configuration:
            aligner_path = self.configuration["aligner_path"]
        else:
            raise KeyError
        if "aligner" in self.configuration:
            aligner = self.configuration["aligner"]
        else:
            raise KeyError
    except KeyError:
        logger.fatal(
            "WGBS - BS SEEKER2: Unassigned configuration variables")
        # Abort here: continuing would raise a NameError later on the
        # unbound bss_path/aligner_path/aligner variables. Matches the
        # other failure paths in this method.
        return {}, {}

    genome_fasta = input_files["genome"]
    genome_idx = input_files["index"]

    sources = [input_files["genome"]]

    fqs = fastq_splitter()

    fastq1 = input_files["fastq1"]
    sources.append(input_files["fastq1"])

    fastq_file_gz = fastq1 + ".tar.gz"

    # Split the input FASTQ file(s) into chunks so each chunk can be
    # aligned as a separate (parallelisable) task
    if "fastq2" in input_files:
        fastq2 = input_files["fastq2"]
        sources.append(input_files["fastq2"])
        fastq_file_list = fqs.paired_splitter(fastq1, fastq2, fastq_file_gz)
        aln_params = self.get_aln_params(self.configuration, True)
    else:
        fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)
        aln_params = self.get_aln_params(self.configuration)

    # Required to prevent iterating over the future objects
    fastq_file_list = compss_wait_on(fastq_file_list)
    if not fastq_file_list:
        logger.fatal("FASTQ SPLITTER: run failed")
        return {}, {}

    if hasattr(sys, '_run_from_cmdl') is True:
        # Running locally from the command line - the tar.gz is already
        # on the local file system
        pass
    else:
        # Running under COMPSS: stage the remote tar.gz back to the
        # local file system under the same path
        with compss_open(fastq_file_gz, "rb") as f_in:
            with open(fastq_file_gz, "wb") as f_out:
                f_out.write(f_in.read())

    gz_data_path = fastq_file_gz.split("/")
    gz_data_path = "/".join(gz_data_path[:-1])

    try:
        tar = tarfile.open(fastq_file_gz)
        tar.extractall(path=gz_data_path)
        tar.close()
    except tarfile.TarError:
        logger.fatal("Split FASTQ files: Malformed tar file")
        return {}, {}

    # input and output share most metadata
    output_metadata = {}

    output_bam_file = output_files["bam"]
    output_bai_file = output_files["bai"]

    # Align every chunk, accumulating the partial BAM file locations
    output_bam_list = []
    for fastq_file_pair in fastq_file_list:
        logger.info("TMP DIR: " + gz_data_path + "/tmp/")
        if "fastq2" in input_files:
            tmp_fq1 = gz_data_path + "/tmp/" + fastq_file_pair[0]
            tmp_fq2 = gz_data_path + "/tmp/" + fastq_file_pair[1]
            logger.info("TMP_FQ1: " + fastq_file_pair[0])
            logger.info("TMP_FQ2: " + fastq_file_pair[1])
            output_bam_file_tmp = tmp_fq1 + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            self.bs_seeker_aligner(
                tmp_fq1, tmp_fq2,
                aligner, aligner_path, bss_path, aln_params,
                genome_fasta, genome_idx,
                output_bam_file_tmp)
        else:
            tmp_fq = gz_data_path + "/tmp/" + fastq_file_pair[0]
            logger.info("TMP_FQ: " + fastq_file_pair[0])
            output_bam_file_tmp = tmp_fq + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            self.bs_seeker_aligner_single(
                tmp_fq,
                aligner, aligner_path, bss_path, aln_params,
                genome_fasta, genome_idx,
                output_bam_file_tmp)

    bam_handle = bamUtilsTask()

    # Merge the per-chunk BAMs into the first list entry, sort it, then
    # copy it to the requested output location and index it
    logger.info("Merging bam files")
    bam_handle.bam_merge(output_bam_list)
    logger.info("Sorting merged bam file")
    bam_handle.bam_sort(output_bam_list[0])
    logger.info("Copying bam file into the output file")
    bam_handle.bam_copy(output_bam_list[0], output_bam_file)
    logger.info("Creating output bam index file")
    bam_handle.bam_index(output_bam_file, output_bai_file)

    output_metadata = {
        "bam": Metadata(
            data_type="data_wgbs",
            file_type="BAM",
            file_path=output_bam_file,
            sources=sources,
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bs_seeker_aligner"
            }),
        "bai": Metadata(
            data_type="data_wgbs",
            file_type="BAI",
            file_path=output_bai_file,
            sources=[input_metadata["genome"].file_path],
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bs_seeker_aligner"
            })
    }

    return (output_files, output_metadata)
def run(self, input_files, input_metadata, output_files):
    """
    Tool for methylation calling using BS-Seeker2.

    Parameters
    ----------
    input_files : dict
        bam : str
            Sorted BAM file with the sequence alignments
        index : str
            Location of the index file
    input_metadata : dict
        Metadata objects matching the input_files
    output_files : dict
        Locations for the wig_file, cgmap_file and atcgmap_file outputs

    Returns
    -------
    output_files : dict
        Locations of the output files (empty dict on failure)
    output_metadata : dict
        Matching Metadata objects (empty dict on failure)
    """
    try:
        if "bss_path" in self.configuration:
            bss_path = self.configuration["bss_path"]
        else:
            raise KeyError
    except KeyError:
        logger.fatal(
            "WGBS - BS SEEKER2: Unassigned configuration variables")
        # Abort here: continuing would raise a NameError below on the
        # unbound bss_path variable
        return {}, {}

    # Sanity check on the BAM header before handing it to the caller
    bam_handler = bamUtilsTask()
    bam_handler.check_header(input_files["bam"])

    self.bss_methylation_caller(
        bss_path,
        input_files["bam"],
        input_files["index"],
        self.get_params(self.configuration),
        output_files["wig_file"],
        output_files["cgmap_file"],
        output_files["atcgmap_file"])

    output_metadata = {
        "wig_file": Metadata(
            data_type="data_wgbs",
            file_type="wig",
            file_path=output_files["wig_file"],
            sources=input_metadata["bam"].sources,
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bs_seeker_methylation_caller"
            }),
        "cgmap_file": Metadata(
            data_type="data_wgbs",
            file_type="tsv",
            file_path=output_files["cgmap_file"],
            sources=input_metadata["bam"].sources,
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bs_seeker_methylation_caller"
            }),
        "atcgmap_file": Metadata(
            data_type="data_wgbs",
            file_type="tsv",
            file_path=output_files["atcgmap_file"],
            sources=input_metadata["bam"].sources,
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bs_seeker_methylation_caller"
            })
    }

    return (output_files, output_metadata)
def run(self, input_files, input_metadata, output_files):
    """
    The main function to align FASTQ reads to a genome using BWA MEM.

    Parameters
    ----------
    input_files : dict
        genome : str
            Location of the genome FASTA file
        index : str
            Location of the index files in .tar.gz file
        loc : str
            Location of the FASTQ file
        fastq2 : str, optional
            Location of the paired end FASTQ file
    input_metadata : dict
        Metadata objects matching the input_files
    output_files : dict
        output : str
            Location for the aligned reads (BAM)

    Returns
    -------
    output_files : dict
        "bam" pointing at the aligned reads file (empty dict on failure)
    output_metadata : dict
        Matching Metadata object for the BAM file (empty dict on failure)
    """
    fqs = fastq_splitter()

    fastq1 = input_files["loc"]
    fastq_file_gz = str(fastq1 + ".tar.gz")

    # Split the input FASTQ file(s) into chunks so each chunk can be
    # aligned as a separate (parallelisable) task
    if "fastq2" in input_files:
        fastq2 = input_files["fastq2"]
        fastq_file_list = fqs.paired_splitter(fastq1, fastq2, fastq_file_gz)
    else:
        fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)

    # Required to prevent iterating over the future objects
    fastq_file_list = compss_wait_on(fastq_file_list)
    if not fastq_file_list:
        logger.fatal("FASTQ SPLITTER: run failed")
        return {}, {}

    if hasattr(sys, '_run_from_cmdl') is True:
        # Running locally from the command line - the tar.gz is already
        # on the local file system
        pass
    else:
        # Running under COMPSS: stage the remote tar.gz back to the
        # local file system under the same path
        logger.info("Getting the tar file")
        with compss_open(fastq_file_gz, "rb") as f_in:
            with open(fastq_file_gz, "wb") as f_out:
                f_out.write(f_in.read())

    gz_data_path = fastq_file_gz.split("/")
    gz_data_path = "/".join(gz_data_path[:-1])

    try:
        tar = tarfile.open(fastq_file_gz)
        tar.extractall(path=gz_data_path)
        tar.close()
    except tarfile.TarError:
        logger.fatal("Split FASTQ files: Malformed tar file")
        return {}, {}

    output_bam_file = output_files["output"]

    logger.info("BWA ALIGNER: Aligning sequence reads to the genome")

    # Align every chunk, accumulating the partial BAM file locations
    output_bam_list = []
    for fastq_file_pair in fastq_file_list:
        if "fastq2" in input_files:
            tmp_fq1 = gz_data_path + "/tmp/" + fastq_file_pair[0]
            tmp_fq2 = gz_data_path + "/tmp/" + fastq_file_pair[1]
            output_bam_file_tmp = tmp_fq1 + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            logger.info("BWA MEM FILES: " + tmp_fq1 + " - " + tmp_fq2)
            self.bwa_aligner_paired(
                str(input_files["genome"]), tmp_fq1, tmp_fq2,
                output_bam_file_tmp,
                str(input_files["index"]),
                self.get_mem_params(self.configuration))
        else:
            tmp_fq = gz_data_path + "/tmp/" + fastq_file_pair[0]
            output_bam_file_tmp = tmp_fq + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            logger.info("BWA MEM FILES: " + tmp_fq)
            self.bwa_aligner_single(
                str(input_files["genome"]), tmp_fq,
                output_bam_file_tmp,
                str(input_files["index"]),
                self.get_mem_params(self.configuration))

    bam_handle = bamUtilsTask()

    # Merge the per-chunk BAMs into the first list entry, sort it, then
    # copy it to the requested output location
    logger.info("Merging bam files")
    bam_handle.bam_merge(output_bam_list)
    logger.info("Sorting merged bam file")
    bam_handle.bam_sort(output_bam_list[0])
    logger.info("Copying bam file into the output file")
    bam_handle.bam_copy(output_bam_list[0], output_bam_file)

    logger.info("BWA ALIGNER: Alignments complete")

    # input and output share most metadata
    output_metadata = {
        "bam": Metadata(
            data_type=input_metadata['loc'].data_type,
            file_type="BAM",
            file_path=output_files["output"],
            sources=[
                input_metadata["genome"].file_path,
                input_metadata['loc'].file_path
            ],
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bwa_aligner"
            })
    }

    return ({"bam": output_files["output"]}, output_metadata)
def run(self, input_files, input_metadata, output_files):
    """
    The main function to run MACS 2 for peak calling over a given BAM file
    and (optionally) a matching background BAM file.

    Peak calling is run per chromosome; the per-chromosome result files
    are then concatenated into the final output files.

    Parameters
    ----------
    input_files : dict
        bam : str
            Location of the BAM data file
        bam_bg : str, optional
            Location of the matching background BAM file
    input_metadata : dict
        Metadata objects matching the input_files
    output_files : dict
        Locations for narrow_peak, summits, broad_peak and gapped_peak

    Returns
    -------
    output_files_created : dict
        Locations of the output files that were actually generated
        (non-empty files only)
    output_metadata : dict
        Matching Metadata objects
    """
    # Sample name is the BAM file name without its .bam extension
    root_name = input_files['bam'].split("/")
    root_name[-1] = root_name[-1].replace('.bam', '')
    name = root_name[-1]

    # BED sub-types reported in the metadata for each result file
    output_bed_types = {
        'narrow_peak': "bed4+1",
        'summits': "bed6+4",
        'broad_peak': "bed6+3",
        'gapped_peak': "bed12+3"
    }

    command_params = self.get_macs2_params(self.configuration)

    bam_utils_handle = bamUtilsTask()
    bam_utils_handle.bam_index(input_files['bam'], input_files['bam'] + '.bai')
    if 'bam_bg' in input_files:
        bam_utils_handle.bam_index(
            input_files['bam_bg'], input_files['bam_bg'] + '.bai')

    # Required to prevent iterating over a future object
    chr_list = bam_utils_handle.bam_list_chromosomes(input_files['bam'])
    chr_list = compss_wait_on(chr_list)

    logger.info("MACS2 COMMAND PARAMS: " + ", ".join(command_params))

    # Run the peak calling once per chromosome, writing per-chromosome
    # result files named "<output>.<chromosome>"
    for chromosome in chr_list:
        if 'bam_bg' in input_files:
            self.macs2_peak_calling(
                name, str(input_files['bam']),
                str(input_files['bam']) + '.bai',
                str(input_files['bam_bg']),
                str(input_files['bam_bg']) + '.bai',
                command_params,
                str(output_files['narrow_peak']) + "." + str(chromosome),
                str(output_files['summits']) + "." + str(chromosome),
                str(output_files['broad_peak']) + "." + str(chromosome),
                str(output_files['gapped_peak']) + "." + str(chromosome),
                chromosome)
        else:
            self.macs2_peak_calling_nobgd(
                name, str(input_files['bam']),
                str(input_files['bam']) + '.bai',
                command_params,
                str(output_files['narrow_peak']) + "." + str(chromosome),
                str(output_files['summits']) + "." + str(chromosome),
                str(output_files['broad_peak']) + "." + str(chromosome),
                str(output_files['gapped_peak']) + "." + str(chromosome),
                chromosome)

    # When running from the command line the per-chromosome files are on
    # the local file system; under COMPSS they must be fetched with
    # compss_open
    opener = open if hasattr(sys, '_run_from_cmdl') else compss_open

    # Merge the results files into single files. The per-chromosome
    # chunks are read as bytes, so the outputs are opened in binary mode
    # ('w' + bytes would raise a TypeError on Python 3).
    with open(output_files['narrow_peak'], 'wb') as file_np_handle, \
            open(output_files['summits'], 'wb') as file_s_handle, \
            open(output_files['broad_peak'], 'wb') as file_bp_handle, \
            open(output_files['gapped_peak'], 'wb') as file_gp_handle:
        result_handles = [
            ('narrow_peak', file_np_handle),
            ('summits', file_s_handle),
            ('broad_peak', file_bp_handle),
            ('gapped_peak', file_gp_handle)
        ]
        for chromosome in chr_list:
            for result_file, out_handle in result_handles:
                chunk_path = (
                    output_files[result_file] + "." + str(chromosome))
                with opener(chunk_path, 'rb') as file_in_handle:
                    out_handle.write(file_in_handle.read())

    output_files_created = {}
    output_metadata = {}

    # Only report files that were actually produced and are non-empty
    for result_file in output_files:
        if (os.path.isfile(output_files[result_file]) is True
                and os.path.getsize(output_files[result_file]) > 0):
            output_files_created[result_file] = output_files[result_file]

            sources = [input_metadata["bam"].file_path]
            if 'bam_bg' in input_files:
                sources.append(input_metadata["bam_bg"].file_path)

            output_metadata[result_file] = Metadata(
                data_type="data_chip_seq",
                file_type="BED",
                file_path=output_files[result_file],
                sources=sources,
                taxon_id=input_metadata["bam"].taxon_id,
                meta_data={
                    "assembly": input_metadata["bam"].meta_data["assembly"],
                    "tool": "macs2",
                    "bed_type": output_bed_types[result_file]
                })

    # Use lazy %-style formatting - the original passed output_files as a
    # stray positional argument with no placeholder, which breaks the
    # logging call
    logger.info("MACS2: GENERATED FILES: %s", output_files)

    return (output_files_created, output_metadata)