def work(self):
    """
    Worker function for splitting the FASTQ file into smaller chunks

    Reads the attributes set on the instance, splits the paired FASTQ
    files and records each resulting chunk pair (tab separated) in
    ``<dir>/tmp/fastq_file_log.txt``.

    Parameters
    ----------
    in_fastq_file_1 : str
        Location of the FASTQ file to split
    in_fastq_file_2 : str
        Location of the FASTQ file to split
    fastq_chunk_size : int
        Number of reads that each FASTQ chunk should contain
    """
    splitter = fastq_splitter({
        "fastq_chunk_size": self.fastq_chunk_size,
        "no-untar": True
    })
    chunk_pairs = splitter.paired_splitter(
        self.in_fastq_file_1,
        self.in_fastq_file_2,
        self.in_fastq_file_1 + ".tar.gz")

    # Directory containing the first FASTQ file; chunks land in its tmp/
    parent_dir = "/".join(self.in_fastq_file_1.split("/")[0:-1])

    with open(parent_dir + "/tmp/fastq_file_log.txt", "w") as log_file:
        for chunk_1, chunk_2 in chunk_pairs:
            log_file.write(
                parent_dir + "/tmp/" + chunk_1 + "\t" +
                parent_dir + "/tmp/" + chunk_2 + "\n")
def test_paired_splitter():
    """
    Function to test paired splitter
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data/")
    fq1 = data_dir + "bsSeeker.Mouse.SRR892982_1.fastq"
    fq2 = data_dir + "bsSeeker.Mouse.SRR892982_2.fastq"

    splitter = fastq_splitter()

    input_files = {
        "fastq1": fq1,
        "fastq2": fq2
    }
    input_metadata = {
        "fastq1": Metadata(
            "data_rnaseq", "fastq", [], None,
            {'assembly': 'test'}),
        "fastq2": Metadata(
            "data_rnaseq", "fastq", [], None,
            {'assembly': 'test'})
    }
    output_files = {"output": fq1 + ".tar.gz"}

    results = splitter.run(input_files, input_metadata, output_files)

    print("WGBS - PAIRED RESULTS:", results)

    # The splitter must produce a non-empty tar.gz of FASTQ chunks
    assert os.path.isfile(results[0]["output"]) is True
    assert os.path.getsize(results[0]["output"]) > 0
def test_single_splitter():
    """
    Function to test single splitter
    """
    resource_path = os.path.join(os.path.dirname(__file__), "data/")
    fastq_2file = resource_path + "bsSeeker.Mouse.GRCm38_2.fastq"

    fqs_handle = fastq_splitter()

    # FIX: run() takes dicts of input files, metadata and output files
    # (see test_paired_splitter). The previous call passed bare lists
    # ([fastq_2file], [], {}), which fails as soon as run() indexes
    # input_files by string key, and asserted on results[0] instead of
    # the returned output-files dict.
    results = fqs_handle.run(
        {"fastq1": fastq_2file},
        {
            "fastq1": Metadata(
                "data_rnaseq", "fastq", [], None,
                {'assembly': 'test'})
        },
        {"output": fastq_2file + ".tar.gz"}
    )

    print("WGBS - SINGLE RESULTS:", results)

    # The splitter must produce a non-empty tar.gz of FASTQ chunks
    assert os.path.isfile(results[0]["output"]) is True
    assert os.path.getsize(results[0]["output"]) > 0
def run(self, input_files, input_metadata, output_files):  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    """
    Align WGBS FASTQ reads against an indexed genome assembly using
    BS-Seeker2 (in this case driven via Bowtie2).

    The FASTQ input is split into chunks, each chunk is aligned
    separately, and the per-chunk BAM files are merged, sorted and
    indexed into the requested output files.

    Parameters
    ----------
    input_files : dict
        "genome", "index", "fastq1" and optionally "fastq2" file
        locations
    input_metadata : dict
        Matching Metadata objects; "genome" metadata is read here
    output_files : dict
        "bam" and "bai" output file locations

    Returns
    -------
    (dict, dict)
        The output_files dict and the matching output metadata, or two
        empty dicts on failure
    """
    # Pull the mandatory tool locations from the configuration; a missing
    # key is reported as fatal.
    try:
        if "bss_path" in self.configuration:
            bss_path = self.configuration["bss_path"]
        else:
            raise KeyError
        if "aligner_path" in self.configuration:
            aligner_path = self.configuration["aligner_path"]
        else:
            raise KeyError
        if "aligner" in self.configuration:
            aligner = self.configuration["aligner"]
        else:
            raise KeyError
    except KeyError:
        logger.fatal(
            "WGBS - BS SEEKER2: Unassigned configuration variables")
        # NOTE(review): execution falls through here with bss_path /
        # aligner_path / aligner possibly unbound, which would raise
        # NameError later -- confirm whether logger.fatal is expected to
        # abort the run.

    genome_fasta = input_files["genome"]
    genome_idx = input_files["index"]

    # Provenance sources recorded in the output Metadata
    sources = [input_files["genome"]]

    fqs = fastq_splitter()

    fastq1 = input_files["fastq1"]
    sources.append(input_files["fastq1"])
    fastq_file_gz = fastq1 + ".tar.gz"

    # Paired- vs single-ended input selects both the splitter and the
    # alignment parameters.
    if "fastq2" in input_files:
        fastq2 = input_files["fastq2"]
        sources.append(input_files["fastq2"])
        fastq_file_list = fqs.paired_splitter(
            fastq1, fastq2, fastq_file_gz)
        aln_params = self.get_aln_params(self.configuration, True)
    else:
        fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)
        aln_params = self.get_aln_params(self.configuration)

    # Required to prevent iterating over the future objects
    fastq_file_list = compss_wait_on(fastq_file_list)

    if not fastq_file_list:
        logger.fatal("FASTQ SPLITTER: run failed")
        return {}, {}

    if hasattr(sys, '_run_from_cmdl') is True:
        # Running from the command line: the tar.gz is already local.
        pass
    else:
        # Under COMPSs: stage the remote tar.gz back to the local path.
        # NOTE(review): this reads and writes the same path; presumably
        # compss_open resolves to a staged copy -- confirm.
        with compss_open(fastq_file_gz, "rb") as f_in:
            with open(fastq_file_gz, "wb") as f_out:
                f_out.write(f_in.read())

    # Directory containing the archive; chunks are extracted into
    # <gz_data_path>/tmp/ by the splitter's tar layout.
    gz_data_path = fastq_file_gz.split("/")
    gz_data_path = "/".join(gz_data_path[:-1])

    try:
        tar = tarfile.open(fastq_file_gz)
        tar.extractall(path=gz_data_path)
        tar.close()
    except tarfile.TarError:
        logger.fatal("Split FASTQ files: Malformed tar file")
        return {}, {}

    # input and output share most metadata
    output_metadata = {}

    output_bam_file = output_files["bam"]
    output_bai_file = output_files["bai"]

    # Align each FASTQ chunk individually, collecting the per-chunk BAM
    # locations for the merge step below.
    output_bam_list = []
    for fastq_file_pair in fastq_file_list:
        logger.info("TMP DIR: " + gz_data_path + "/tmp/")
        if "fastq2" in input_files:
            tmp_fq1 = gz_data_path + "/tmp/" + fastq_file_pair[0]
            tmp_fq2 = gz_data_path + "/tmp/" + fastq_file_pair[1]
            logger.info("TMP_FQ1: " + fastq_file_pair[0])
            logger.info("TMP_FQ2: " + fastq_file_pair[1])
            output_bam_file_tmp = tmp_fq1 + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            self.bs_seeker_aligner(
                tmp_fq1, tmp_fq2,
                aligner, aligner_path, bss_path, aln_params,
                genome_fasta, genome_idx,
                output_bam_file_tmp)
        else:
            tmp_fq = gz_data_path + "/tmp/" + fastq_file_pair[0]
            logger.info("TMP_FQ: " + fastq_file_pair[0])
            output_bam_file_tmp = tmp_fq + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            self.bs_seeker_aligner_single(
                tmp_fq,
                aligner, aligner_path, bss_path, aln_params,
                genome_fasta, genome_idx,
                output_bam_file_tmp)

    bam_handle = bamUtilsTask()

    # Merge all chunk BAMs into the first list entry, sort it, copy the
    # result to the requested output path, then index it.
    logger.info("Merging bam files")
    bam_handle.bam_merge(output_bam_list)

    logger.info("Sorting merged bam file")
    bam_handle.bam_sort(output_bam_list[0])

    logger.info("Copying bam file into the output file")
    bam_handle.bam_copy(output_bam_list[0], output_bam_file)

    logger.info("Creating output bam index file")
    bam_handle.bam_index(output_bam_file, output_bai_file)

    output_metadata = {
        "bam": Metadata(
            data_type="data_wgbs",
            file_type="BAM",
            file_path=output_bam_file,
            sources=sources,
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bs_seeker_aligner"
            }),
        "bai": Metadata(
            data_type="data_wgbs",
            file_type="BAI",
            file_path=output_bai_file,
            sources=[input_metadata["genome"].file_path],
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bs_seeker_aligner"
            })
    }

    return (output_files, output_metadata)
def run(self, input_files, input_metadata, output_files):
    """
    The main function to align FASTQ reads to a genome using BWA MEM.

    The FASTQ input is split into chunks, each chunk is aligned
    separately, and the per-chunk BAM files are merged and sorted into
    the requested output BAM.

    Parameters
    ----------
    input_files : dict
        "genome", "index", "loc" (FASTQ 1) and optionally "fastq2"
        file locations
    input_metadata : dict
        Matching Metadata objects; "genome" and "loc" are read here
    output_files : dict
        "output" BAM file location

    Returns
    -------
    output_files : dict
        {"bam": <output BAM location>}
    output_metadata : dict
        Matching Metadata for the output BAM; two empty dicts on failure
    """
    # Provenance sources recorded in the output Metadata
    sources = [input_files["genome"]]

    fqs = fastq_splitter()

    fastq1 = input_files["loc"]
    sources.append(input_files["loc"])
    fastq_file_gz = str(fastq1 + ".tar.gz")

    # Paired- vs single-ended input selects the splitter used.
    if "fastq2" in input_files:
        fastq2 = input_files["fastq2"]
        sources.append(input_files["fastq2"])
        fastq_file_list = fqs.paired_splitter(
            fastq1, fastq2, fastq_file_gz)
    else:
        fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)

    # Required to prevent iterating over the future objects
    fastq_file_list = compss_wait_on(fastq_file_list)

    if not fastq_file_list:
        logger.fatal("FASTQ SPLITTER: run failed")
        return {}, {}

    if hasattr(sys, '_run_from_cmdl') is True:
        # Running from the command line: the tar.gz is already local.
        pass
    else:
        # Under COMPSs: stage the remote tar.gz back to the local path.
        # NOTE(review): this reads and writes the same path; presumably
        # compss_open resolves to a staged copy -- confirm.
        logger.info("Getting the tar file")
        with compss_open(fastq_file_gz, "rb") as f_in:
            with open(fastq_file_gz, "wb") as f_out:
                f_out.write(f_in.read())

    # Directory containing the archive; chunks are extracted into
    # <gz_data_path>/tmp/ by the splitter's tar layout.
    gz_data_path = fastq_file_gz.split("/")
    gz_data_path = "/".join(gz_data_path[:-1])

    try:
        tar = tarfile.open(fastq_file_gz)
        tar.extractall(path=gz_data_path)
        tar.close()
    except tarfile.TarError:
        logger.fatal("Split FASTQ files: Malformed tar file")
        return {}, {}

    # input and output share most metadata
    output_metadata = {}

    output_bam_file = output_files["output"]
    # output_bai_file = output_files["bai"]

    logger.info("BWA ALIGNER: Aligning sequence reads to the genome")

    # Align each FASTQ chunk individually, collecting the per-chunk BAM
    # locations for the merge step below.
    output_bam_list = []
    for fastq_file_pair in fastq_file_list:
        if "fastq2" in input_files:
            tmp_fq1 = gz_data_path + "/tmp/" + fastq_file_pair[0]
            tmp_fq2 = gz_data_path + "/tmp/" + fastq_file_pair[1]
            output_bam_file_tmp = tmp_fq1 + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            logger.info("BWA MEM FILES: " + tmp_fq1 + " - " + tmp_fq2)

            self.bwa_aligner_paired(
                str(input_files["genome"]), tmp_fq1, tmp_fq2,
                output_bam_file_tmp,
                str(input_files["index"]),
                self.get_mem_params(self.configuration))
        else:
            tmp_fq = gz_data_path + "/tmp/" + fastq_file_pair[0]
            output_bam_file_tmp = tmp_fq + ".bam"
            output_bam_list.append(output_bam_file_tmp)

            logger.info("BWA MEM FILES: " + tmp_fq)

            self.bwa_aligner_single(
                str(input_files["genome"]), tmp_fq,
                output_bam_file_tmp,
                str(input_files["index"]),
                self.get_mem_params(self.configuration))

    bam_handle = bamUtilsTask()

    # Merge all chunk BAMs into the first list entry, sort it, then copy
    # the result to the requested output path.
    logger.info("Merging bam files")
    bam_handle.bam_merge(output_bam_list)

    logger.info("Sorting merged bam file")
    bam_handle.bam_sort(output_bam_list[0])

    logger.info("Copying bam file into the output file")
    bam_handle.bam_copy(output_bam_list[0], output_bam_file)

    logger.info("BWA ALIGNER: Alignments complete")

    output_metadata = {
        "bam": Metadata(
            data_type=input_metadata['loc'].data_type,
            file_type="BAM",
            file_path=output_files["output"],
            sources=[
                input_metadata["genome"].file_path,
                input_metadata['loc'].file_path
            ],
            taxon_id=input_metadata["genome"].taxon_id,
            meta_data={
                "assembly": input_metadata["genome"].meta_data["assembly"],
                "tool": "bwa_aligner"
            })
    }

    return ({"bam": output_files["output"]}, output_metadata)