Ejemplo n.º 1
0
    def work(self):
        """
        Worker function for aligning paired end FASTQ reads using BWA MEM and
        sorting the resulting BAM file

        The method takes no arguments; every input is read from an attribute
        of the instance.

        Attributes used
        ---------------
        genome_fa : str
            Location of the FASTA file of the genome to align the reads to
        genome_idx : str
            Location of the genome index files (handed to the aligner as-is)
        fastq_file_1 : str
            Location of the first FASTQ file of the pair
        fastq_file_2 : str
            Location of the second FASTQ file of the pair
        output_bam : str
            Location of the aligned reads in bam format
        """
        # NOTE(review): "no-untar" presumably tells the tool that the index
        # archive does not need unpacking - confirm against bwaAlignerMEMTool.
        aligner = bwaAlignerMEMTool({"no-untar": True})

        # The trailing empty dict is the last positional argument of
        # bwa_aligner_paired; nothing is put into it here.
        aligner.bwa_aligner_paired(
            self.genome_fa,
            self.fastq_file_1,
            self.fastq_file_2,
            self.output_bam,
            self.genome_idx,
            {}
        )

        # Sort the BAM file written by the aligner.
        sorter = bamUtilsTask()
        sorter.bam_sort(self.output_bam)
Ejemplo n.º 2
0
    def work(self):
        """
        Worker function for aligning paired end FASTQ reads using Bowtie2 and
        sorting the resulting BAM file

        The method takes no arguments; every input is read from an attribute
        of the instance.

        Attributes used
        ---------------
        genome_fa : str
            Location of the FASTA file of the genome to align the reads to
        genome_idx : str
            Location of the genome index files (handed to the aligner as-is)
        fastq_file_1 : str
            Location of the first FASTQ file of the pair
        fastq_file_2 : str
            Location of the second FASTQ file of the pair
        output_bam : str
            Location of the aligned reads in bam format
        """
        # NOTE(review): "no-untar" presumably tells the tool that the index
        # archive does not need unpacking - confirm against
        # bowtie2AlignerTool.
        tool_config = {"no-untar": True}
        aligner = bowtie2AlignerTool(tool_config)

        # The trailing empty dict is the last positional argument of
        # bowtie2_aligner_paired; nothing is put into it here.
        aligner.bowtie2_aligner_paired(
            self.genome_fa, self.fastq_file_1, self.fastq_file_2,
            self.output_bam, self.genome_idx, {})

        # Sort the BAM file written by the aligner.
        sorter = bamUtilsTask()
        sorter.bam_sort(self.output_bam)
Ejemplo n.º 3
0
    def run(self, input_files, input_metadata, output_files):  # pylint: disable=too-many-locals,too-many-branches,too-many-statements
        """
        Tool for aligning FASTQ reads to the genome using BS-Seeker2.

        The FASTQ input is split into chunks with ``fastq_splitter``, every
        chunk (or pair of chunks) is aligned separately, and the per-chunk BAM
        files are then merged, sorted, copied to the requested output location
        and indexed.

        Parameters
        ----------
        input_files : dict
            Must contain the keys "genome", "index" and "fastq1". When the
            optional key "fastq2" is present the paired-end splitter and
            aligner are used.
        input_metadata : dict
            Only the "genome" entry is read (``taxon_id``, ``file_path`` and
            ``meta_data["assembly"]``).
        output_files : dict
            Must contain the keys "bam" and "bai" with the locations of the
            final BAM file and its index.

        Returns
        -------
        output_files : dict
            The ``output_files`` argument, returned unchanged on success.
        output_metadata : dict
            Metadata objects for the "bam" and "bai" outputs.

        ``({}, {})`` is returned when a required configuration value is
        missing, when the FASTQ splitter returns nothing, or when the archive
        of split FASTQ files cannot be read.
        """

        # All three settings are required. Stop here when one is missing
        # instead of carrying on and failing later on an unbound local name.
        try:
            bss_path = self.configuration["bss_path"]
            aligner_path = self.configuration["aligner_path"]
            aligner = self.configuration["aligner"]
        except KeyError:
            logger.fatal(
                "WGBS - BS SEEKER2: Unassigned configuration variables")
            return {}, {}

        genome_fasta = input_files["genome"]
        genome_idx = input_files["index"]
        fastq1 = input_files["fastq1"]
        paired = "fastq2" in input_files

        sources = [genome_fasta, fastq1]

        fqs = fastq_splitter()

        # The splitter is given this path as the location of the archive
        # holding the split FASTQ files.
        fastq_file_gz = fastq1 + ".tar.gz"
        if paired:
            fastq2 = input_files["fastq2"]
            sources.append(fastq2)
            fastq_file_list = fqs.paired_splitter(fastq1, fastq2,
                                                  fastq_file_gz)
            aln_params = self.get_aln_params(self.configuration, True)
        else:
            fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)
            aln_params = self.get_aln_params(self.configuration)

        # Required to prevent iterating over the future objects
        fastq_file_list = compss_wait_on(fastq_file_list)
        if not fastq_file_list:
            logger.fatal("FASTQ SPLITTER: run failed")
            return {}, {}

        # When sys._run_from_cmdl is not set, fetch the archive through
        # compss_open and write it to the local path before unpacking.
        if not hasattr(sys, '_run_from_cmdl'):
            with compss_open(fastq_file_gz, "rb") as f_in:
                with open(fastq_file_gz, "wb") as f_out:
                    f_out.write(f_in.read())

        # Directory part of the archive path; the chunks are unpacked there.
        gz_data_path = "/".join(fastq_file_gz.split("/")[:-1])

        try:
            # The context manager closes the archive even when extraction
            # fails part way through.
            with tarfile.open(fastq_file_gz) as tar:
                tar.extractall(path=gz_data_path)
        except tarfile.TarError:
            logger.fatal("Split FASTQ files: Malformed tar file")
            return {}, {}

        output_bam_file = output_files["bam"]
        output_bai_file = output_files["bai"]

        # The chunk file names are looked up in a "tmp" directory below the
        # extraction directory.
        tmp_dir = gz_data_path + "/tmp/"

        output_bam_list = []
        for fastq_file_pair in fastq_file_list:
            logger.info("TMP DIR: " + tmp_dir)
            if paired:
                tmp_fq1 = tmp_dir + fastq_file_pair[0]
                tmp_fq2 = tmp_dir + fastq_file_pair[1]
                logger.info("TMP_FQ1: " + fastq_file_pair[0])
                logger.info("TMP_FQ2: " + fastq_file_pair[1])
                output_bam_file_tmp = tmp_fq1 + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                self.bs_seeker_aligner(tmp_fq1, tmp_fq2, aligner, aligner_path,
                                       bss_path, aln_params, genome_fasta,
                                       genome_idx, output_bam_file_tmp)
            else:
                tmp_fq = tmp_dir + fastq_file_pair[0]
                logger.info("TMP_FQ: " + fastq_file_pair[0])
                output_bam_file_tmp = tmp_fq + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                self.bs_seeker_aligner_single(tmp_fq, aligner, aligner_path,
                                              bss_path, aln_params,
                                              genome_fasta, genome_idx,
                                              output_bam_file_tmp)

        bam_handle = bamUtilsTask()

        # NOTE(review): the sort and copy below operate on the first list
        # entry, so bam_merge presumably merges everything into that file -
        # confirm against bamUtilsTask.
        logger.info("Merging bam files")
        bam_handle.bam_merge(output_bam_list)

        logger.info("Sorting merged bam file")
        bam_handle.bam_sort(output_bam_list[0])

        logger.info("Copying bam file into the output file")
        bam_handle.bam_copy(output_bam_list[0], output_bam_file)

        logger.info("Creating output bam index file")
        bam_handle.bam_index(output_bam_file, output_bai_file)

        genome_meta = input_metadata["genome"]

        # The BAM entry lists the genome and FASTQ inputs as sources, the
        # index entry lists only the genome metadata file path.
        output_metadata = {
            "bam":
            Metadata(data_type="data_wgbs",
                     file_type="BAM",
                     file_path=output_bam_file,
                     sources=sources,
                     taxon_id=genome_meta.taxon_id,
                     meta_data={
                         "assembly": genome_meta.meta_data["assembly"],
                         "tool": "bs_seeker_aligner"
                     }),
            "bai":
            Metadata(data_type="data_wgbs",
                     file_type="BAI",
                     file_path=output_bai_file,
                     sources=[genome_meta.file_path],
                     taxon_id=genome_meta.taxon_id,
                     meta_data={
                         "assembly": genome_meta.meta_data["assembly"],
                         "tool": "bs_seeker_aligner"
                     })
        }

        return (output_files, output_metadata)
    def run(self, input_files, input_metadata, output_files):
        """
        Tool for methylation calling using BS-Seeker2.

        Parameters
        ----------
        input_files : dict
            Must contain the keys "bam" (the alignments) and "index" (the
            genome index); both are passed to ``bss_methylation_caller``.
        input_metadata : dict
            The "bam" entry provides the ``sources`` of every output, the
            "genome" entry provides ``taxon_id`` and
            ``meta_data["assembly"]``.
        output_files : dict
            Must contain the keys "wig_file", "cgmap_file" and
            "atcgmap_file" with the locations of the three result files.

        Returns
        -------
        output_files : dict
            The ``output_files`` argument, returned unchanged on success.
        output_metadata : dict
            Metadata objects for the three result files.

        ``({}, {})`` is returned when the "bss_path" configuration value is
        missing.
        """

        # The path setting is required. Stop here when it is missing instead
        # of carrying on and failing later on an unbound local name.
        try:
            bss_path = self.configuration["bss_path"]
        except KeyError:
            logger.fatal(
                "WGBS - BS SEEKER2: Unassigned configuration variables")
            return {}, {}

        bam_handler = bamUtilsTask()
        bam_handler.check_header(input_files["bam"])

        self.bss_methylation_caller(bss_path, input_files["bam"],
                                    input_files["index"],
                                    self.get_params(self.configuration),
                                    output_files["wig_file"],
                                    output_files["cgmap_file"],
                                    output_files["atcgmap_file"])

        # The three outputs share everything except their key, file type and
        # location, so the metadata is built from a single table.
        output_file_types = (
            ("wig_file", "wig"),
            ("cgmap_file", "tsv"),
            ("atcgmap_file", "tsv"),
        )

        output_metadata = {}
        for file_key, file_type in output_file_types:
            output_metadata[file_key] = Metadata(
                data_type="data_wgbs",
                file_type=file_type,
                file_path=output_files[file_key],
                sources=input_metadata["bam"].sources,
                taxon_id=input_metadata["genome"].taxon_id,
                meta_data={
                    "assembly":
                    input_metadata["genome"].meta_data["assembly"],
                    "tool": "bs_seeker_methylation_caller"
                })

        return (output_files, output_metadata)
    def run(self, input_files, input_metadata, output_files):
        """
        The main function to align FASTQ reads to a genome using BWA MEM

        The FASTQ input is split into chunks with ``fastq_splitter``, every
        chunk (or pair of chunks) is aligned separately, and the per-chunk BAM
        files are then merged, sorted and copied to the requested output
        location.

        Parameters
        ----------
        input_files : dict
            Must contain the keys "genome", "index" and "loc" (the FASTQ
            file). When the optional key "fastq2" is present the paired-end
            splitter and aligner are used.
        input_metadata : dict
            The "genome" and "loc" entries are read when building the output
            metadata.
        output_files : dict
            Must contain the key "output" with the location of the final BAM
            file.

        Returns
        -------
        output_files : dict
            Dictionary with the single key "bam" holding the location of the
            final BAM file.
        output_metadata : dict
            Dictionary with the single key "bam" holding the matching
            Metadata object.

        ``({}, {})`` is returned when the FASTQ splitter returns nothing or
        when the archive of split FASTQ files cannot be read.
        """

        # Resolve the loop-invariant inputs once; a missing key is reported
        # before any splitting or alignment work is started.
        genome_fa = str(input_files["genome"])
        genome_idx = str(input_files["index"])
        fastq1 = input_files["loc"]
        paired = "fastq2" in input_files

        fqs = fastq_splitter()

        # The splitter is given this path as the location of the archive
        # holding the split FASTQ files.
        fastq_file_gz = str(fastq1 + ".tar.gz")
        if paired:
            fastq_file_list = fqs.paired_splitter(
                fastq1, input_files["fastq2"], fastq_file_gz)
        else:
            fastq_file_list = fqs.single_splitter(fastq1, fastq_file_gz)

        # Required to prevent iterating over the future objects
        fastq_file_list = compss_wait_on(fastq_file_list)
        if not fastq_file_list:
            logger.fatal("FASTQ SPLITTER: run failed")
            return {}, {}

        # When sys._run_from_cmdl is not set, fetch the archive through
        # compss_open and write it to the local path before unpacking.
        if not hasattr(sys, '_run_from_cmdl'):
            logger.info("Getting the tar file")
            with compss_open(fastq_file_gz, "rb") as f_in:
                with open(fastq_file_gz, "wb") as f_out:
                    f_out.write(f_in.read())

        # Directory part of the archive path; the chunks are unpacked there.
        gz_data_path = "/".join(fastq_file_gz.split("/")[:-1])

        try:
            # The context manager closes the archive even when extraction
            # fails part way through.
            with tarfile.open(fastq_file_gz) as tar:
                tar.extractall(path=gz_data_path)
        except tarfile.TarError:
            logger.fatal("Split FASTQ files: Malformed tar file")
            return {}, {}

        output_bam_file = output_files["output"]

        logger.info("BWA ALIGNER: Aligning sequence reads to the genome")

        # The chunk file names are looked up in a "tmp" directory below the
        # extraction directory.
        tmp_dir = gz_data_path + "/tmp/"

        output_bam_list = []
        for fastq_file_pair in fastq_file_list:
            if paired:
                tmp_fq1 = tmp_dir + fastq_file_pair[0]
                tmp_fq2 = tmp_dir + fastq_file_pair[1]
                output_bam_file_tmp = tmp_fq1 + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                logger.info("BWA MEM FILES: " + tmp_fq1 + " - " + tmp_fq2)
                self.bwa_aligner_paired(
                    genome_fa, tmp_fq1, tmp_fq2,
                    output_bam_file_tmp, genome_idx,
                    self.get_mem_params(self.configuration))
            else:
                tmp_fq = tmp_dir + fastq_file_pair[0]
                output_bam_file_tmp = tmp_fq + ".bam"
                output_bam_list.append(output_bam_file_tmp)

                logger.info("BWA MEM FILES: " + tmp_fq)
                self.bwa_aligner_single(
                    genome_fa, tmp_fq, output_bam_file_tmp,
                    genome_idx,
                    self.get_mem_params(self.configuration))

        bam_handle = bamUtilsTask()

        # NOTE(review): the sort and copy below operate on the first list
        # entry, so bam_merge presumably merges everything into that file -
        # confirm against bamUtilsTask.
        logger.info("Merging bam files")
        bam_handle.bam_merge(output_bam_list)

        logger.info("Sorting merged bam file")
        bam_handle.bam_sort(output_bam_list[0])

        logger.info("Copying bam file into the output file")
        bam_handle.bam_copy(output_bam_list[0], output_bam_file)

        logger.info("BWA ALIGNER: Alignments complete")

        # NOTE(review): the recorded sources are the genome and the "loc"
        # FASTQ only; a "fastq2" input is not listed - confirm whether that is
        # intended.
        output_metadata = {
            "bam":
            Metadata(data_type=input_metadata['loc'].data_type,
                     file_type="BAM",
                     file_path=output_bam_file,
                     sources=[
                         input_metadata["genome"].file_path,
                         input_metadata['loc'].file_path
                     ],
                     taxon_id=input_metadata["genome"].taxon_id,
                     meta_data={
                         "assembly":
                         input_metadata["genome"].meta_data["assembly"],
                         "tool": "bwa_aligner"
                     })
        }

        return ({"bam": output_bam_file}, output_metadata)
Ejemplo n.º 6
0
    def run(self, input_files, input_metadata, output_files):
        """
        The main function to run MACS 2 for peak calling over a given BAM file
        and, optionally, a matching background BAM file.

        Both BAM files are indexed, peak calling is run once per chromosome
        listed for the data BAM file, and the per-chromosome result files are
        concatenated into the four requested output files.

        Parameters
        ----------
        input_files : dict
            Must contain the key "bam" (the data BAM file). The optional key
            "bam_bg" is the matching background BAM file.
        input_metadata : dict
            The "bam" entry is read for the output metadata; the "bam_bg"
            entry is read as well when a background file was given.
        output_files : dict
            Must contain the keys "narrow_peak", "summits", "broad_peak" and
            "gapped_peak" with the locations of the merged result files.

        Returns
        -------
        output_files : dict
            The entries of ``output_files`` whose file exists and is not
            empty after merging.
        output_metadata : dict
            Matching Metadata objects, keyed like the returned files.
        """
        # Name passed to MACS2: base name of the BAM file without ".bam".
        name = input_files['bam'].split("/")[-1].replace('.bam', '')

        # Output key -> bed_type recorded in the metadata. The tuple below
        # fixes the order in which the outputs are merged.
        output_bed_types = {
            'narrow_peak': "bed4+1",
            'summits': "bed6+4",
            'broad_peak': "bed6+3",
            'gapped_peak': "bed12+3"
        }
        peak_keys = ('narrow_peak', 'summits', 'broad_peak', 'gapped_peak')

        has_background = 'bam_bg' in input_files

        command_params = self.get_macs2_params(self.configuration)

        bam_utils_handle = bamUtilsTask()
        bam_utils_handle.bam_index(input_files['bam'],
                                   input_files['bam'] + '.bai')
        if has_background:
            bam_utils_handle.bam_index(input_files['bam_bg'],
                                       input_files['bam_bg'] + '.bai')

        chr_list = bam_utils_handle.bam_list_chromosomes(input_files['bam'])
        chr_list = compss_wait_on(chr_list)

        logger.info("MACS2 COMMAND PARAMS: " + ", ".join(command_params))

        # One peak calling run per chromosome; each run writes to
        # "<output file>.<chromosome>".
        for chromosome in chr_list:
            chr_outputs = [
                str(output_files[key]) + "." + str(chromosome)
                for key in peak_keys
            ]
            if has_background:
                self.macs2_peak_calling(
                    name, str(input_files['bam']),
                    str(input_files['bam']) + '.bai',
                    str(input_files['bam_bg']),
                    str(input_files['bam_bg']) + '.bai', command_params,
                    chr_outputs[0], chr_outputs[1],
                    chr_outputs[2], chr_outputs[3],
                    chromosome)
            else:
                self.macs2_peak_calling_nobgd(
                    name, str(input_files['bam']),
                    str(input_files['bam']) + '.bai', command_params,
                    chr_outputs[0], chr_outputs[1],
                    chr_outputs[2], chr_outputs[3],
                    chromosome)

        # Per-chromosome files are read with the builtin open when
        # sys._run_from_cmdl is set and through compss_open otherwise.
        if hasattr(sys, '_run_from_cmdl'):
            chr_file_opener = open
        else:
            chr_file_opener = compss_open

        # Merge the results files into single files. The chunks are read as
        # bytes, so the merged files are opened in binary mode as well;
        # writing bytes to a text-mode handle raises TypeError on Python 3.
        with open(output_files['narrow_peak'], 'wb') as file_np_handle, \
                open(output_files['summits'], 'wb') as file_s_handle, \
                open(output_files['broad_peak'], 'wb') as file_bp_handle, \
                open(output_files['gapped_peak'], 'wb') as file_gp_handle:
            merged_handles = (file_np_handle, file_s_handle,
                              file_bp_handle, file_gp_handle)
            for chromosome in chr_list:
                for key, file_out_handle in zip(peak_keys, merged_handles):
                    chr_file = output_files[key] + "." + str(chromosome)
                    with chr_file_opener(chr_file, 'rb') as file_in_handle:
                        file_out_handle.write(file_in_handle.read())

        # Only files that exist and are not empty are reported back.
        output_files_created = {}
        output_metadata = {}
        for result_file in output_files:
            result_path = output_files[result_file]
            if not os.path.isfile(result_path):
                continue
            if os.path.getsize(result_path) <= 0:
                continue

            output_files_created[result_file] = result_path

            sources = [input_metadata["bam"].file_path]
            if has_background:
                sources.append(input_metadata["bam_bg"].file_path)
            output_metadata[result_file] = Metadata(
                data_type="data_chip_seq",
                file_type="BED",
                file_path=result_path,
                sources=sources,
                taxon_id=input_metadata["bam"].taxon_id,
                meta_data={
                    "assembly":
                    input_metadata["bam"].meta_data["assembly"],
                    "tool": "macs2",
                    "bed_type": output_bed_types[result_file]
                })

        # The file listing is appended to the message itself: the message has
        # no format placeholder, so a separate argument would not be shown.
        logger.info('MACS2: GENERATED FILES: ' + str(output_files))

        return (output_files_created, output_metadata)