Example #1
0
def test_bs_seeker_filter_02():
    """
    Check that the BS-Seeker read filter tool can be run on a FASTQ file.

    The tool is pointed at ``data/bsSeeker.Mouse.SRR892982_2.fastq`` (next to
    this test module) and is expected to leave a non-empty filtered FASTQ
    file in the same directory.
    """
    data_dir = os.path.join(os.path.dirname(__file__), "data/")
    user_home = os.path.expanduser('~')

    fastq_in = data_dir + "bsSeeker.Mouse.SRR892982_2.fastq"
    fastq_out = data_dir + "bsSeeker.Mouse.SRR892982_2.filtered.fastq"

    input_files = dict(fastq=fastq_in)
    output_files = dict(fastq_filtered=fastq_out)
    metadata = dict(
        fastq=Metadata("data_wgbs", "fastq", fastq_in, None,
                       {'assembly': 'test'}))

    # Tool locations are resolved relative to the current user's home.
    config_param = dict(
        aligner="bowtie2",
        aligner_path=user_home + "/lib/bowtie2-2.3.4-linux-x86_64",
        bss_path=user_home + "/lib/BSseeker2")

    filter_tool = bs_seeker_filter.filterReadsTool(config_param)
    filter_tool.run(input_files, metadata, output_files)

    filtered_path = output_files["fastq_filtered"]
    assert os.path.isfile(filtered_path)
    assert os.path.getsize(filtered_path) > 0
Example #2
0
    def work(self):
        """
        Filter the FASTQ reads, align them as single-ended reads with the
        BS-Seeker2 aligner tool and sort the resulting BAM file.

        The method takes no arguments; everything it needs is read from
        attributes of the instance:

        fastq_file
            Passed to the read filter as its input
        fastq_filtered
            Passed to the read filter as its output and then handed to the
            aligner as the reads to align
        bss_path
            Passed to both the filter and the aligner (presumably the
            BS-Seeker2 install location - confirm against the caller)
        aligner, aligner_path
            Forwarded unchanged to the aligner
        genome_fa, genome_idx
            Forwarded unchanged to the aligner (presumably the genome FASTA
            and its prepared index - confirm against the caller)
        output_bam
            Passed to the aligner and afterwards to the BAM sort
        """
        # Step 1: filter the raw reads.
        filterReadsTool().bss_seeker_filter(
            self.fastq_file, self.fastq_filtered, self.bss_path)

        # Step 2: single-ended alignment of the filtered reads. The tool is
        # configured with "no-untar" set; the empty list is the fifth
        # positional argument (presumably extra aligner parameters - TODO
        # confirm against bs_seeker_aligner_single).
        single_end_aligner = bssAlignerTool({"no-untar": True})
        single_end_aligner.bs_seeker_aligner_single(
            self.fastq_filtered,
            self.aligner,
            self.aligner_path,
            self.bss_path,
            [],
            self.genome_fa,
            self.genome_idx,
            self.output_bam)

        # Step 3: sort the alignments.
        bamUtils().bam_sort(self.output_bam)
    def run(self, input_files, metadata, output_files):
        """
        Run the WGBS pipeline on single or paired end FASTQ files to identify
        methylated regions within the genome.

        The stages are run in order: BS-Seeker2 index, FASTQ filtering,
        alignment and methylation peak calling. If any stage does not return
        the entries expected of it, the failure is logged and a pair of empty
        dictionaries is returned.

        Parameters
        ----------
        input_files : dict
            Locations of the input files. These should include:

            genome : str
                Genome assembly in FASTA

            fastq1 : str
                Location for the first FASTQ file for single or paired end
                reads

            fastq2 : str
                [OPTIONAL] Location for the second FASTQ file if paired end
                reads

        metadata : dict
            Input file meta data associated with their roles

            genome : str
            fastq1 : str

            fastq2 : str
                [OPTIONAL]

        output_files : dict
            index : str
            fastq1_filtered : str

            fastq2_filtered : str
                [OPTIONAL] read only when fastq2 is in input_files

            bam : str
            bai : str
            wig_file : str
            cgmap_file : str
            atcgmap_file : str

        Returns
        -------
        output_results_files : dict
            index : str
                Index entry returned by the indexer tool

            fastq1_filtered|fastq2_filtered : str
                Locations of the filtered FASTQ files from which alignments
                were made

            bam|bai : str
                Location of the alignment bam file and the associated index

            wig_file : str
                Location of the wig file containing the methylation peak calls

            cgmap_file : str
                Location of the CGmap file generated by BS-Seeker2

            atcgmap_file : str
                Location of the ATCGmap file generated by BS-Seeker2

        output_metadata : dict
            Meta data objects for the same keys. For every entry except
            ``index`` the original ``tool`` value is moved to
            ``tool_description`` and ``tool`` is set to ``process_wgbs``.

        On failure of any stage both dictionaries are empty.
        """

        output_results_files = {}
        output_metadata = {}
        paired = "fastq2" in input_files

        def tag_as_pipeline(*roles):
            """Relabel the listed output metadata as produced by this pipeline."""
            for role in roles:
                entry = output_metadata[role]
                # Keep the name of the tool that did the work as description.
                entry.meta_data["tool_description"] = entry.meta_data["tool"]
                entry.meta_data["tool"] = "process_wgbs"

        logger.info("WGBS - BS-Seeker2 Index")
        # Build the matching WGBS genome index
        builder = bssIndexerTool(self.configuration)
        genome_idx, gidx_meta = builder.run(remap(input_files, "genome"),
                                            remap(metadata, "genome"),
                                            remap(output_files, "index"))

        # Guard the indexer outputs the same way as every later stage, so a
        # failed index build is reported instead of surfacing as a KeyError.
        try:
            output_results_files["index"] = genome_idx["index"]
            output_metadata["index"] = gidx_meta["index"]
        except KeyError:
            logger.fatal("WGBS - BS-Seeker2 Indexer failed")
            return {}, {}

        # Filter the FASTQ reads to remove duplicates
        logger.info("WGBS - Filter")
        frt = filterReadsTool(self.configuration)
        fastq1f, filter1_meta = frt.run(
            {"fastq": input_files["fastq1"]}, {"fastq": metadata["fastq1"]},
            {"fastq_filtered": output_files["fastq1_filtered"]})

        try:
            output_results_files["fastq1_filtered"] = fastq1f["fastq_filtered"]
            output_metadata["fastq1_filtered"] = filter1_meta["fastq_filtered"]
            tag_as_pipeline("fastq1_filtered")
        except KeyError:
            logger.fatal("WGBS - FILTER: Error while filtering")
            return {}, {}

        if paired:
            logger.info("WGBS - Filter background")
            fastq2f, filter2_meta = frt.run(
                {"fastq": input_files["fastq2"]},
                {"fastq": metadata["fastq2"]},
                {"fastq_filtered": output_files["fastq2_filtered"]})

            try:
                output_results_files["fastq2_filtered"] = fastq2f[
                    "fastq_filtered"]
                output_metadata["fastq2_filtered"] = filter2_meta[
                    "fastq_filtered"]
                tag_as_pipeline("fastq2_filtered")
            except KeyError:
                logger.fatal(
                    "WGBS - FILTER (background): Error while filtering")
                return {}, {}

        logger.info("WGBS - BS-Seeker2 Aligner")
        # Align the filtered reads against the index built above.
        bss_aligner = bssAlignerTool(self.configuration)
        aligner_input_files = {
            "genome": input_files["genome"],
            "fastq1": fastq1f["fastq_filtered"],
            "index": genome_idx["index"],
        }
        aligner_meta = {
            "genome": metadata["genome"],
            "fastq1": filter1_meta["fastq_filtered"],
            "index": output_metadata["index"],
        }
        if paired:
            aligner_input_files["fastq2"] = fastq2f["fastq_filtered"]
            aligner_meta["fastq2"] = filter2_meta["fastq_filtered"]

        bam, bam_meta = bss_aligner.run(aligner_input_files, aligner_meta,
                                        remap(output_files, "bam", "bai"))

        try:
            output_results_files["bam"] = bam["bam"]
            output_results_files["bai"] = bam["bai"]
            output_metadata["bam"] = bam_meta["bam"]
            output_metadata["bai"] = bam_meta["bai"]
            tag_as_pipeline("bam", "bai")
        except KeyError:
            logger.fatal("WGBS - Aligner failed")
            return {}, {}

        # Methylation peak caller
        peak_caller_handle = bssMethylationCallerTool(self.configuration)
        mct_input_files = {
            "genome": input_files["genome"],
            "index": genome_idx["index"],
            "fastq1": fastq1f["fastq_filtered"],
            "bam": bam["bam"],
            "bai": bam["bai"],
        }
        mct_meta = {
            "genome": metadata["genome"],
            "index": gidx_meta["index"],
            "fastq1": filter1_meta["fastq_filtered"],
            "bam": output_metadata["bam"],
            "bai": bam_meta["bai"],
        }
        if paired:
            mct_input_files["fastq2"] = fastq2f["fastq_filtered"]
            mct_meta["fastq2"] = filter2_meta["fastq_filtered"]

        peak_files, peak_meta = peak_caller_handle.run(
            mct_input_files, mct_meta,
            remap(output_files, "wig_file", "cgmap_file", "atcgmap_file"))

        try:
            output_results_files["wig_file"] = peak_files["wig_file"]
            output_results_files["cgmap_file"] = peak_files["cgmap_file"]
            output_results_files["atcgmap_file"] = peak_files["atcgmap_file"]
            output_metadata["wig_file"] = peak_meta["wig_file"]
            output_metadata["cgmap_file"] = peak_meta["cgmap_file"]
            output_metadata["atcgmap_file"] = peak_meta["atcgmap_file"]
            tag_as_pipeline("wig_file", "cgmap_file", "atcgmap_file")
        except KeyError:
            logger.fatal("WGBS - Peak caller failed")
            return {}, {}

        return (output_results_files, output_metadata)
Example #4
0
    def run(self, input_files, metadata, output_files):
        """
        Filter duplicate entries out of one or two FASTQ files.

        The first FASTQ file is always filtered; the second one is filtered
        only when ``fastq2`` is present in ``input_files``. If the filter
        tool does not return the expected entries for a file, the failure is
        logged and a pair of empty dictionaries is returned.

        Parameters
        ----------
        input_files : dict
            Locations of the input files:

            fastq1 : str
                Location for the first FASTQ file for single or paired end
                reads

            fastq2 : str
                [OPTIONAL] Location for the second FASTQ file if paired end
                reads

        metadata : dict
            Input file meta data associated with their roles

            fastq1 : str

            fastq2 : str
                [OPTIONAL]

        output_files : dict

            fastq1_filtered : str

            fastq2_filtered : str
                [OPTIONAL] read only when fastq2 is in input_files

        Returns
        -------
        output_results_files : dict
            fastq1_filtered : str
                Location of the filtered first FASTQ file

            fastq2_filtered : str
                [OPTIONAL] Location of the filtered second FASTQ file

        output_metadata : dict
            Meta data objects for the same keys, with the original ``tool``
            value moved to ``tool_description`` and ``tool`` set to
            ``process_wgbs``.
        """
        results = {}
        results_meta = {}

        logger.info("BS-Filter")
        read_filter = filterReadsTool(self.configuration)

        def filter_one(source_role, filtered_role, failure_message):
            """Filter one FASTQ file; return False if the tool output is incomplete."""
            files, meta = read_filter.run(
                {"fastq": input_files[source_role]},
                {"fastq": metadata[source_role]},
                {"fastq_filtered": output_files[filtered_role]})

            try:
                results[filtered_role] = files["fastq_filtered"]
                results_meta[filtered_role] = meta["fastq_filtered"]

                # Record the tool that did the work as the description and
                # label the entry with the pipeline name.
                entry = results_meta[filtered_role]
                entry.meta_data["tool_description"] = entry.meta_data["tool"]
                entry.meta_data["tool"] = "process_wgbs"
            except KeyError:
                logger.fatal(failure_message)
                return False
            return True

        if not filter_one("fastq1", "fastq1_filtered",
                          "WGBS - FILTER: Error while filtering"):
            return {}, {}

        if "fastq2" in input_files:
            logger.info("WGBS - Filter background")
            if not filter_one(
                    "fastq2", "fastq2_filtered",
                    "WGBS - FILTER (background): Error while filtering"):
                return {}, {}

        return (results, results_meta)
Example #5
0
    def _filter_reads_mp(self, fastq_in, fastq_out, filter_path):
        """
        Filter a FASTQ file and then sort the filtered file.

        Parameters
        ----------
        fastq_in
            First argument handed to the read filter (presumably the
            location of the FASTQ file to filter - confirm against
            bss_seeker_filter)
        fastq_out
            Second argument handed to the read filter; the same value is
            then passed to the FASTQ sort
        filter_path
            Third argument handed to the read filter (presumably the
            location of the filter program - confirm against
            bss_seeker_filter)
        """
        # Filter first: the sort operates on the filter's output file.
        filterReadsTool().bss_seeker_filter(fastq_in, fastq_out, filter_path)
        fastqUtils().fastq_sort_file(fastq_out)