Beispiel #1
0
    def execute(self):
        """Generate the Hi-C contact map of the requested object.

        Creates the temporary and output directories if they are missing,
        delegates the actual map generation to ``mtc.generate_contact_map``,
        then optionally removes the temporary data.
        """
        # Define the temporary directory (default: ./tmp).
        if not self.args["--tmpdir"]:
            self.args["--tmpdir"] = "./tmp"
        tmp_dir = mio.generate_temp_dir(self.args["--tmpdir"])

        # Define the output directory (default: ./contact_map_<name>).
        if not self.args["--outdir"]:
            self.args["--outdir"] = join(".",
                                         "contact_map_" + self.args["--name"])
        os.makedirs(self.args["--outdir"], exist_ok=True)

        mtc.generate_contact_map(
            self.args["--assembly"],
            self.args["--contig-data"],
            self.args["--enzyme"],
            self.args["--name"],
            self.args["--pairs"],
            self.args["--outdir"],
            tmp_dir,
            self.args["--filter"],
            self.args["--force"],
            self.args["--mat-fmt"],
            self.args["--object"],
            int(self.args["--min-size"]),
            self.args["--pcr-dup"],
            int(self.args["--threads"]),
        )

        # Delete the temporary folder unless the user asked to keep it.
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)
            # Delete the pyfastx index. Guarded so that a best-effort
            # cleanup cannot crash an otherwise successful run if the
            # index file was never created.
            fxi_file = self.args["--assembly"] + ".fxi"
            if os.path.exists(fxi_file):
                os.remove(fxi_file)
Beispiel #2
0
    def execute(self):
        """Run the validation (recursive decontamination) step.

        Prepares the temporary, output, recursive-bin and final-bin
        directories, checks that CheckM and a valid partition algorithm are
        available, runs ``mtv.recursive_decontamination``, then cleans up.

        Raises:
            NameError: if CheckM is not available in the PATH.
            ValueError: if a bin directory already exists without --force, or
                if the algorithm argument is invalid.
        """
        # Define the temporary directory.
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)

        def _prepare_bin_dir(path):
            # Create an empty bin directory. With --force an existing one is
            # wiped first; otherwise abort to avoid silently overwriting data.
            if not exists(path):
                os.makedirs(path)
            elif self.args["--force"]:
                shutil.rmtree(path)
                os.makedirs(path)
            else:
                logger.error(
                    "%s already existed. Remove directory or use -F argument to overwrite it.",
                    path,
                )
                raise ValueError

        recursive_fasta_dir = join(self.args["--outdir"], "recursive_bin")
        final_fasta_dir = join(self.args["--outdir"], "final_bin")
        _prepare_bin_dir(recursive_fasta_dir)
        _prepare_bin_dir(final_fasta_dir)

        # Enable file logging.
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"],
                        ("metator_validation_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Transform numeric arguments into numbers.
        iterations = int(self.args["--iterations"])
        size = int(self.args["--size"])
        threads = int(self.args["--threads"])
        overlapping_parameter = int(self.args["--overlap"]) / 100
        resolution_parameter = float(self.args["--res-param"])

        # Check CheckM availability.
        if not mio.check_checkm():
            logger.error(
                "CheckM is not in the path. Could not make the iterations")
            raise NameError

        # Check correct algorithm value.
        if self.args["--algorithm"] not in ["louvain", "leiden"]:
            logger.error('algorithm should be either "louvain" or "leiden"')
            raise ValueError

        _clustering_matrix_file = mtv.recursive_decontamination(
            self.args["--algorithm"],
            self.args["--assembly"],
            self.args["--cluster-matrix"],
            self.args["--contigs"],
            final_fasta_dir,
            self.args["--fasta"],
            iterations,
            self.args["--network"],
            self.args["--outdir"],
            overlapping_parameter,
            recursive_fasta_dir,
            resolution_parameter,
            size,
            tmp_dir,
            threads,
        )

        # Delete the pyfastx index. Guarded so that cleanup cannot crash an
        # otherwise successful run if the index file was never created.
        fxi_file = self.args["--assembly"] + ".fxi"
        if exists(fxi_file):
            os.remove(fxi_file)
        # Delete the temporary folder.
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)
Beispiel #3
0
    def execute(self):
        """Run the whole metaTOR binning pipeline.

        Depending on the ``--start`` argument, the pipeline begins at the
        alignment (fastq), bam, pair or network stage, then partitions the
        network into bins and optionally validates/decontaminates them
        recursively with CheckM.

        Raises:
            NameError: if CheckM is required but not available in the PATH.
            ValueError: on invalid arguments or pre-existing output
                directories without --force.
        """
        # Define the temporary directory.
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)
        overlapping_fasta_dir = join(self.args["--outdir"], "overlapping_bin")
        if not exists(overlapping_fasta_dir):
            os.makedirs(overlapping_fasta_dir)
        else:
            if self.args["--force"]:
                shutil.rmtree(overlapping_fasta_dir)
                os.makedirs(overlapping_fasta_dir)
            else:
                logger.error(
                    "%s already existed. Remove directory or use -F argument to overwrite it.",
                    overlapping_fasta_dir,
                )
                raise ValueError

        # Enable file logging
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"], ("metator_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Transform numeric arguments into numbers.
        min_qual = int(self.args["--min-quality"])
        iterations = int(self.args["--iterations"])
        recursive_iterations = int(self.args["--rec-iter"])
        overlapping_parameter = int(self.args["--overlap"]) / 100
        recursive_overlapping_parameter = int(self.args["--rec-overlap"]) / 100
        size = int(self.args["--size"])
        threads = int(self.args["--threads"])
        resolution_parameter = float(self.args["--res-param"])

        # Check correct algorithm value
        if self.args["--algorithm"] not in ["louvain", "leiden"]:
            logger.error('algorithm should be either "louvain" or "leiden"')
            raise ValueError

        # Check if normalization is in the list of possible normalizations.
        list_normalization = [
            "None",
            "abundance",
            "length",
            "RS",
            "empirical_hit",
            "theoritical_hit",
        ]
        if self.args["--normalization"] not in list_normalization:
            logger.error(
                'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
            )
            raise ValueError
        enzyme_required = ["RS", "theoritical_hit"]
        if (self.args["--normalization"] in enzyme_required
                and not self.args["--enzyme"]):
            logger.error(
                'For "RS" and "theoritical_hit" normalization, enzyme is required.'
            )
            raise ValueError
        depth_required = ["abundance", "theoritical_hit"]
        if (self.args["--normalization"] in depth_required
                and not self.args["--depth"]):
            logger.error(
                'For "abundance" and "theoritical_hit" normalization, depth is required.'
            )
            raise ValueError

        # Sanity check for validation
        if not self.args["--skip-validation"]:
            recursive_fasta_dir = join(self.args["--outdir"], "recursive_bin")
            if not exists(recursive_fasta_dir):
                os.makedirs(recursive_fasta_dir)
            else:
                if self.args["--force"]:
                    shutil.rmtree(recursive_fasta_dir)
                    os.makedirs(recursive_fasta_dir)
                else:
                    logger.error(
                        "%s already existed. Remove directory or use -F argument to overwrite it",
                        recursive_fasta_dir,
                    )
                    raise ValueError
            final_fasta_dir = join(self.args["--outdir"], "final_bin")
            if not exists(final_fasta_dir):
                os.makedirs(final_fasta_dir)
            else:
                if self.args["--force"]:
                    shutil.rmtree(final_fasta_dir)
                    os.makedirs(final_fasta_dir)
                else:
                    logger.error(
                        "%s already existed. Remove directory or use -F argument to overwrite it.",
                        final_fasta_dir,
                    )
                    raise ValueError

            # Check CheckM availability (only needed for validation).
            if not mio.check_checkm():
                logger.error(
                    "CheckM is not in the path. Could not make the iterations")
                raise NameError

        # Manage start point.
        if self.args["--start"] == "fastq":
            start = 1
        elif self.args["--start"] == "bam":
            start = 2
        elif self.args["--start"] == "pair":
            start = 3
        elif self.args["--start"] == "network":
            start = 4
        else:
            logger.error(
                "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
            )
            raise ValueError

        # Check if forward and reverse reads are given for fastq and bam start.
        if (self.args["--start"] == "fastq" or
            (self.args["--start"] == "bam" and self.args["--aligner"]
             == "bowtie2")) and not self.args["--reverse"]:
            logger.error(
                "Forward and reverse arguments are necessary for fastq with %s start and %s aligner.",
                self.args["--start"],
                self.args["--aligner"],
            )
            raise ValueError

        # Print information of the workflow:
        if start == 1:
            logger.info("Minimum mapping quality: %d", min_qual)
        if start <= 2:
            logger.info("Enzyme: %s", self.args["--enzyme"])
            logger.info("Normalization: %s", self.args["--normalization"])
        logger.info("Aligner algorithm: %s", self.args["--aligner"])
        logger.info("Partition algorithm: %s", self.args["--algorithm"])
        logger.info("Partition iterations: %s", iterations)
        logger.info("Overlapping parameter: %s", overlapping_parameter)
        if not self.args["--skip-validation"]:
            logger.info("Recursive partition iterations: %d",
                        recursive_iterations)
            logger.info(
                "Recursive overlapping parameter: %s",
                recursive_overlapping_parameter,
            )

        # Extract index and genome file
        assembly = self.args["--assembly"]
        # Check what is the reference. If a fasta is given build the index. If a
        # bowtie2 index is given, retrieve the fasta.
        index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
        if index is None:
            if mio.check_is_fasta(assembly):
                fasta = assembly
                # The index is only needed when starting from fastq.
                if start == 1:
                    index = mio.generate_fasta_index(fasta,
                                                     self.args["--aligner"],
                                                     tmp_dir)
            else:
                logger.error(
                    "Please give as assembly argument a bowtie2 index or a fasta."
                )
                raise ValueError
        else:
            fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

        # Run the whole workflow
        if start <= 3:
            if start <= 2:
                # Align pair-end reads with bowtie2
                alignment_files, contig_data, hit_data = mta.get_contact_pairs(
                    self.args["--forward"],
                    self.args["--reverse"],
                    index,
                    fasta,
                    self.args["--aligner"],
                    min_qual,
                    self.args["--start"],
                    self.args["--depth"],
                    self.args["--enzyme"],
                    self.args["--outdir"],
                    tmp_dir,
                    self.args["--threads"],
                )
            else:
                # Pair start: alignment files are given directly.
                alignment_files = self.args["--forward"].split(",")
                nb_alignment = len(alignment_files)
                contig_data, hit_data = mtn.create_contig_data(
                    fasta,
                    nb_alignment,
                    self.args["--depth"],
                    self.args["--enzyme"],
                )
            # Build the network
            network_file, contigs_data_file = mtn.alignment_to_contacts(
                alignment_files,
                contig_data,
                hit_data,
                self.args["--outdir"],
                "network.txt",
                "contig_data_network.txt",
                tmp_dir,
                self.args["--threads"],
                self.args["--normalization"],
                False,
            )
        else:
            # Network start: both files are given directly.
            contigs_data_file = self.args["--contigs"]
            network_file = self.args["--network"]

        # Partition the network
        clustering_matrix_partition_file, contigs_data_file = mtp.partition(
            self.args["--algorithm"],
            fasta,
            self.args["--cluster-matrix"],
            contigs_data_file,
            iterations,
            network_file,
            self.args["--outdir"],
            overlapping_fasta_dir,
            overlapping_parameter,
            resolution_parameter,
            size,
            tmp_dir,
            threads,
        )

        # remove contig_data_network if not an input
        if start <= 2:
            contig_data_network_file = join(self.args["--outdir"],
                                            "contig_data_network.txt")
            os.remove(contig_data_network_file)

        # Launch validation if desired.
        if not self.args["--skip-validation"]:
            clustering_matrix_recursive_file = mtv.recursive_decontamination(
                self.args["--algorithm"],
                fasta,
                self.args["--cluster-matrix"],
                contigs_data_file,
                final_fasta_dir,
                overlapping_fasta_dir,
                recursive_iterations,
                network_file,
                self.args["--outdir"],
                recursive_overlapping_parameter,
                recursive_fasta_dir,
                resolution_parameter,
                size,
                tmp_dir,
                threads,
            )

            if self.args["--cluster-matrix"]:
                # Make the sum with the partition clustering matrix and save it.
                clustering_matrix = load_npz(clustering_matrix_partition_file +
                                             ".npz")
                clustering_matrix_recursive = load_npz(
                    clustering_matrix_recursive_file + ".npz")
                clustering_matrix = (
                    (clustering_matrix + clustering_matrix_recursive) /
                    2).tocoo()
                clustering_matrix_file = join(self.args["--outdir"],
                                              "clustering_matrix")
                save_npz(clustering_matrix_file, clustering_matrix)

            # Remove contig_data_partition file
            contig_data_partition_file = join(self.args["--outdir"],
                                              "contig_data_partition.txt")
            os.remove(contig_data_partition_file)

        # Delete the pyfastx index. Guarded so that cleanup cannot crash an
        # otherwise successful run if the index file was never created.
        fxi_file = fasta + ".fxi"
        if exists(fxi_file):
            os.remove(fxi_file)
        # Delete the temporary folder.
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)
Beispiel #4
0
    def execute(self):
        """Build the metaHiC contact network from reads or alignment files.

        Depending on ``--start`` (fastq/bam/pair/network), reads are aligned
        and/or alignment files are turned into contig data, then the contact
        network is written by ``mtn.alignment_to_contacts``.
        """

        # Define the temporary directory (created on demand).
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)

        # Enable file logging (timestamped log file inside the output dir).
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"],
                        ("metator_network_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Transform integer variables as integer.
        min_qual = int(self.args["--min-quality"])

        # Defined boolean variables:
        self_contacts = self.args["--self-contacts"]

        # Forward/reverse reads are mandatory for a fastq start, and for a
        # bam start when the aligner is bowtie2.
        if (self.args["--start"] == "fastq" or
            (self.args["--start"] == "bam" and self.args["--aligner"]
             == "bowtie2")) and not self.args["--reverse"]:
            logger.error(
                "Forward and reverse arguments are necessary for fastq with %s start and %s aligner.",
                self.args["--start"],
                self.args["--aligner"],
            )
            raise ValueError

        # Check if normalization is in the list of possible normalizations.
        list_normalization = [
            "None",
            "abundance",
            "length",
            "RS",
            "empirical_hit",
            "theoritical_hit",
        ]
        if self.args["--normalization"] not in list_normalization:
            logger.error(
                'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
            )
            raise ValueError
        # Some normalizations need extra inputs: enzyme and/or depth file.
        enzyme_required = ["RS", "theoritical_hit"]
        if (self.args["--normalization"] in enzyme_required
                and not self.args["--enzyme"]):
            logger.error(
                'For "RS" and "theoritical_hit" normalization, enzyme is required.'
            )
            raise ValueError
        depth_required = ["abundance", "theoritical_hit"]
        if (self.args["--normalization"] in depth_required
                and not self.args["--depth"]):
            logger.error(
                'For "abundance" and "theoritical_hit" normalization, depth is required.'
            )
            raise ValueError
        if self.args["--start"] not in ["fastq", "bam", "pair", "network"]:
            logger.error(
                "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
            )
            raise ValueError
        # Extract index and genome file
        assembly = self.args["--assembly"]
        # Check what is the reference. If a fasta is given build the index. If a
        # bowtie2 index is given, retrieve the fasta.
        index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
        if index is None:
            if mio.check_is_fasta(assembly):
                fasta = assembly
                # If start at bam could skip the index generation.
                if self.args["--start"] == "fastq":
                    index = mio.generate_fasta_index(fasta,
                                                     self.args["--aligner"],
                                                     tmp_dir)
            else:
                logger.error(
                    "Please give as assembly argument a %s index or a fasta.",
                    self.args["--aligner"],
                )
                raise ValueError
        else:
            fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

        # Print information of the workflow:
        logger.info("Aligner algorithm: %s", self.args["--aligner"])
        logger.info("Enzyme: %s", self.args["--enzyme"])
        logger.info("Normalization: %s", self.args["--normalization"])
        logger.info("Minimum mapping quality: %s", self.args["--min-quality"])

        # Do not align if pair start: alignment files are given directly
        # as a comma-separated list in --forward.
        if self.args["--start"] == "pair":
            alignment_files = self.args["--forward"].split(",")
            nb_alignment = len(alignment_files)
            contig_data, hit_data = mtn.create_contig_data(
                fasta,
                nb_alignment,
                self.args["--depth"],
                self.args["--enzyme"],
            )

        else:
            # Align pair-end reads with bowtie2
            alignment_files, contig_data, hit_data = mta.get_contact_pairs(
                self.args["--forward"],
                self.args["--reverse"],
                index,
                fasta,
                self.args["--aligner"],
                min_qual,
                self.args["--start"],
                self.args["--depth"],
                self.args["--enzyme"],
                self.args["--outdir"],
                tmp_dir,
                self.args["--threads"],
            )

        # Build the network
        mtn.alignment_to_contacts(
            alignment_files,
            contig_data,
            hit_data,
            self.args["--outdir"],
            "network.txt",
            "contig_data_network.txt",
            tmp_dir,
            self.args["--threads"],
            self.args["--normalization"],
            self_contacts,
        )

        # Delete the temporary folder
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)