Example #1
def retrieve_fasta(in_file, aligner, tmpdir):
    """
    Function to retrieve the fasta from the given reference file. If an index
    is given, retrieve the fasta using bowtie2-inspect. Raise an error if the
    reference is neither a fasta nor a bowtie2 index.

    Parameters:
    -----------
    in_file : str
        Path to the reference file given.
    aligner : str
        Name of the aligner used. Either 'bowtie2' or 'bwa'.
    tmpdir : str
        Path to the temp directory to write the fasta if necessary.

    Returns:
    --------
    str:
        Path to the fasta file.
    """
    if check_is_fasta(in_file):
        fasta = in_file
    else:
        if check_fasta_index(in_file, aligner):
            if aligner == "bowtie2":
                logger.info("Retrieve fasta from bowtie2 index.")
                fasta = join(tmpdir, "assembly.fa")
                cmd = "bowtie2-inspect {0} > {1}".format(in_file, fasta)
                process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
                _out, _err = process.communicate()
            elif aligner == "bwa":
                if isfile(in_file + ".fa"):
                    if check_is_fasta(in_file + ".fa"):
                        fasta = in_file + ".fa"
                elif isfile(in_file + ".fasta"):
                    if check_is_fasta(in_file + ".fasta"):
                        fasta = in_file + ".fasta"
                else:
                    logger.error(
                        "If you give bwa index, please make sure the fasta exists with the same prefix."
                    )
                    raise ValueError
        else:
            logger.error(
                "Please give as a reference a bowtie2 index or a fasta.")
            raise ValueError
    return fasta
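
A minimal usage sketch, assuming "idx/assembly" is a hypothetical bowtie2
index prefix that exists on disk and that the helpers above are importable:

import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    # Returns the input path if it is already a fasta, otherwise the fasta
    # extracted from the index into tmpdir.
    fasta = retrieve_fasta("idx/assembly", "bowtie2", tmpdir)
    print(fasta)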
Example #2
def check_checkm():
    """
    Function to test if CheckM is in the path.

    Returns:
    --------
    bool:
        True if CheckM is found in the path, False otherwise.
    """
    try:
        sp.check_output("checkm", stderr=sp.STDOUT, shell=True)
    except sp.CalledProcessError:
        logger.error(
            "Cannot find 'checkm' in your path. Please install it or add it "
            "to your path."
        )
        return False
    return True
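
A lighter-weight variant of the same probe, sketched with shutil.which,
which inspects the PATH without spawning a process. This is an alternative
sketch, not part of the project's API:

import shutil

def check_tool(name):
    """Return True if an executable named `name` is found on the PATH."""
    return shutil.which(name) is not None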
Example #3
def check_louvain_cpp(louvain_path):
    """Function to check is the Louvain functions are callable.

    Parameters:
    -----------
    louvain_path : str
        Path of the directory where the Louvain functions are.

    Returns:
    --------
    bool:
        True if the Louvain functions are callable, False otherwise.
    """

    # Look for the path to call the functions:
    louvain = join(louvain_path, "louvain")
    convert = join(louvain_path, "convert")
    hierarchy = join(louvain_path, "hierarchy")

    # Check convert:
    try:
        sp.check_output("{0} --help".format(convert),
                        stderr=sp.STDOUT,
                        shell=True)
    except sp.CalledProcessError:
        logger.error("Cannot find the 'convert' function from Louvain path.")
        return False

    # Check louvain:
    try:
        sp.check_output("{0} --help".format(louvain),
                        stderr=sp.STDOUT,
                        shell=True)
    except sp.CalledProcessError:
        logger.error("Cannot find the 'louvain' function from Louvain path.")
        return False

    # Check hierarchy:
    try:
        sp.check_output("{0} --help".format(hierarchy),
                        stderr=sp.STDOUT,
                        shell=True)
    except sp.CalledProcessError:
        logger.error("Cannot find the 'hierarchy' function from Louvain path.")
        return False

    return True
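
The three probes above share one shape; a hedged refactoring sketch that
loops over the binaries instead (check_louvain_tools is a hypothetical
name, and the module-level logger is assumed as above):

import subprocess as sp
from os.path import join

def check_louvain_tools(louvain_path,
                        tools=("convert", "louvain", "hierarchy")):
    """Return True if every Louvain binary answers '--help'."""
    for tool in tools:
        try:
            sp.check_output("{0} --help".format(join(louvain_path, tool)),
                            stderr=sp.STDOUT,
                            shell=True)
        except sp.CalledProcessError:
            logger.error("Cannot find the '%s' function from Louvain path.",
                         tool)
            return False
    return True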
Example #4
def recursive_clustering(
    assembly,
    iterations,
    overlapping_parameter,
    resolution_parameter,
    outdir,
    recursive_fasta_dir,
    algorithm,
    tmpdir,
    checkm_file,
    taxonomy_file,
    contigs_data_file,
    network_file,
    cluster_matrix,
    size,
    threads,
):
    """Function to run recursive iterations on contaminated bins in order to try
    to improve the quality of the bins using Louvain or Leiden algorthm.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file used as assembly.
    iterations : int
        Number of iterations to use for recursive iterations of Louvain or
        Leiden.
    overlapping_parameter : float
        Hamming distance threshold to consider two bins as the same bin.
    resolution_parameter : float
        Resolution parameter of Leiden algorithm.
    outdir : str
        Path to the output directory.
    recursive_fasta_dir : str
        Path to the directory where to write the decontaminated fasta.
    algorithm : str
        Algorithm to use, either louvain or leiden.
    tmpdir : str
        Path to the temp directory.
    checkm_file : str
        Path to the output file of CheckM from checkm function.
    taxonomy_file : str
        Path to the taxonomy CheckM file.
    contigs_data_file : str
        Path to the contigs data file from metator partition.
    network_file : str
        Path to the network file from metator network.
    cluster_matrix : bool
        If True, build the clustering matrix and save it.
    size : int
        Size threshold in base pairs of the bins.
    threads : int
        Number of threads to use.

    Returns:
    --------
    boolean:
        True if at least one new recursive bin has been generated.
    pandas.DataFrame:
        Updated contigs data table with the recursive bin information added.
    str:
        Path to the saved clustering matrix file, or None if the matrix was
        not built.
    """

    # Create temporary folders
    tmpdir_subnetwork = join(tmpdir, "recursive_bins")
    os.makedirs(tmpdir_subnetwork, exist_ok=True)
    tmpdir_clustering = join(tmpdir, "recursive_clustering")
    os.makedirs(tmpdir_clustering, exist_ok=True)
    tmpdir_binning = join(tmpdir, "recursive_bins")
    os.makedirs(tmpdir_binning, exist_ok=True)

    # Load CheckM result:
    checkm_summary = mio.read_results_checkm(checkm_file, taxonomy_file)

    # Load network:
    network = nx.read_edgelist(network_file,
                               nodetype=int,
                               data=(("weight", float), ))

    # Load contigs data:
    contigs_data = pd.read_csv(contigs_data_file,
                               sep="\t",
                               header=0,
                               index_col=False)

    # Add new columns for recursive information.
    contigs_data["Recursive_bin_ID"] = "0"
    contigs_data["Recursive_bin_contigs"] = "-"
    contigs_data["Recursive_bin_size"] = "-"
    contigs_data["Final_bin"] = "ND"

    # Default no contamination
    contamination = False

    # Create an empty matrix
    N = len(contigs_data.ID)
    clustering_matrix = sparse.coo_matrix((N + 1, N + 1), dtype=np.float32)

    # Iterate on the checkm summary to find contaminated bins:
    for bin_id in checkm_summary:
        if (float(checkm_summary[bin_id]["completness"]) >= 50) and (float(
                checkm_summary[bin_id]["contamination"]) >= 5):

            logger.info("Bin in progress: {0}".format(bin_id))
            subnetwork_file = join(tmpdir_subnetwork,
                                   "subnetwork_" + bin_id + ".txt")
            bin_id = str(bin_id.split("_")[1])

            # Extract contigs
            mask = contigs_data["Overlapping_bin_ID"].apply(str) == bin_id
            list_contigs = list(contigs_data.loc[mask, "ID"])

            # Extract subnetwork
            subnetwork = network.subgraph(list_contigs)

            # Write the new subnetwork
            nx.write_edgelist(subnetwork,
                              subnetwork_file,
                              delimiter="\t",
                              data=["weight"])

            # Stop reporting info logs.
            logger.setLevel(logging.WARNING)

            # Run the Louvain or Leiden algorithm on the subnetwork.
            if algorithm == "leiden":
                LEIDEN_PATH = os.environ["LEIDEN_PATH"]
                output_partition = mtp.leiden_iterations_java(
                    subnetwork_file,
                    iterations,
                    resolution_parameter,
                    tmpdir_clustering,
                    LEIDEN_PATH,
                )
            elif algorithm == "louvain":
                LOUVAIN_PATH = os.environ["LOUVAIN_PATH"]
                output_partition = mtp.louvain_iterations_cpp(
                    subnetwork_file,
                    iterations,
                    tmpdir_clustering,
                    LOUVAIN_PATH,
                )
            else:
                logger.error(
                    'algorithm should be either "louvain" or "leiden"')
                raise ValueError

            # Detect core bins
            (
                recursive_core_bins,
                recursive_bins_iterations,
            ) = mtp.detect_core_bins(output_partition, iterations)

            # Compute the Hamming distance between core bins.
            hamming_distance = mtp.get_hamming_distance(
                recursive_bins_iterations,
                iterations,
                threads,
            )

            # Define the overlapping bins according to the threshold.
            recursive_bins = mtp.defined_overlapping_bins(
                overlapping_parameter,
                hamming_distance,
                recursive_core_bins,
                recursive_bins_iterations,
            )

            # Update bin data and generate fasta.
            contamination, contigs_data = update_contigs_data_recursive(
                contigs_data,
                recursive_bins,
                assembly,
                recursive_fasta_dir,
                tmpdir_binning,
                size,
                contamination,
            )

            # Build the clustering matrix of the subnetwork and add it.
            if cluster_matrix:
                clustering_matrix += mtp.build_clustering_matrix(
                    recursive_core_bins, hamming_distance, N)

            # Restore the info log level.
            logger.setLevel(logging.INFO)

    # Save the clustering matrix
    if cluster_matrix:
        clustering_matrix_file = join(outdir, "clustering_matrix_recursive")
        sparse.save_npz(clustering_matrix_file, clustering_matrix)
    else:
        clustering_matrix_file = None

    return contamination, contigs_data, clustering_matrix_file
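
A toy illustration of the Hamming-distance criterion driving the overlap
step above: two contigs whose per-iteration labels rarely disagree end up
in the same overlapping bin. Purely illustrative, not the mtp
implementation:

def hamming_fraction(labels_a, labels_b):
    """Fraction of iterations on which two label vectors disagree."""
    assert len(labels_a) == len(labels_b)
    mismatches = sum(a != b for a, b in zip(labels_a, labels_b))
    return mismatches / len(labels_a)

# "1;1;2" means a contig fell in bin 1, 1, then 2 over three iterations.
print(hamming_fraction("1;1;2".split(";"), "1;1;1".split(";")))  # 0.333...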
Example #5
    def set_metator_object(self, metator_object, name):
        """Method to get the metator object and name of the object usable for
        the algorithm.

        Parameters:
        -----------
        metator_object : str
            Object to extract contigs to build the matrix. Either "contig",
            "core_bin", "overlapping_bin", "recursive_bin", "final_bin" or
            "other".
        name : str
            Name of the object. Could be the name of a contig, an id of a bin or
            the name of the bin. Example: "NODE_1" or "MetaTOR_1_0".
        """
        if metator_object == "contig":
            self.object = "Name"
            self.name = name
        elif metator_object == "core_bin":
            self.object = "Core_bin_ID"
            try:
                int(name)
                self.name = str(name)
            except ValueError as object_no_exist:
                logger.error(
                    "With core bin object, the name should be the numeric "
                    "ID of the core bin.")
                raise ValueError from object_no_exist
        elif metator_object == "overlapping_bin":
            self.object = "Overlapping_bin_ID"
            try:
                int(name)
                self.name = str(name)
            except ValueError:
                self.name = str(name.split("_")[1])
                try:
                    int(self.name)
                except ValueError as object_no_exist:
                    logger.error(
                        "Overlapping bin name should be either a numerical "
                        "ID or a name like 'MetaTOR_1' or 'MetaTOR_1_0'.")
                    raise ValueError from object_no_exist
        elif metator_object == "recursive_bin":
            self.object = "Recursive_bin_ID"
            try:
                int(name)
                self.name = str(name)
            except ValueError:
                self.name = str(name.split("_")[2])
                try:
                    int(self.name)
                except ValueError as object_no_exist:
                    logger.error(
                        "Recursive bin name should be either a numerical ID "
                        "or a name like 'MetaTOR_1_1'.")
                    raise ValueError from object_no_exist
                if int(self.name) <= 0:
                    logger.error(
                        "A recursive bin should have an ID bigger than 0.")
                    raise ValueError
        elif metator_object == "final_bin":
            self.object = "Final_bin"
            self.name = name
        elif metator_object == "other":
            self.object = "Other"
            self.name = name
        else:
            logger.error(
                'Metator object should be one of these values: "contig", '
                '"core_bin", "overlapping_bin", "recursive_bin", '
                '"final_bin", "other".')
            raise ValueError
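
A hedged usage sketch; MetatorObject stands in for whatever class hosts
this method in the real codebase:

obj = MetatorObject()  # hypothetical host class
obj.set_metator_object("overlapping_bin", "MetaTOR_12")
print(obj.object, obj.name)  # Overlapping_bin_ID 12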
Example #6
def louvain_iterations_cpp(network_file, iterations, tmp_dir, louvain_path):
    """Use the cpp original Louvain to partition the network.

    Parameters:
    -----------
    network_file : str
        Path to the network computed previously. The file is a tab-separated
        table with three columns: the id of the first contig, the id of the
        second contig, and the weight of the edge (normalized or not).
    iterations : int
        Number of iterations of the Louvain algorithm.
    tmp_dir : str
        Path to the temporary directory.
    louvain_path : str
        Path to the directory with louvain functions.

    Returns:
    --------
    dict:
        Dictionary with the contig id as key and the results of each
        iteration, separated by semicolons, as value.
    """

    # Check if louvain cpp is available on the computer. Raise an error if
    # it is not found.
    if not mio.check_louvain_cpp(louvain_path):
        logger.error("Louvain implementation was not found.")
        logger.error(
            "You should have a LOUVAIN_PATH variable in your environment."
        )
        raise NameError

    # Define temporary files and arguments for the Louvain calls, and the
    # paths of the binaries to call.
    network_bin = join(tmp_dir, "net_bin")
    network_weight = join(tmp_dir, "net_weight")
    network_tree = join(tmp_dir, "net_tree")
    network_labels = join(tmp_dir, "labels.txt")
    level_louvain = join(tmp_dir, "level.txt")
    output = join(tmp_dir, "output_louvain_")
    louvain = join(louvain_path, "louvain")
    convert = join(louvain_path, "convert")
    hierarchy = join(louvain_path, "hierarchy")
    output_louvain = dict()

    # Create a dictionary of all arguments
    louvain_args = {
        "net_txt": network_file,
        "net_bin": network_bin,
        "net_weight": network_weight,
        "net_tree": network_tree,
        "net_labels": network_labels,
        "level_file": level_louvain,
        "output": output,
        "level": 0,
        "iteration": 0,
        "convert": convert,
        "louvain": louvain,
        "hierarchy": hierarchy,
    }

    # Convert the file to a binary file for Louvain partitioning.
    cmd = (
        "{convert} -i {net_txt} -o {net_bin} -r {net_labels} -w {net_weight}"
    ).format(**louvain_args)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Create a dictionary of Louvain labels and original contig id.
    labels = dict()
    with open(louvain_args["net_labels"]) as label_file:
        for label in label_file:
            label = label.split()
            labels[label[1]] = int(label[0])

    # Run the iterations of Louvain
    for i in range(iterations):
        logger.info("Iteration in progress: {0}".format(i))

        louvain_args["iteration"] = i

        # Partition with weights using louvain and compute the bin tree.
        cmd = ("{louvain} {net_bin} -l -1 -w {net_weight} > {net_tree}").format(
            **louvain_args
        )
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()

        cmd = ("{hierarchy} {net_tree} > {level_file}").format(**louvain_args)
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()

        # Read the deepest level from the hierarchy output; this assumes the
        # last line starts with "level N" where N is a single digit.
        with open(level_louvain, "r") as level_file:
            louvain_args["level"] = level_file.readlines()[-1][6]

        cmd = (
            "{hierarchy} {net_tree} -l {level} > {output}{iteration}.txt"
        ).format(**louvain_args)
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()

        # Save the results in a dictionary
        if i == 0:
            with open(output + str(i) + ".txt", "r") as out:
                for line in out:
                    result = line.split(" ")
                    output_louvain[labels[result[0]]] = result[1][:-1]

        else:
            with open(output + str(i) + ".txt", "r") as out:
                for line in out:
                    result = line.split(" ")
                    output_louvain[labels[result[0]]] += ";" + result[1][:-1]

    return output_louvain
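
A sketch of consuming the returned mapping: each contig id maps to its
per-iteration community labels joined by semicolons (the values below are
illustrative):

output_louvain = {0: "1;1;2", 1: "1;1;1"}  # illustrative values
per_iteration = {cid: labels.split(";")
                 for cid, labels in output_louvain.items()}
print(per_iteration[0])  # ['1', '1', '2']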
Example #7
def partition(
    algorithm,
    assembly,
    cluster_matrix,
    contig_data_file,
    iterations,
    network_file,
    outdir,
    fasta_dir,
    overlapping_parameter,
    resolution_parameter,
    size,
    temp_directory,
    threads,
):
    """Function to call the others functions to partition the network.

    Parameters:
    -----------
    algorithm : str
        Algorithm to use to partition the network. Either leiden or louvain.
    assembly : str
        Path to the assembly file used for the partition.
    cluster_matrix : bool
        If True, build and save the clustering matrix.
    contig_data_file : str
        Path to the contig data table to update.
    iterations : int
        Number of iterations to use for the partition.
    network_file : str
        Path to the network file.
    outdir : str
        Path to the output directory where to write the output files.
    fasta_dir : str
        Path to directory where to write the fasta files.
    overlapping_parameter : int
        Hamming distance threshold to use to merge bins (percentage).
    resolution_parameter : float
        Resolution parameter to use if the Leiden algorithm is chosen. It
        scales the cost function used. A resolution parameter of 1 is
        equivalent to the modularity function used in Louvain. The higher the
        parameter, the smaller the output bins.
    size : int
        Threshold size in base pair of the output bins.
    temp_directory : str
        Path to the directory used to write temporary files.
    threads : int
        Number of threads to use.

    Returns:
    --------
    str:
        Path to the saved clustering matrix file, or None if the matrix was
        not built.
    str:
        Path to the new contig data file with the bin information in it.
    """

    # Create partition folders in the temporary directory
    temp_directory = join(temp_directory, "partition")
    os.makedirs(temp_directory, exist_ok=True)
    temp_directory_clustering = join(temp_directory, "clustering")
    os.makedirs(temp_directory_clustering, exist_ok=True)
    temp_directory_bins = join(temp_directory, "partition_bins")
    os.makedirs(temp_directory_bins, exist_ok=True)

    # Perform the iterations of Louvain or Leiden to partition the network.
    logger.info("Start iterations:")
    if algorithm == "leiden":
        LEIDEN_PATH = os.environ["LEIDEN_PATH"]
        output_partition = leiden_iterations_java(
            network_file,
            iterations,
            resolution_parameter,
            temp_directory_clustering,
            LEIDEN_PATH,
        )
    elif algorithm == "louvain":
        LOUVAIN_PATH = os.environ["LOUVAIN_PATH"]
        output_partition = louvain_iterations_cpp(
            network_file,
            iterations,
            temp_directory_clustering,
            LOUVAIN_PATH,
        )
    else:
        logger.error('algorithm should be either "louvain" or "leiden"')
        raise ValueError

    # Detect core bins
    logger.info("Detect core bins:")
    (
        core_bins_contigs,
        core_bins_iterations,
    ) = detect_core_bins(output_partition, iterations)

    # Compute the Hamming distance between core bins.
    logger.info("Detect overlapping bins:")
    hamming_distance = get_hamming_distance(
        core_bins_iterations,
        iterations,
        threads,
    )

    # Define the overlapping bins according to the threshold.
    overlapping_bins = defined_overlapping_bins(
        overlapping_parameter,
        hamming_distance,
        core_bins_contigs,
        core_bins_iterations,
    )

    # Update the contigs_data_file.
    logger.info("Extract bins:")
    contigs_data, contigs_data_file = update_contigs_data(
        contig_data_file,
        core_bins_contigs,
        overlapping_bins,
        outdir,
    )

    # Generate Fasta file
    generate_fasta(
        assembly,
        overlapping_bins,
        contigs_data,
        size,
        fasta_dir,
        temp_directory_bins,
    )

    if cluster_matrix:
        # Build clustering matrix and save it.
        logger.info("Build  clustering matrix")
        clustering_matrix = build_clustering_matrix(
            core_bins_contigs, hamming_distance, len(contigs_data.ID)
        )
        # Save the clustering matrix
        clustering_matrix_file = join(outdir, "clustering_matrix_partition")
        sparse.save_npz(clustering_matrix_file, clustering_matrix)
    else:
        clustering_matrix_file = None

    return clustering_matrix_file, contigs_data_file
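
The partition step resolves the algorithm binaries through environment
variables; a sketch of setting them before the call (both paths are
hypothetical):

import os

os.environ.setdefault("LOUVAIN_PATH", "/opt/louvain")  # hypothetical path
os.environ.setdefault("LEIDEN_PATH", "/opt/networkanalysis.jar")  # hypothetical path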
Example #8
def get_contact_pairs(
    for_in,
    rev_in,
    index,
    assembly,
    aligner,
    min_qual,
    start,
    depth_file,
    enzyme,
    out_dir,
    tmp_dir,
    n_cpu,
):
    """General function to do the whole alignment of both fastq.

    The Function write at the output directory location given as an argument and
    return a tsv file of the aligned reads with 9 columns: ReadID, ContigA,
    Position_startA, Position_endA, StrandA, ContigB, Position_startB,
    Position_endB, StrandB. The name of the file will be alignment.txt.

    Two start stages are possible, from fastq or bam files.

    Parameters:
    -----------
    for_in : str
        Path to the input forward fastq or bam file to align. If multiple
        files are given, list of paths separated by commas.
    rev_in : str
        Path to the input reverse fastq or bam file to align. If multiple
        files are given, list of paths separated by commas.
    index : str
        Path to the bowtie2 index of the assembly.
    assembly : str
        The initial assembly path acting as the alignment file's reference
        assembly.
    aligner : str
        Aligner used, or to be used, to map the reads. Either 'bowtie2' or
        'bwa'.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    start : str
        Either fastq or bam. Starting point for the pipeline.
    depth_file : str or None
        Path to the depth.txt file from jgi_summarize_bam_contig_depths from
        Metabat2 Software.
    enzyme : str or None
        String that contains the names of the enzyme separated by a comma.
    out_dir : str
        Path to directory where to write the output file.
    tmp_dir : str
        Path where temporary files should be written.
    n_cpu : int
        The number of CPUs to use for the alignment.

    Returns:
    --------
    list of str:
        List of paths to the files containing the alignment data of the
        pairs: ReadID, ContigA, Position_startA, Position_endA, StrandA,
        ContigB, Position_startB, Position_endB, StrandB.
    dict:
        Dictionary of all the contigs from the assembly; the contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still 0 and needs to
        be updated later.
    dict:
        Dictionary of hit information for each contig.
    """

    # Iterate on all the input files:
    for_list = for_in.split(",")
    rev_list = rev_in.split(",")
    out_file_list = []
    total_aligned_pairs = 0

    # Create the contig data dictionary and the hits from each alignment.
    nb_alignment = len(for_list)
    contig_data, hit_data = mtn.create_contig_data(assembly, nb_alignment,
                                                   depth_file, enzyme)

    for i in range(len(for_list)):
        for_in = for_list[i]
        try:
            rev_in = rev_list[i]
        except IndexError:
            rev_in = None
        name = "alignment_" + str(i)
        out_file = join(out_dir, "alignment_" + str(i) + ".pairs")
        out_file_list.append(out_file)

        # Align if necessary
        if start == "fastq":
            if aligner == "bowtie2":
                # Create files to save the alignment.
                alignment_for = join(out_dir, name + "_for.bam")
                alignment_rev = join(out_dir, name + "_rev.bam")

                # Align the forward reads
                logger.info("Alignment of %s:", for_in)
                align(for_in, index, aligner, alignment_for, n_cpu)

                # Align the reverse reads
                logger.info("Alignment of %s:", rev_in)
                align(rev_in, index, aligner, alignment_rev, n_cpu)
            elif aligner == "bwa":
                # Create a file to save the alignment.
                alignment = join(out_dir, name + ".bam")
                logger.info("Alignment of %s and %s:", for_in, rev_in)
                align(for_in, index, aligner, alignment, n_cpu, rev_in)

        elif start == "bam":
            if aligner == "bowtie2":
                logger.info("Processing %s and %s:", for_in, rev_in)
                alignment_for = for_in
                alignment_rev = rev_in
            elif aligner == "bwa":
                alignment = for_in

        else:
            logger.error("Start argument should be either 'fastq' or 'bam'.")
            raise ValueError

        if aligner == "bowtie2":
            # Create files to save the alignment.
            alignment_temp_for = join(tmp_dir, name + "_for_temp.txt")
            alignment_temp_rev = join(tmp_dir, name + "_rev_temp.txt")

            # Filter the aligned and non-aligned reads from the forward and
            # reverse bam files.
            aligned_reads_for = process_bamfile(alignment_for, min_qual,
                                                alignment_temp_for)
            aligned_reads_rev = process_bamfile(alignment_rev, min_qual,
                                                alignment_temp_rev)
            logger.info(
                "%s forward reads aligned and %s reverse reads aligned",
                aligned_reads_for,
                aligned_reads_rev,
            )

            # Merge alignments to create a pairs file.
            logger.info("Merging the pairs:")
            n_pairs = merge_alignment(alignment_temp_for, alignment_temp_rev,
                                      contig_data, out_file)
            logger.info("%s pairs aligned.", n_pairs)
            total_aligned_pairs += n_pairs

        # Case where a bam file from bwa is given as input.
        if aligner == "bwa":
            n_pairs = process_bwa_bamfile(alignment, min_qual, contig_data,
                                          out_file)
            logger.info("%s pairs aligned.", n_pairs)
            total_aligned_pairs += n_pairs

    if len(out_file_list) > 1:
        logger.info("TOTAL PAIRS MAPPED: %s", total_aligned_pairs)

    return out_file_list, contig_data, hit_data
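
The forward/reverse arguments accept comma-separated lists that are zipped
positionally, with a missing reverse file falling back to None; a sketch
of that pairing logic (the file names are hypothetical):

for_list = "lib1_R1.fq,lib2_R1.fq".split(",")  # hypothetical file names
rev_list = "lib1_R2.fq,lib2_R2.fq".split(",")
for i, fwd in enumerate(for_list):
    rev = rev_list[i] if i < len(rev_list) else None
    print(i, fwd, rev)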
Example #9
def process_bwa_bamfile(alignment, min_qual, contig_data, out_file):
    """Filter alignment BAM files

    Reads all the reads in the input BAM alignment file. Keep reads in the
    output if they are aligned with a good quality (greater than min quality
    threshold given) saving their only some columns: ReadID, Contig,
    Position_start, Position_end, strand to save memory.

    Parameters:
    -----------
    alignment : str
        Path to the input temporary alignment.
    min_qual : int
        Minimum mapping quality required to keep a Hi-C pair.
    contig_data : dict
        Dictionary of all the contigs from the assembly; the contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still 0 and needs to
        be updated later.
    out_file : str
        Path to the output pairs file.

    Returns:
    --------
    int:
        Number of pairs aligned.
    """

    # Read the bam file.
    n_pairs = 0
    save = pysam.set_verbosity(0)
    temp_bam = pysam.AlignmentFile(alignment, "rb", check_sq=False)
    pysam.set_verbosity(save)

    with open(out_file, "w") as merged:

        # Write header of the pairs file.
        merged.write("## pairs format v1.0\n")
        merged.write("#columns: readID chr1 pos1 chr2 pos2 strand1 strand2\n")
        merged.write("#sorted: readID\n")
        merged.write("#shape: upper triangle\n")
        for contig in contig_data:
            merged.write("#chromsize: {0} {1}\n".format(
                contig, contig_data[contig]["length"]))

        # Loop until the end of the file. Read the reads two by two, as the
        # forward and reverse reads should be interleaved.
        while True:
            try:
                for_read = next(temp_bam)
                while for_read.is_supplementary:
                    for_read = next(temp_bam)
                rev_read = next(temp_bam)
                while rev_read.is_supplementary:
                    rev_read = next(temp_bam)

                # Check mapping quality
                if (for_read.mapping_quality >= min_qual
                        and rev_read.mapping_quality >= min_qual):

                    # Check flag
                    if not (for_read.is_unmapped or rev_read.is_unmapped):
                        n_pairs += 1

                        # Safety check (forward and reverse are the same reads)
                        if for_read.query_name != rev_read.query_name:
                            logger.error(
                                "Reads should be paired - %s\t%s",
                                for_read.query_name,
                                rev_read.query_name,
                            )
                            raise ValueError

                        # Define pairs value.
                        name = for_read.query_name
                        contig1 = for_read.reference_name
                        contig2 = rev_read.reference_name
                        pos1 = for_read.pos + 1
                        pos2 = rev_read.pos + 1
                        strand1 = "+"
                        strand2 = "+"
                        if for_read.is_reverse:
                            strand1 = "-"
                        if rev_read.is_reverse:
                            strand2 = "-"

                        # Modify order to have an upper triangle and write
                        # the pair.
                        if (contig1 == contig2
                                and pos1 <= pos2) or contig_data[contig1][
                                    "id"] < contig_data[contig2]["id"]:
                            merged.write("\t".join([
                                name,
                                contig1,
                                str(pos1),
                                contig2,
                                str(pos2),
                                strand1,
                                strand2,
                            ]) + "\n")
                        else:
                            merged.write("\t".join([
                                name,
                                contig2,
                                str(pos2),
                                contig1,
                                str(pos1),
                                strand2,
                                strand1,
                            ]) + "\n")

            # Exit the loop if no more reads.
            except StopIteration:
                break

    # Close the bam file and return number of pairs
    temp_bam.close()
    return n_pairs
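
A toy check of the upper-triangle ordering rule applied above: a pair is
written as (A, B) when both sides sit on the same contig with pos1 <=
pos2, or when contig A has the smaller assembly id. Illustrative only; the
contig names and ids are made up:

contig_ids = {"NODE_1": 0, "NODE_2": 1}  # hypothetical contigs

def ordered(contig1, pos1, contig2, pos2):
    """Return the pair in upper-triangle order."""
    if (contig1 == contig2 and pos1 <= pos2) or (
            contig_ids[contig1] < contig_ids[contig2]):
        return (contig1, pos1, contig2, pos2)
    return (contig2, pos2, contig1, pos1)

print(ordered("NODE_2", 5, "NODE_1", 9))  # ('NODE_1', 9, 'NODE_2', 5)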
Example #10
    def execute(self):

        # Define the temporary directory.
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)
        overlapping_fasta_dir = join(self.args["--outdir"], "overlapping_bin")
        if not exists(overlapping_fasta_dir):
            os.makedirs(overlapping_fasta_dir)
        else:
            if self.args["--force"]:
                shutil.rmtree(overlapping_fasta_dir)
                os.makedirs(overlapping_fasta_dir)
            else:
                logger.error(
                    "%s already exists. Remove the directory or use the -F "
                    "argument to overwrite it.",
                    overlapping_fasta_dir,
                )
                raise ValueError

        # Enable file logging
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"], ("metator_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Define variables
        min_qual = int(self.args["--min-quality"])
        iterations = int(self.args["--iterations"])
        recursive_iterations = int(self.args["--rec-iter"])
        overlapping_parameter = int(self.args["--overlap"]) / 100
        recursive_overlapping_parameter = int(self.args["--rec-overlap"]) / 100
        size = int(self.args["--size"])
        threads = int(self.args["--threads"])
        resolution_parameter = float(self.args["--res-param"])

        # Check correct algorithm value
        if self.args["--algorithm"] not in ["louvain", "leiden"]:
            logger.error('algorithm should be either "louvain" or "leiden"')
            raise ValueError

        # Check that the normalization is in the list of possible normalizations.
        list_normalization = [
            "None",
            "abundance",
            "length",
            "RS",
            "empirical_hit",
            "theoritical_hit",
        ]
        if self.args["--normalization"] not in list_normalization:
            logger.error(
                'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
            )
            raise ValueError
        enzyme_required = ["RS", "theoritical_hit"]
        if (self.args["--normalization"] in enzyme_required
                and not self.args["--enzyme"]):
            logger.error(
                'For "RS" and "theoritical_hit" normalization, enzyme is required.'
            )
            raise ValueError
        depth_required = ["abundance", "theoritical_hit"]
        if (self.args["--normalization"] in depth_required
                and not self.args["--depth"]):
            logger.error(
                'For "abundance" and "theoritical_hit" normalization, depth is required.'
            )
            raise ValueError

        # Sanity check for validation
        if not self.args["--skip-validation"]:
            recursive_fasta_dir = join(self.args["--outdir"], "recursive_bin")
            if not exists(recursive_fasta_dir):
                os.makedirs(recursive_fasta_dir)
            else:
                if self.args["--force"]:
                    shutil.rmtree(recursive_fasta_dir)
                    os.makedirs(recursive_fasta_dir)
                else:
                    logger.error(
                        "%s already exists. Remove the directory or use the "
                        "-F argument to overwrite it.",
                        recursive_fasta_dir,
                    )
                    raise ValueError
            final_fasta_dir = join(self.args["--outdir"], "final_bin")
            if not exists(final_fasta_dir):
                os.makedirs(final_fasta_dir)
            else:
                if self.args["--force"]:
                    shutil.rmtree(final_fasta_dir)
                    os.makedirs(final_fasta_dir)
                else:
                    logger.error(
                        "%s already exists. Remove the directory or use the "
                        "-F argument to overwrite it.",
                        final_fasta_dir,
                    )
                    raise ValueError

            # Check checkM availability
            if not mio.check_checkm():
                logger.error(
                    "CheckM is not in the path. Cannot run the recursive "
                    "iterations.")
                raise NameError

        # Manage start point.
        if self.args["--start"] == "fastq":
            start = 1
        elif self.args["--start"] == "bam":
            start = 2
        elif self.args["--start"] == "pair":
            start = 3
        elif self.args["--start"] == "network":
            start = 4
        else:
            logger.error(
                "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
            )
            raise ValueError

        # Check if forward and reverse reads are given for fastq and bam start.
        if (self.args["--start"] == "fastq" or
            (self.args["--start"] == "bam" and self.args["--aligner"]
             == "bowtie2")) and not self.args["--reverse"]:
            logger.error(
                "Forward and reverse arguments are necessary with %s start "
                "and %s aligner.",
                self.args["--start"],
                self.args["--aligner"],
            )
            raise ValueError

        # Print information about the workflow:
        if start == 1:
            logger.info("Minimum mapping quality: %d", min_qual)
        if start <= 2:
            logger.info("Enzyme: %s", self.args["--enzyme"])
            logger.info("Normalization: %s", self.args["--normalization"])
        logger.info("Aligner algorithm: %s", self.args["--aligner"])
        logger.info("Partition algorithm: %s", self.args["--algorithm"])
        logger.info("Partition iterations: %s", iterations)
        logger.info("Overlapping parameter: %s", overlapping_parameter)
        if not self.args["--skip-validation"]:
            logger.info("Recursive partition iterations: %d",
                        recursive_iterations)
            logger.info(
                "Recursive overlapping parameter: %s",
                recursive_overlapping_parameter,
            )

        # Extract index and genome file
        assembly = self.args["--assembly"]
        # Check what the reference is. If a fasta is given, build the index.
        # If a bowtie2 index is given, retrieve the fasta.
        index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
        if index is None:
            if mio.check_is_fasta(assembly):
                fasta = assembly
                if start == 1:
                    index = mio.generate_fasta_index(fasta,
                                                     self.args["--aligner"],
                                                     tmp_dir)
            else:
                logger.error(
                    "Please give as assembly argument a bowtie2 index or a fasta."
                )
                raise ValueError
        else:
            fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

        # Run the whole workflow
        if start <= 3:
            if start <= 2:
                # Align pair-end reads with bowtie2
                alignment_files, contig_data, hit_data = mta.get_contact_pairs(
                    self.args["--forward"],
                    self.args["--reverse"],
                    index,
                    fasta,
                    self.args["--aligner"],
                    min_qual,
                    self.args["--start"],
                    self.args["--depth"],
                    self.args["--enzyme"],
                    self.args["--outdir"],
                    tmp_dir,
                    self.args["--threads"],
                )
            else:
                alignment_files = self.args["--forward"].split(",")
                nb_alignment = len(alignment_files)
                contig_data, hit_data = mtn.create_contig_data(
                    fasta,
                    nb_alignment,
                    self.args["--depth"],
                    self.args["--enzyme"],
                )
            # Build the network
            network_file, contigs_data_file = mtn.alignment_to_contacts(
                alignment_files,
                contig_data,
                hit_data,
                self.args["--outdir"],
                "network.txt",
                "contig_data_network.txt",
                tmp_dir,
                self.args["--threads"],
                self.args["--normalization"],
                False,
            )
        else:
            contigs_data_file = self.args["--contigs"]
            network_file = self.args["--network"]

        # Partition the network
        clustering_matrix_partition_file, contigs_data_file = mtp.partition(
            self.args["--algorithm"],
            fasta,
            self.args["--cluster-matrix"],
            contigs_data_file,
            iterations,
            network_file,
            self.args["--outdir"],
            overlapping_fasta_dir,
            overlapping_parameter,
            resolution_parameter,
            size,
            tmp_dir,
            threads,
        )

        # Remove contig_data_network if it is not an input.
        if start <= 2:
            contig_data_network_file = join(self.args["--outdir"],
                                            "contig_data_network.txt")
            os.remove(contig_data_network_file)

        # Launch validation if desired.
        if not self.args["--skip-validation"]:
            clustering_matrix_recursive_file = mtv.recursive_decontamination(
                self.args["--algorithm"],
                fasta,
                self.args["--cluster-matrix"],
                contigs_data_file,
                final_fasta_dir,
                overlapping_fasta_dir,
                recursive_iterations,
                network_file,
                self.args["--outdir"],
                recursive_overlapping_parameter,
                recursive_fasta_dir,
                resolution_parameter,
                size,
                tmp_dir,
                threads,
            )

            if self.args["--cluster-matrix"]:
                # Sum it with the partition clustering matrix and save it.
                clustering_matrix = load_npz(clustering_matrix_partition_file +
                                             ".npz")
                clustering_matrix_recursive = load_npz(
                    clustering_matrix_recursive_file + ".npz")
                clustering_matrix = (
                    (clustering_matrix + clustering_matrix_recursive) /
                    2).tocoo()
                clustering_matrix_file = join(self.args["--outdir"],
                                              "clustering_matrix")
                save_npz(clustering_matrix_file, clustering_matrix)

            # Remove contig_data_partition file
            contig_data_partition_file = join(self.args["--outdir"],
                                              "contig_data_partition.txt")
            os.remove(contig_data_partition_file)

        # Delete pyfastx index:
        os.remove(fasta + ".fxi")
        # Delete the temporary folder.
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)
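
The create-or-overwrite pattern used for the output folders above,
factored as a hedged helper sketch (fresh_dir is not part of the metator
API):

import os
import shutil

def fresh_dir(path, force=False):
    """Create `path`; wipe it first if `force`, fail if it already exists."""
    if os.path.exists(path):
        if not force:
            raise ValueError(
                "{0} already exists; remove it or pass force=True.".format(
                    path))
        shutil.rmtree(path)
    os.makedirs(path)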
Example #11
    def execute(self):

        # Define the temporary directory.
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)
        recursive_fasta_dir = join(self.args["--outdir"], "recursive_bin")
        final_fasta_dir = join(self.args["--outdir"], "final_bin")
        if not exists(recursive_fasta_dir):
            os.makedirs(recursive_fasta_dir)
        else:
            if self.args["--force"]:
                shutil.rmtree(recursive_fasta_dir)
                os.makedirs(recursive_fasta_dir)
            else:
                logger.error(
                    "%s already exists. Remove the directory or use the -F "
                    "argument to overwrite it.",
                    recursive_fasta_dir,
                )
                raise ValueError
        if not exists(final_fasta_dir):
            os.makedirs(final_fasta_dir)
        else:
            if self.args["--force"]:
                shutil.rmtree(final_fasta_dir)
                os.makedirs(final_fasta_dir)
            else:
                logger.error(
                    "%s already exists. Remove the directory or use the -F "
                    "argument to overwrite it.",
                    final_fasta_dir,
                )
                raise ValueError

        # Enable file logging
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"],
                        ("metator_validation_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Transform numeric variables to numeric types.
        iterations = int(self.args["--iterations"])
        size = int(self.args["--size"])
        threads = int(self.args["--threads"])
        overlapping_parameter = int(self.args["--overlap"]) / 100
        resolution_parameter = float(self.args["--res-param"])

        # Check CheckM availability
        if not mio.check_checkm():
            logger.error(
                "CheckM is not in the path. Cannot run the recursive "
                "iterations.")
            raise NameError

        # Check correct algorithm value
        if self.args["--algorithm"] not in ["louvain", "leiden"]:
            logger.error('algorithm should be either "louvain" or "leiden"')
            raise ValueError

        _clustering_matrix_file = mtv.recursive_decontamination(
            self.args["--algorithm"],
            self.args["--assembly"],
            self.args["--cluster-matrix"],
            self.args["--contigs"],
            final_fasta_dir,
            self.args["--fasta"],
            iterations,
            self.args["--network"],
            self.args["--outdir"],
            overlapping_parameter,
            recursive_fasta_dir,
            resolution_parameter,
            size,
            tmp_dir,
            threads,
        )

        # Delete pyfastx index:
        os.remove(self.args["--assembly"] + ".fxi")
        # Delete the temporary folder
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)
Example #12
    def execute(self):

        # Define the temporary directory.
        if not self.args["--tmpdir"]:
            tmp_dir = mio.generate_temp_dir("./tmp")
        else:
            tmp_dir = self.args["--tmpdir"]
            os.makedirs(tmp_dir, exist_ok=True)

        # Define the output directory and output file names.
        if not self.args["--outdir"]:
            self.args["--outdir"] = "."
        os.makedirs(self.args["--outdir"], exist_ok=True)

        # Enable file logging
        now = time.strftime("%Y%m%d%H%M%S")
        log_file = join(self.args["--outdir"],
                        ("metator_network_" + now + ".log"))
        mtl.set_file_handler(log_file)

        # Transform integer variables to integers.
        min_qual = int(self.args["--min-quality"])

        # Define boolean variables:
        self_contacts = self.args["--self-contacts"]

        # Check if forward and reverse arguments are given:
        if (self.args["--start"] == "fastq" or
            (self.args["--start"] == "bam" and self.args["--aligner"]
             == "bowtie2")) and not self.args["--reverse"]:
            logger.error(
                "Forward and reverse arguments are necessary with %s start "
                "and %s aligner.",
                self.args["--start"],
                self.args["--aligner"],
            )
            raise ValueError

        # Check that the normalization is in the list of possible normalizations.
        list_normalization = [
            "None",
            "abundance",
            "length",
            "RS",
            "empirical_hit",
            "theoritical_hit",
        ]
        if self.args["--normalization"] not in list_normalization:
            logger.error(
                'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
            )
            raise ValueError
        enzyme_required = ["RS", "theoritical_hit"]
        if (self.args["--normalization"] in enzyme_required
                and not self.args["--enzyme"]):
            logger.error(
                'For "RS" and "theoritical_hit" normalization, enzyme is required.'
            )
            raise ValueError
        depth_required = ["abundance", "theoritical_hit"]
        if (self.args["--normalization"] in depth_required
                and not self.args["--depth"]):
            logger.error(
                'For "abundance" and "theoritical_hit" normalization, depth is required.'
            )
            raise ValueError
        if self.args["--start"] not in ["fastq", "bam", "pair", "network"]:
            logger.error(
                "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
            )
            raise ValueError
        # Extract index and genome file
        assembly = self.args["--assembly"]
        # Check what the reference is. If a fasta is given, build the index.
        # If a bowtie2 index is given, retrieve the fasta.
        index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
        if index is None:
            if mio.check_is_fasta(assembly):
                fasta = assembly
                # If start at bam could skip the index generation.
                if self.args["--start"] == "fastq":
                    index = mio.generate_fasta_index(fasta,
                                                     self.args["--aligner"],
                                                     tmp_dir)
            else:
                logger.error(
                    "Please give as assembly argument a %s index or a fasta.",
                    self.args["--aligner"],
                )
                raise ValueError
        else:
            fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

        # Print information about the workflow:
        logger.info("Aligner algorithm: %s", self.args["--aligner"])
        logger.info("Enzyme: %s", self.args["--enzyme"])
        logger.info("Normalization: %s", self.args["--normalization"])
        logger.info("Minimum mapping quality: %s", self.args["--min-quality"])

        # Do not align if pair start
        if self.args["--start"] == "pair":
            alignment_files = self.args["--forward"].split(",")
            nb_alignment = len(alignment_files)
            contig_data, hit_data = mtn.create_contig_data(
                fasta,
                nb_alignment,
                self.args["--depth"],
                self.args["--enzyme"],
            )

        else:
            # Align pair-end reads with bowtie2
            alignment_files, contig_data, hit_data = mta.get_contact_pairs(
                self.args["--forward"],
                self.args["--reverse"],
                index,
                fasta,
                self.args["--aligner"],
                min_qual,
                self.args["--start"],
                self.args["--depth"],
                self.args["--enzyme"],
                self.args["--outdir"],
                tmp_dir,
                self.args["--threads"],
            )

        # Build the network
        mtn.alignment_to_contacts(
            alignment_files,
            contig_data,
            hit_data,
            self.args["--outdir"],
            "network.txt",
            "contig_data_network.txt",
            tmp_dir,
            self.args["--threads"],
            self.args["--normalization"],
            self_contacts,
        )

        # Delete the temporary folder
        if not self.args["--no-clean-up"]:
            shutil.rmtree(tmp_dir)