def generate_fasta_index(fasta, aligner, outdir):
    """Generate fasta index.

    Parameters:
    -----------
    fasta : str
        Path to the fasta reference to index.
    aligner : str
        Aligner to use to build the index.
    outdir : str
        Path to the directory where to write the index.

    Returns:
    --------
    str:
        Path to the built index.
    """
    logger.info("Build index from the given fasta.")
    index = join(outdir, "index")
    if aligner == "bowtie2":
        cmd = "bowtie2-build -q {0} {1}".format(fasta, index)
    elif aligner == "bwa":
        cmd = "bwa index -p {1} {0}".format(fasta, index)
    process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
    _out, _err = process.communicate()
    return index
def leiden_iterations_java(
    network_file, iterations, resolution_parameter, tmp_dir, leiden_path
):
    """Use the java implementation of Leiden to partition the network.

    Parameters:
    -----------
    network_file : str
        Path to the network computed previously. The file is a tab-separated
        table with three columns: the id of the first contig, the id of the
        second contig, and the weight of the edge (normalized or not).
    iterations : int
        Number of iterations of the Leiden algorithm.
    resolution_parameter : float
        Resolution parameter for the Leiden clustering.
    tmp_dir : str
        Path to the temporary directory.
    leiden_path : str
        Path to the directory with the networkanalysis java implementation.

    Returns:
    --------
    dict:
        Dictionary with the id of the contig as key and the results of each
        iteration, separated by semicolons, as value.
    """
    output_partition = dict()

    # Run the iterations of Leiden
    for i in range(iterations):
        logger.info("Iteration in progress: {0}".format(i))
        output = join(tmp_dir, "partition_{0}.txt".format(i))

        # Cluster the network using Leiden.
        cmd = (
            "java -cp {0} nl.cwts.networkanalysis.run.RunNetworkClustering -i 4 -r {1} -w -o {2} -q Modularity -a Leiden {3}"
        ).format(leiden_path, resolution_parameter, output, network_file)
        process = sp.Popen(cmd, shell=True, stderr=sp.DEVNULL)
        process.communicate()

        # Save the results in a dictionary
        if i == 0:
            with open(output, "r") as out:
                for line in out:
                    result = line.split("\t")
                    output_partition[int(result[0])] = result[1][:-1]
        else:
            with open(output, "r") as out:
                for line in out:
                    result = line.split("\t")
                    output_partition[int(result[0])] += ";" + result[1][:-1]

    # Remove isolates (nodes with no contacts):
    output_partition.pop(0)
    output_partition = remove_isolates(output_partition, network_file)
    return output_partition
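# NOTE: hypothetical sketch, not part of the original module. It only
# illustrates the shape of the dictionary returned by leiden_iterations_java()
# (and louvain_iterations_cpp()): one entry per contig id, whose value
# concatenates the cluster label of every iteration with ";". The contig ids
# and labels below are invented.
def _example_partition_output():
    """Toy value mimicking three Leiden/Louvain iterations on four contigs."""
    output_partition = {
        1: "0;0;0",  # contig 1 got label 0 in all three iterations
        2: "0;0;0",  # identical labels as contig 1 -> same core bin later on
        3: "2;1;1",
        4: "2;1;5",  # agrees with contig 3 in two of the three iterations
    }
    return output_partition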
def detect_core_bins(output_partition, iterations):
    """Detect core bins from the output of the partition algorithm.

    The function searches for duplicated values in the output of the Louvain
    or Leiden algorithm in order to find contigs which are always in the same
    bin. The bins found with this method are called the core bins.

    Parameters:
    -----------
    output_partition : dict
        Dictionary with the id of the contig as key and the results of each
        iteration, separated by semicolons, as value.
    iterations : int
        Number of iterations made previously with the partition algorithm.

    Returns:
    --------
    dict:
        Dictionary which has the core bin ids as keys and the list of ids of
        their contigs as values.
    pandas.core.frame.DataFrame:
        Table with the id of the core bins and their cluster label at each
        iteration.
    """
    # Find duplicate values in the output of Louvain or Leiden using a flipped
    # dictionary.
    # Create dictionaries for the core bins
    core_bins = {}
    core_bins_contigs = {}
    core_bins_iterations = np.empty((0, iterations), int)
    core_bin_id = 0
    for key, value in output_partition.items():
        if value not in core_bins:
            # Create an entry in a dictionary with the iteration string as a
            # key and the core bin id as a value.
            core_bins[value] = core_bin_id
            # Create an entry in a dictionary with the core bin id as a key
            # and the list of its contigs as a value.
            core_bins_contigs[core_bin_id] = [key]
            core_bin_id += 1
            # Add a line to the array used to compute the distance between two
            # core bins.
            core_bins_iterations = np.append(
                core_bins_iterations,
                np.array([list(map(int, value.split(";")))]),
                axis=0,
            )
        # If an entry has already been created for this bin, add the contig to
        # its list.
        else:
            core_bins_contigs[core_bins[value]].append(key)

    # Transform the array into a dataframe
    core_bins_iterations = pd.DataFrame(core_bins_iterations)
    logger.info("{0} core bins were found.\n".format(len(core_bins)))
    return core_bins_contigs, core_bins_iterations
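# NOTE: hypothetical usage sketch, not part of the original module. The toy
# 3-iteration partition below is invented; it only shows how identical
# iteration strings are collapsed into a single core bin by detect_core_bins().
def _example_detect_core_bins():
    """Group a toy 3-iteration partition into core bins."""
    toy_partition = {1: "0;0;0", 2: "0;0;0", 3: "2;1;1", 4: "2;1;5"}
    core_bins_contigs, core_bins_iterations = detect_core_bins(toy_partition, 3)
    # Contigs 1 and 2 always share the same labels -> one core bin (id 0).
    # Contigs 3 and 4 differ in the last iteration -> two distinct core bins.
    print(core_bins_contigs)     # {0: [1, 2], 1: [3], 2: [4]}
    print(core_bins_iterations)  # 3x3 DataFrame of the iteration labels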
def retrieve_fasta(in_file, aligner, tmpdir):
    """Function to retrieve the fasta from the given reference file.

    If an index is given, retrieve the fasta from it (using bowtie2-inspect
    for a bowtie2 index). Raise an error if the file is neither a fasta nor a
    valid index.

    Parameters:
    -----------
    in_file : str
        Path to the reference file given.
    aligner : str
        Name of the aligner used. Either 'bowtie2' or 'bwa'.
    tmpdir : str
        Path to the temp directory to write the fasta if necessary.

    Returns:
    --------
    str:
        Path to the fasta file.
    """
    if check_is_fasta(in_file):
        fasta = in_file
    else:
        if check_fasta_index(in_file, aligner):
            if aligner == "bowtie2":
                logger.info("Retrieve fasta from bowtie2 index.")
                fasta = join(tmpdir, "assembly.fa")
                cmd = "bowtie2-inspect {0} > {1}".format(in_file, fasta)
                process = sp.Popen(cmd, shell=True, stdout=sp.PIPE)
                _out, _err = process.communicate()
            elif aligner == "bwa":
                if isfile(in_file + ".fa"):
                    if check_is_fasta(in_file + ".fa"):
                        fasta = in_file + ".fa"
                elif isfile(in_file + ".fasta"):
                    if check_is_fasta(in_file + ".fasta"):
                        fasta = in_file + ".fasta"
                else:
                    logger.error(
                        "If you give a bwa index, please make sure the fasta exists with the same prefix."
                    )
                    raise ValueError
        else:
            logger.error(
                "Please give as a reference a bowtie2 index or a fasta."
            )
            raise ValueError
    return fasta
def generate_contact_map(
    assembly,
    contig_data_file,
    enzyme,
    name,
    pairs,
    out_dir,
    tmp_dir,
    filter_events=False,
    force=False,
    mat_fmt="graal",
    metator_object="final_bin",
    min_size=5000,
    pcr_duplicates=False,
    threads=1,
):
    """General function to extract the pairs of a MetaTOR object and generate
    its contact map.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file containing the contigs of interest. Could be
        the whole assembly or the extracted contigs of one bin.
    contig_data_file : str
        Path to the contig_data_final.txt file from the MetaTOR output.
    enzyme : str
        Enzyme used to digest the genome in the HiC experiment. Example:
        HpaII,MluCI.
    name : str
        Name of the object. Could be the name of a contig, the id of a bin or
        the name of the bin. Example: "NODE_1" or "MetaTOR_1_0".
    pairs : str
        Path of the ".pairs" file or bgzip indexed pair file. If more than one
        is given, files should be separated by a comma.
    out_dir : str
        Path where output files should be written. Current directory by
        default.
    tmp_dir : str
        Path where temporary files will be written.
    filter_events : bool
        Filter spurious or uninformative 3C events. Requires a restriction
        enzyme. Default: False.
    force : bool
        If True, overwrite existing files with the same name as output.
        Default: False.
    mat_fmt : str
        Select the output matrix format. Can be either "bg2" for the bedgraph2
        format, "cool" for Mirnylab's cool format, or "graal" for a plain text
        COO format compatible with Koszullab's instagraal software.
        Default: "graal".
    metator_object : str
        Object to extract contigs to build the matrix. Either "contig",
        "core_bin", "overlapping_bin", "recursive_bin", "final_bin" or
        "other".
    min_size : int
        Minimum contig size required to keep it.
    pcr_duplicates : bool
        If True, PCR duplicates will be filtered based on genomic positions.
        Pairs where both reads have exactly the same coordinates are
        considered duplicates and only one of those will be conserved.
        Default: False.
    threads : int
        Number of threads to use. Default: 1.
    """
    # Extract bin information from the metaTOR outdir.
    logger.info("Generate HiC contact map for %s", name)
    metator_data = MetatorObject(
        metator_object, name, assembly, contig_data_file, pairs, min_size
    )
    metator_data.set_contigs()
    if min_size > 0:
        metator_data.set_large_contigs()
    metator_data.write_fasta(tmp_dir, out_dir)
    metator_data.pairs = join(tmp_dir, name + ".pairs")

    # Extract the pairs of the bin.
    n_pairs = extract_pairs(metator_data)
    if n_pairs == 0:
        logger.info("No pairs have been extracted.")
    else:
        logger.info("%d pairs have been extracted.", n_pairs)
        # Launch the hicstuff pipeline.
        hcp.full_pipeline(
            genome=metator_data.fasta,
            input1=metator_data.pairs,
            distance_law=False,
            enzyme=enzyme,
            filter_events=filter_events,
            force=force,
            mat_fmt=mat_fmt,
            out_dir=out_dir,
            pcr_duplicates=pcr_duplicates,
            plot=False,
            start_stage="pairs",
            threads=threads,
            tmp_dir=tmp_dir,
        )
    return n_pairs
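# NOTE: hypothetical usage sketch, not part of the original module. All paths
# and the bin name below are invented and the call would only succeed with
# real MetaTOR outputs; it simply shows how generate_contact_map() is meant to
# be invoked on a single final bin.
def _example_generate_contact_map():
    """Sketch of a call building the contact map of one MetaTOR bin."""
    n_pairs = generate_contact_map(
        assembly="assembly.fa",                  # hypothetical path
        contig_data_file="contig_data_final.txt",
        enzyme="HpaII,MluCI",
        name="MetaTOR_1_0",
        pairs="alignment_0.pairs",
        out_dir="contact_map",
        tmp_dir="tmp",
        min_size=5000,
    )
    print(n_pairs)  # number of Hi-C pairs falling inside the bin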
def recursive_decontamination(
    algorithm,
    assembly,
    cluster_matrix,
    contig_data_file,
    final_fasta_dir,
    input_fasta_dir,
    iterations,
    network_file,
    outdir,
    overlapping_parameter,
    recursive_fasta_dir,
    resolution_parameter,
    size,
    temp_directory,
    threads,
):
    """Function to validate the bins and do the recursive decontamination
    using the Louvain or Leiden algorithm.

    Parameters:
    -----------
    algorithm : str
        Algorithm to use to recursively partition the network. Either "leiden"
        or "louvain".
    assembly : str
        Path to the assembly file used for the partition.
    cluster_matrix : bool
        If True, build the clustering matrix and save it.
    contig_data_file : str
        Path to the contig data table to update.
    final_fasta_dir : str
        Path where to write the final decontaminated fasta bins.
    input_fasta_dir : str
        Path to the directory where the fasta bins from the partition are.
    iterations : int
        Number of iterations to use for the recursive partition.
    network_file : str
        Path to the network file.
    outdir : str
        Path to the output directory where to write the output files.
    overlapping_parameter : int
        Hamming distance threshold, in percentage, used to consider two bins
        as one in the recursive partition.
    recursive_fasta_dir : str
        Path where to write the fasta of the decontaminated bins.
    resolution_parameter : float
        Resolution parameter to use if the Leiden algorithm is chosen. It will
        be a factor of the cost function used. A resolution parameter of 1 is
        equivalent to the modularity function used in Louvain. The higher the
        resolution parameter, the smaller the output bins.
    size : int
        Threshold size in base pairs of the output bins.
    temp_directory : str
        Path to the directory used to write temporary files.
    threads : int
        Number of threads to use.

    Returns:
    --------
    str:
        Path to the saved recursive clustering matrix (.npz), or None if
        cluster_matrix is False.
    """
    # Create folders in the temporary directory
    tmpdir_checkm = join(temp_directory, "checkm")
    os.makedirs(tmpdir_checkm, exist_ok=True)
    tmpdir_recursive_clustering = join(temp_directory, "recursive_clustering")
    os.makedirs(tmpdir_recursive_clustering, exist_ok=True)

    # Define the checkM output file paths
    overlapping_checkm_file = join(outdir, "overlapping_checkm_results.txt")
    overlapping_taxonomy_file = join(outdir, "overlapping_checkm_taxonomy.txt")
    recursive_checkm_file = join(outdir, "recursive_checkm_results.txt")
    recursive_taxonomy_file = join(outdir, "recursive_checkm_taxonomy.txt")

    # Launch checkM
    checkm(
        input_fasta_dir,
        overlapping_checkm_file,
        overlapping_taxonomy_file,
        tmpdir_checkm,
        threads,
    )

    # Iterate Louvain or Leiden on contaminated and complete bins.
    contamination, contigs_data, clustering_matrix_file = recursive_clustering(
        assembly,
        iterations,
        overlapping_parameter,
        resolution_parameter,
        outdir,
        recursive_fasta_dir,
        algorithm,
        tmpdir_recursive_clustering,
        overlapping_checkm_file,
        overlapping_taxonomy_file,
        contig_data_file,
        network_file,
        cluster_matrix,
        size,
        threads,
    )

    # Recursive iterations of Louvain or Leiden have been run on the
    # contaminated bins. Keep the new bin information if the new bins improve
    # the quality, otherwise keep the original bin information.
    if contamination:
        # Run checkM on the recursive bins.
        tmpdir_checkm = join(temp_directory, "checkm2")
        checkm(
            recursive_fasta_dir,
            recursive_checkm_file,
            recursive_taxonomy_file,
            tmpdir_checkm,
            threads,
        )
        # Compare the overlapping and recursive bins and keep the best ones.
        bin_summary = compare_bins(
            overlapping_checkm_file,
            overlapping_taxonomy_file,
            recursive_checkm_file,
            recursive_taxonomy_file,
        )
    # Keep the overlapping bin information
    else:
        logger.info("No contaminated bin have been found.")
        bin_summary = mio.read_results_checkm(
            overlapping_checkm_file, overlapping_taxonomy_file
        )

    # Create the fasta directory and copy the final bins.
    for bin_name in bin_summary:
        dst = join(final_fasta_dir, bin_name + ".fa")
        if bin_name.split("_")[2] == "0":
            src = join(input_fasta_dir, bin_name + ".fa")
        else:
            src = join(recursive_fasta_dir, bin_name + ".fa")
        shutil.copyfile(src, dst)

    # Report some values about the efficiency of the binning.
    give_results_info(bin_summary)

    # Write relevant bins/contigs information for anvio.
    binning_file = join(outdir, "binning.txt")
    contigs_data = write_bins_contigs(bin_summary, contigs_data, binning_file)

    # Compute the abundance of the MAGs.
    bin_summary = get_bin_coverage(bin_summary, contigs_data)

    # Save the bin information in the final file
    bin_summary_file = join(outdir, "bin_summary.txt")
    mio.write_checkm_summary(bin_summary, bin_summary_file)

    # Write the new contig data file
    contig_data_file_final = join(outdir, "contig_data_final.txt")
    contigs_data.to_csv(
        contig_data_file_final, sep="\t", header=True, index=False
    )

    # Plot some figures of the contigs distribution inside the bins:
    mtf.plot_figures(outdir, contigs_data, bin_summary, size)

    return clustering_matrix_file
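# NOTE: hedged sketch, not part of the original module; the bin name is
# invented. It only illustrates the naming rule used above when copying the
# final bins: a third underscore-separated field equal to "0" means the bin
# was not split by the recursive step, so its fasta is taken from the
# overlapping-bin directory rather than the recursive one.
def _example_final_bin_source():
    """Show which directory a final bin fasta would be copied from."""
    bin_name = "MetaTOR_12_0"
    from_overlapping = bin_name.split("_")[2] == "0"
    print(from_overlapping)  # True -> copy from input_fasta_dir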
def checkm(fasta_dir, outfile, taxonomy_file, tmpdir, threads):
    """Function to evaluate fasta bins using CheckM. Write the CheckM results
    summary in the outfile and the taxonomy results in the taxonomy file.

    Parameters:
    -----------
    fasta_dir : str
        Path to the input fasta of the bins to evaluate.
    outfile : str
        Path to the file where the results of checkm will be written.
    taxonomy_file : str
        Path to the file where the checkm taxonomy results will be written.
    tmpdir : str
        Path to the temporary directory where CheckM intermediary files will
        be written.
    threads : int
        Number of threads to use for CheckM.
    """
    logger.info("Start CheckM validation.")

    # Build the CheckM tree
    cmd = "checkm tree -q -t {0} -x fa {1} {2}".format(
        threads, fasta_dir, tmpdir
    )
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Build the taxonomy values of the bins
    cmd = "checkm tree_qa {0} -q -o 1 -f {1}".format(tmpdir, taxonomy_file)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Build the lineage marker set
    markers_set = join(tmpdir, "markers.txt")
    cmd = "checkm lineage_set -q {0} {1}".format(tmpdir, markers_set)
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Compute the analysis
    cmd = "checkm analyze -q -x fa -t {0} {1} {2} {3}".format(
        threads, markers_set, fasta_dir, tmpdir
    )
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Write the summary file
    cmd = "checkm qa -q {0} {1} -o 2 > {2}".format(
        markers_set, tmpdir, outfile
    )
    logger.info(cmd)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()
def recursive_clustering(
    assembly,
    iterations,
    overlapping_parameter,
    resolution_parameter,
    outdir,
    recursive_fasta_dir,
    algorithm,
    tmpdir,
    checkm_file,
    taxonomy_file,
    contigs_data_file,
    network_file,
    cluster_matrix,
    size,
    threads,
):
    """Function to run recursive iterations on the contaminated bins in order
    to try to improve the quality of the bins using the Louvain or Leiden
    algorithm.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file used as assembly.
    iterations : int
        Number of iterations to use for the recursive iterations of Louvain or
        Leiden.
    overlapping_parameter : float
        Hamming distance threshold to consider two bins as the same bin.
    resolution_parameter : float
        Resolution parameter of the Leiden algorithm.
    outdir : str
        Path to the output directory.
    recursive_fasta_dir : str
        Path to the directory where to write the decontaminated fasta.
    algorithm : str
        Algorithm to use, either "louvain" or "leiden".
    tmpdir : str
        Path to the temporary directory.
    checkm_file : str
        Path to the output file of CheckM from the checkm function.
    taxonomy_file : str
        Path to the taxonomy CheckM file.
    contigs_data_file : str
        Path to the contigs data file from metator partition.
    network_file : str
        Path to the network file from metator network.
    cluster_matrix : bool
        If True, build the clustering matrix and save it.
    size : int
        Size threshold in base pairs of the bins.
    threads : int
        Number of threads to use.

    Returns:
    --------
    boolean:
        True if at least one new recursive bin has been generated.
    pandas.DataFrame:
        Updated contigs data table with the recursive bin information.
    str:
        Path to the saved recursive clustering matrix (.npz), or None if
        cluster_matrix is False.
    """
    # Create temporary folders
    tmpdir_subnetwork = join(tmpdir, "recursive_bins")
    os.makedirs(tmpdir_subnetwork, exist_ok=True)
    tmpdir_clustering = join(tmpdir, "recursive_clustering")
    os.makedirs(tmpdir_clustering, exist_ok=True)
    tmpdir_binning = join(tmpdir, "recursive_bins")
    os.makedirs(tmpdir_binning, exist_ok=True)

    # Load the CheckM result:
    checkm_summary = mio.read_results_checkm(checkm_file, taxonomy_file)

    # Load the network:
    network = nx.read_edgelist(
        network_file, nodetype=int, data=(("weight", float),)
    )

    # Load the contigs data:
    contigs_data = pd.read_csv(
        contigs_data_file, sep="\t", header=0, index_col=False
    )

    # Add new columns for the recursive information.
    contigs_data["Recursive_bin_ID"] = "0"
    contigs_data["Recursive_bin_contigs"] = "-"
    contigs_data["Recursive_bin_size"] = "-"
    contigs_data["Final_bin"] = "ND"

    # Default: no contamination
    contamination = False

    # Create an empty matrix
    N = len(contigs_data.ID)
    clustering_matrix = sparse.coo_matrix((N + 1, N + 1), dtype=np.float32)

    # Iterate on the checkm summary to find the contaminated bins:
    for bin_id in checkm_summary:
        if (float(checkm_summary[bin_id]["completness"]) >= 50) & (
            float(checkm_summary[bin_id]["contamination"]) >= 5
        ):
            logger.info("Bin in progress: {0}".format(bin_id))
            subnetwork_file = join(
                tmpdir_subnetwork, "subnetwork_" + bin_id + ".txt"
            )
            bin_id = str(bin_id.split("_")[1])

            # Extract the contigs of the bin
            mask = contigs_data["Overlapping_bin_ID"].apply(str) == bin_id
            list_contigs = list(contigs_data.loc[mask, "ID"])

            # Extract the subnetwork
            subnetwork = network.subgraph(list_contigs)

            # Write the new subnetwork
            nx.write_edgelist(
                subnetwork, subnetwork_file, delimiter="\t", data=["weight"]
            )

            # Stop reporting the info log
            logger.setLevel(logging.WARNING)

            # Use the Louvain or Leiden algorithm on the subnetwork.
            if algorithm == "leiden":
                LEIDEN_PATH = os.environ["LEIDEN_PATH"]
                output_partition = mtp.leiden_iterations_java(
                    subnetwork_file,
                    iterations,
                    resolution_parameter,
                    tmpdir_clustering,
                    LEIDEN_PATH,
                )
            elif algorithm == "louvain":
                LOUVAIN_PATH = os.environ["LOUVAIN_PATH"]
                output_partition = mtp.louvain_iterations_cpp(
                    subnetwork_file,
                    iterations,
                    tmpdir_clustering,
                    LOUVAIN_PATH,
                )
            else:
                logger.error(
                    'algorithm should be either "louvain" or "leiden"'
                )
                raise ValueError

            # Detect the core bins
            (
                recursive_core_bins,
                recursive_bins_iterations,
            ) = mtp.detect_core_bins(output_partition, iterations)

            # Compute the Hamming distance between core bins.
            hamming_distance = mtp.get_hamming_distance(
                recursive_bins_iterations,
                iterations,
                threads,
            )

            # Define the overlapping bins according to the threshold
            recursive_bins = mtp.defined_overlapping_bins(
                overlapping_parameter,
                hamming_distance,
                recursive_core_bins,
                recursive_bins_iterations,
            )

            # Update the bin data and generate the fasta
            contamination, contigs_data = update_contigs_data_recursive(
                contigs_data,
                recursive_bins,
                assembly,
                recursive_fasta_dir,
                tmpdir_binning,
                size,
                contamination,
            )

            # Build the clustering matrix of the subnetwork and add it.
            if cluster_matrix:
                clustering_matrix += mtp.build_clustering_matrix(
                    recursive_core_bins, hamming_distance, N
                )

            # Put back the info log
            logger.setLevel(logging.INFO)

    # Save the clustering matrix
    if cluster_matrix:
        clustering_matrix_file = join(outdir, "clustering_matrix_recursive")
        sparse.save_npz(clustering_matrix_file, clustering_matrix)
    else:
        clustering_matrix_file = None

    return contamination, contigs_data, clustering_matrix_file
def give_results_info(bin_summary):
    """Function to report the general information about the binning results.

    Parameters:
    -----------
    bin_summary : dict
        Dictionary with the summary results of the kept bins.
    """
    # Define the categories of the bins
    HQ = 0  # Completeness >= 90 and Contamination <= 5
    total_size_HQ = 0
    MQ = 0  # Completeness >= 70 and Contamination <= 10
    total_size_MQ = 0
    LQ = 0  # Completeness >= 50 and Contamination <= 10
    total_size_LQ = 0
    conta_bins = 0  # Completeness >= 50 and Contamination > 10
    total_size_conta_bins = 0
    others = 0  # Undetermined bins.
    total_size_others = 0

    # Classify each bin in a category
    for bin_name in bin_summary:
        completeness = float(bin_summary[bin_name]["completness"])
        contamination = float(bin_summary[bin_name]["contamination"])
        size = int(bin_summary[bin_name]["size"])
        if completeness >= 50:
            if contamination > 10:
                conta_bins += 1
                total_size_conta_bins += size
            else:
                if completeness >= 90 and contamination <= 5:
                    HQ += 1
                    total_size_HQ += size
                elif completeness >= 70:
                    MQ += 1
                    total_size_MQ += size
                else:
                    LQ += 1
                    total_size_LQ += size
        else:
            others += 1
            total_size_others += size
    total = HQ + MQ + LQ + conta_bins + others
    total_size = (
        total_size_HQ
        + total_size_MQ
        + total_size_LQ
        + total_size_conta_bins
        + total_size_others
    )

    # Report the info in the logger:
    logger.info(
        "{0} bins have been kept after the recursive iterations.".format(total)
    )
    logger.info("Total size of the extracted bins: {0}".format(total_size))
    logger.info("HQ MAGs: {0}\tTotal Size: {1}".format(HQ, total_size_HQ))
    logger.info("MQ MAGs: {0}\tTotal Size: {1}".format(MQ, total_size_MQ))
    logger.info("LQ MAGs: {0}\tTotal Size: {1}".format(LQ, total_size_LQ))
    logger.info(
        "Contaminated potential MAGs: {0}\tTotal Size: {1}".format(
            conta_bins, total_size_conta_bins
        )
    )
    logger.info(
        "Other bins: {0}\tTotal Size: {1}".format(others, total_size_others)
    )
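# NOTE: hypothetical sketch, not part of the original module. The toy
# bin_summary below only illustrates the keys give_results_info() reads
# ("completness", "contamination", "size", as spelled in the CheckM parsing
# helpers); bin names and values are invented.
def _example_give_results_info():
    """Log the quality categories of two made-up bins (one HQ, one LQ)."""
    toy_summary = {
        "MetaTOR_1_0": {
            "completness": "95.2", "contamination": "1.3", "size": "2500000"
        },
        "MetaTOR_2_0": {
            "completness": "55.0", "contamination": "8.0", "size": "900000"
        },
    }
    give_results_info(toy_summary)  # -> reports 1 HQ MAG and 1 LQ MAG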
def precompute_network(
    alignment_files,
    contig_data,
    hit_data,
    out_file,
    tmp_dir,
    self_contacts=False,
):
    """Write a file with only the contig ids separated by a tabulation and
    count the contacts by contig to be able to compute the normalized network
    directly afterwards.

    Parameters:
    -----------
    alignment_files : list of str
        List of paths to the alignment file(s).
    contig_data : dict
        Dictionary of all the contigs from the assembly. The contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still at 0 and needs to
        be updated later.
    hit_data : dict
        Dictionary with the count of hits for each alignment file.
    out_file : str
        Path where to write the output file which will be necessary to compute
        the network.
    tmp_dir : str
        Path to the temporary directory where to write the per-sample files.
    self_contacts : bool
        If True, the contacts on the same contig will be kept. Otherwise only
        inter-contig contacts are reported. [Default: False]

    Returns:
    --------
    dict:
        Dictionary of all the contigs from the assembly. The contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage", "RS". Coverage is still at 0 and
        needs to be updated later.
    list of str:
        List of paths to the per-sample pre-network files.
    """
    # Initiate the values used to compute the 3D ratio
    all_contacts = 0
    inter_contacts = 0
    out_files_list = []

    # Prepare a file to save the contacts with their global ID
    with open(out_file, "w") as pre_net:
        # Iterate on the alignment files
        for i, alignment_file in enumerate(alignment_files):
            all_contacts_temp = 0
            inter_contacts_temp = 0
            out_file_sample = join(tmp_dir, "prenetwork" + str(i) + ".txt")
            out_files_list.append(out_file_sample)

            # Read the alignment file and build the pairs for the network
            with open(alignment_file, "r") as pairs, open(
                out_file_sample, "w"
            ) as pre_net_sample:
                for pair in pairs:
                    # Ignore header lines
                    if pair.startswith("#"):
                        continue
                    # Split the line on the tabulation
                    p = pair.split("\t")
                    # Extract the contig names which are at the positions 2
                    # and 4.
                    contig1, contig2 = p[1], p[3]
                    id1 = contig_data[contig1]["id"]
                    id2 = contig_data[contig2]["id"]
                    # Count the contact
                    all_contacts_temp += 1
                    contig_data[contig1]["hit"] += 1
                    contig_data[contig2]["hit"] += 1
                    if len(alignment_files) > 1:
                        hit_data[contig1]["hit"][i] += 1
                        hit_data[contig2]["hit"][i] += 1
                    # Write the file used for the computation of the network.
                    if self_contacts and id1 == id2:
                        pre_net.write(
                            "\t".join(map(str, [contig1, contig2])) + "\n"
                        )
                        pre_net_sample.write(
                            "\t".join(map(str, [contig1, contig2])) + "\n"
                        )
                    elif id1 < id2:
                        inter_contacts_temp += 1
                        pre_net.write(
                            "\t".join(map(str, [contig1, contig2])) + "\n"
                        )
                        pre_net_sample.write(
                            "\t".join(map(str, [contig1, contig2])) + "\n"
                        )
                    elif id1 > id2:
                        inter_contacts_temp += 1
                        pre_net.write(
                            "\t".join(map(str, [contig2, contig1])) + "\n"
                        )
                        pre_net_sample.write(
                            "\t".join(map(str, [contig2, contig1])) + "\n"
                        )

            # Count the contacts and report the sample information.
            all_contacts += all_contacts_temp
            inter_contacts += inter_contacts_temp
            logger.info(
                "Information of {0}:".format(basename(alignment_file))
            )
            logger.info(
                "{0} contacts in the library.".format(all_contacts_temp)
            )
            logger.info(
                "{0} contacts inter-contigs in the library.".format(
                    inter_contacts_temp
                )
            )
            logger.info(
                "3D ratio : {0}\n".format(
                    inter_contacts_temp / all_contacts_temp
                )
            )

    # Report global information about the network
    if len(alignment_files) > 1:
        logger.info("General information:")
        logger.info("{0} contacts in the library.".format(all_contacts))
        logger.info(
            "{0} contacts inter-contigs in the library.".format(
                inter_contacts
            )
        )
        logger.info("3D ratio : {0}\n".format(inter_contacts / all_contacts))

    return contig_data, out_files_list
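# NOTE: hypothetical sketch, not part of the original module. It shows the
# minimal pairs-like record precompute_network() relies on: a tab-separated
# line whose 2nd and 4th fields are the contig names. The read name, contig
# names and ids below are invented.
def _example_precompute_network_line():
    """Parse one toy pairs line the way precompute_network() does."""
    toy_contig_data = {
        "NODE_1": {"id": 1, "length": 1000, "GC": 42.0, "hit": 0, "coverage": 0},
        "NODE_2": {"id": 2, "length": 2000, "GC": 51.0, "hit": 0, "coverage": 0},
    }
    pair = "readA\tNODE_1\t154\tNODE_2\t873\t+\t-\n"
    p = pair.split("\t")
    contig1, contig2 = p[1], p[3]
    # ids 1 < 2, so the contact would be written as "NODE_1<TAB>NODE_2" in the
    # pre-network file.
    print(contig1, contig2,
          toy_contig_data[contig1]["id"] < toy_contig_data[contig2]["id"])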
def execute(self):
    # Define the temporary directory.
    if not self.args["--tmpdir"]:
        tmp_dir = mio.generate_temp_dir("./tmp")
    else:
        tmp_dir = self.args["--tmpdir"]
        os.makedirs(tmp_dir, exist_ok=True)

    # Define the output directory and output file names.
    if not self.args["--outdir"]:
        self.args["--outdir"] = "."
    os.makedirs(self.args["--outdir"], exist_ok=True)
    overlapping_fasta_dir = join(self.args["--outdir"], "overlapping_bin")
    if not exists(overlapping_fasta_dir):
        os.makedirs(overlapping_fasta_dir)
    else:
        if self.args["--force"]:
            shutil.rmtree(overlapping_fasta_dir)
            os.makedirs(overlapping_fasta_dir)
        else:
            logger.error(
                "%s already exists. Remove the directory or use the -F argument to overwrite it.",
                overlapping_fasta_dir,
            )
            raise ValueError

    # Enable file logging
    now = time.strftime("%Y%m%d%H%M%S")
    log_file = join(self.args["--outdir"], ("metator_" + now + ".log"))
    mtl.set_file_handler(log_file)

    # Define the variables
    min_qual = int(self.args["--min-quality"])
    iterations = int(self.args["--iterations"])
    recursive_iterations = int(self.args["--rec-iter"])
    overlapping_parameter = int(self.args["--overlap"]) / 100
    recursive_overlapping_parameter = int(self.args["--rec-overlap"]) / 100
    size = int(self.args["--size"])
    threads = int(self.args["--threads"])
    resolution_parameter = float(self.args["--res-param"])

    # Check that the algorithm value is correct
    if self.args["--algorithm"] not in ["louvain", "leiden"]:
        logger.error('algorithm should be either "louvain" or "leiden"')
        raise ValueError

    # Check that the normalization is in the list of possible normalizations.
    list_normalization = [
        "None",
        "abundance",
        "length",
        "RS",
        "empirical_hit",
        "theoritical_hit",
    ]
    if self.args["--normalization"] not in list_normalization:
        logger.error(
            'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
        )
        raise ValueError
    enzyme_required = ["RS", "theoritical_hit"]
    if (
        self.args["--normalization"] in enzyme_required
        and not self.args["--enzyme"]
    ):
        logger.error(
            'For "RS" and "theoritical_hit" normalization, enzyme is required.'
        )
        raise ValueError
    depth_required = ["abundance", "theoritical_hit"]
    if (
        self.args["--normalization"] in depth_required
        and not self.args["--depth"]
    ):
        logger.error(
            'For "abundance" and "theoritical_hit" normalization, depth is required.'
        )
        raise ValueError

    # Sanity checks for the validation step
    if not self.args["--skip-validation"]:
        recursive_fasta_dir = join(self.args["--outdir"], "recursive_bin")
        if not exists(recursive_fasta_dir):
            os.makedirs(recursive_fasta_dir)
        else:
            if self.args["--force"]:
                shutil.rmtree(recursive_fasta_dir)
                os.makedirs(recursive_fasta_dir)
            else:
                logger.error(
                    "%s already exists. Remove the directory or use the -F argument to overwrite it.",
                    recursive_fasta_dir,
                )
                raise ValueError
        final_fasta_dir = join(self.args["--outdir"], "final_bin")
        if not exists(final_fasta_dir):
            os.makedirs(final_fasta_dir)
        else:
            if self.args["--force"]:
                shutil.rmtree(final_fasta_dir)
                os.makedirs(final_fasta_dir)
            else:
                logger.error(
                    "%s already exists. Remove the directory or use the -F argument to overwrite it.",
                    final_fasta_dir,
                )
                raise ValueError

        # Check checkM availability
        if not mio.check_checkm():
            logger.error(
                "CheckM is not in the path. Could not make the iterations"
            )
            raise NameError

    # Manage the start point.
    if self.args["--start"] == "fastq":
        start = 1
    elif self.args["--start"] == "bam":
        start = 2
    elif self.args["--start"] == "pair":
        start = 3
    elif self.args["--start"] == "network":
        start = 4
    else:
        logger.error(
            "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
        )
        raise ValueError

    # Check that forward and reverse reads are given for the fastq and bam
    # starts.
    if (
        self.args["--start"] == "fastq"
        or (
            self.args["--start"] == "bam"
            and self.args["--aligner"] == "bowtie2"
        )
    ) and not self.args["--reverse"]:
        logger.error(
            "Forward and reverse arguments are necessary for the %s start with the %s aligner.",
            self.args["--start"],
            self.args["--aligner"],
        )
        raise ValueError

    # Print information about the workflow:
    if start == 1:
        logger.info("Minimum mapping quality: %d", min_qual)
    if start <= 2:
        logger.info("Enzyme: %s", self.args["--enzyme"])
        logger.info("Normalization: %s", self.args["--normalization"])
        logger.info("Aligner algorithm: %s", self.args["--aligner"])
    logger.info("Partition algorithm: %s", self.args["--algorithm"])
    logger.info("Partition iterations: %s", iterations)
    logger.info("Overlapping parameter: %s", overlapping_parameter)
    if not self.args["--skip-validation"]:
        logger.info(
            "Recursive partition iterations: %d", recursive_iterations
        )
        logger.info(
            "Recursive overlapping parameter: %s",
            recursive_overlapping_parameter,
        )

    # Extract the index and genome file
    assembly = self.args["--assembly"]
    # Check what the reference is. If a fasta is given, build the index. If a
    # bowtie2 index is given, retrieve the fasta.
    index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
    if index is None:
        if mio.check_is_fasta(assembly):
            fasta = assembly
            if start == 1:
                index = mio.generate_fasta_index(
                    fasta, self.args["--aligner"], tmp_dir
                )
        else:
            logger.error(
                "Please give as assembly argument a bowtie2 index or a fasta."
            )
            raise ValueError
    else:
        fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

    # Run the whole workflow
    if start <= 3:
        if start <= 2:
            # Align the paired-end reads
            alignment_files, contig_data, hit_data = mta.get_contact_pairs(
                self.args["--forward"],
                self.args["--reverse"],
                index,
                fasta,
                self.args["--aligner"],
                min_qual,
                self.args["--start"],
                self.args["--depth"],
                self.args["--enzyme"],
                self.args["--outdir"],
                tmp_dir,
                self.args["--threads"],
            )
        else:
            alignment_files = self.args["--forward"].split(",")
            nb_alignment = len(alignment_files)
            contig_data, hit_data = mtn.create_contig_data(
                fasta,
                nb_alignment,
                self.args["--depth"],
                self.args["--enzyme"],
            )

        # Build the network
        network_file, contigs_data_file = mtn.alignment_to_contacts(
            alignment_files,
            contig_data,
            hit_data,
            self.args["--outdir"],
            "network.txt",
            "contig_data_network.txt",
            tmp_dir,
            self.args["--threads"],
            self.args["--normalization"],
            False,
        )
    else:
        contigs_data_file = self.args["--contigs"]
        network_file = self.args["--network"]

    # Partition the network
    clustering_matrix_partition_file, contigs_data_file = mtp.partition(
        self.args["--algorithm"],
        fasta,
        self.args["--cluster-matrix"],
        contigs_data_file,
        iterations,
        network_file,
        self.args["--outdir"],
        overlapping_fasta_dir,
        overlapping_parameter,
        resolution_parameter,
        size,
        tmp_dir,
        threads,
    )

    # Remove contig_data_network.txt if it was not given as an input
    if start <= 2:
        contig_data_network_file = join(
            self.args["--outdir"], "contig_data_network.txt"
        )
        os.remove(contig_data_network_file)

    # Launch the validation if desired.
    if not self.args["--skip-validation"]:
        clustering_matrix_recursive_file = mtv.recursive_decontamination(
            self.args["--algorithm"],
            fasta,
            self.args["--cluster-matrix"],
            contigs_data_file,
            final_fasta_dir,
            overlapping_fasta_dir,
            recursive_iterations,
            network_file,
            self.args["--outdir"],
            recursive_overlapping_parameter,
            recursive_fasta_dir,
            resolution_parameter,
            size,
            tmp_dir,
            threads,
        )

        if self.args["--cluster-matrix"]:
            # Sum with the partition clustering matrix and save it.
            clustering_matrix = load_npz(
                clustering_matrix_partition_file + ".npz"
            )
            clustering_matrix_recursive = load_npz(
                clustering_matrix_recursive_file + ".npz"
            )
            clustering_matrix = (
                (clustering_matrix + clustering_matrix_recursive) / 2
            ).tocoo()
            clustering_matrix_file = join(
                self.args["--outdir"], "clustering_matrix"
            )
            save_npz(clustering_matrix_file, clustering_matrix)

        # Remove the contig_data_partition file
        contig_data_partition_file = join(
            self.args["--outdir"], "contig_data_partition.txt"
        )
        os.remove(contig_data_partition_file)

    # Delete the pyfastx index:
    os.remove(fasta + ".fxi")
    # Delete the temporary folder.
    if not self.args["--no-clean-up"]:
        shutil.rmtree(tmp_dir)
def defined_overlapping_bins(
    overlap, hamming_distance, core_bins_contigs, core_bins_iterations
):
    """This function extracts the overlapping bins.

    From the Hamming distances between the core bins, the function identifies
    the overlapping bins and creates a dictionary with the list of contig IDs
    for each overlapping bin. Two core bins are considered overlapping if they
    have a percentage of identity greater than or equal to the given
    threshold.

    Parameters:
    -----------
    overlap : float
        Hamming distance threshold used to consider that two bins are
        overlapping.
    hamming_distance : scipy.sparse.csr.csr_matrix
        Matrix with all the previously computed Hamming distances between two
        core bins.
    core_bins_contigs : dict
        Dictionary which has the core bin ids as keys and the list of ids of
        their contigs as values.
    core_bins_iterations : pandas.core.frame.DataFrame
        Table with the id of the core bins and their values for each
        iteration.

    Returns:
    --------
    dict:
        A dictionary with the id of the overlapping bins as keys and the list
        of ids of their contigs as values.
    """
    # Extract the bins which are connected, i.e. bins with a Hamming distance
    # value above the given threshold. The small epsilon is necessary because
    # Python floats are not exactly equal to the true value
    # (i.e. 0.1 -> 0.09999999999999998).
    connections = hamming_distance >= (overlap - 1e-10)
    overlapping_bins_id = sparse.csgraph.connected_components(
        connections, directed=False
    )[1]

    # Create a dictionary of the overlapping bins (ID from the previous file)
    # with the IDs of their contigs as values
    overlapping_bins = {}
    cc_id = 0
    # Iterate on each core bin.
    for oc_id in overlapping_bins_id:
        # Extract the contig IDs from the core bin.
        core_bin_contigs = core_bins_contigs[cc_id].copy()
        # Add the contig IDs to the overlapping bin.
        if oc_id + 1 not in overlapping_bins:
            overlapping_bins[oc_id + 1] = core_bin_contigs
        else:
            overlapping_bins[oc_id + 1] += core_bin_contigs
        cc_id += 1

    logger.info(
        "{0} overlapping bins were found.".format(len(overlapping_bins))
    )
    return overlapping_bins
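# NOTE: hedged sketch, not part of the original module. It builds a tiny
# made-up 3x3 agreement matrix between three core bins to show how the
# threshold plus connected components merge core bins into overlapping bins.
# core_bins_iterations is not read by the function body above, so None is
# passed here only for illustration.
def _example_defined_overlapping_bins():
    """Toy sketch: two core bins sharing >= 80% of the iterations are merged."""
    import numpy as np
    from scipy import sparse as sp_sparse

    # Pairwise agreement between 3 core bins (1.0 on the diagonal).
    agreement = sp_sparse.csr_matrix(
        np.array(
            [
                [1.0, 0.9, 0.0],
                [0.9, 1.0, 0.0],
                [0.0, 0.0, 1.0],
            ]
        )
    )
    toy_core_bins_contigs = {0: [1, 2], 1: [3], 2: [4, 5]}
    bins = defined_overlapping_bins(0.8, agreement, toy_core_bins_contigs, None)
    print(bins)  # {1: [1, 2, 3], 2: [4, 5]} -- core bins 0 and 1 merged.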
def partition(
    algorithm,
    assembly,
    cluster_matrix,
    contig_data_file,
    iterations,
    network_file,
    outdir,
    fasta_dir,
    overlapping_parameter,
    resolution_parameter,
    size,
    temp_directory,
    threads,
):
    """Function calling the other functions to partition the network.

    Parameters:
    -----------
    algorithm : str
        Algorithm to use to partition the network. Either "leiden" or
        "louvain".
    assembly : str
        Path to the assembly file used for the partition.
    cluster_matrix : bool
        If True, build and save the clustering matrix.
    contig_data_file : str
        Path to the contig data table to update.
    iterations : int
        Number of iterations to use for the partition.
    network_file : str
        Path to the network file.
    outdir : str
        Path to the output directory where to write the output files.
    fasta_dir : str
        Path to the directory where to write the fasta files.
    overlapping_parameter : int
        Hamming distance threshold to use to merge bins (percentage).
    resolution_parameter : float
        Resolution parameter to use if the Leiden algorithm is chosen. It will
        be a factor of the cost function used. A resolution parameter of 1 is
        equivalent to the modularity function used in Louvain. The higher the
        resolution parameter, the smaller the output bins.
    size : int
        Threshold size in base pairs of the output bins.
    temp_directory : str
        Path to the directory used to write temporary files.
    threads : int
        Number of threads to use.

    Returns:
    --------
    str:
        Path to the saved clustering matrix, or None if cluster_matrix is
        False.
    str:
        Path to the new contig data file with the bin information in it.
    """
    # Create the partition folders in the temporary directory
    temp_directory = join(temp_directory, "partition")
    os.makedirs(temp_directory, exist_ok=True)
    temp_directory_clustering = join(temp_directory, "clustering")
    os.makedirs(temp_directory_clustering, exist_ok=True)
    temp_directory_bins = join(temp_directory, "partition_bins")
    os.makedirs(temp_directory_bins, exist_ok=True)

    # Perform the iterations of Louvain or Leiden to partition the network.
    logger.info("Start iterations:")
    if algorithm == "leiden":
        LEIDEN_PATH = os.environ["LEIDEN_PATH"]
        output_partition = leiden_iterations_java(
            network_file,
            iterations,
            resolution_parameter,
            temp_directory_clustering,
            LEIDEN_PATH,
        )
    elif algorithm == "louvain":
        LOUVAIN_PATH = os.environ["LOUVAIN_PATH"]
        output_partition = louvain_iterations_cpp(
            network_file,
            iterations,
            temp_directory_clustering,
            LOUVAIN_PATH,
        )
    else:
        logger.error('algorithm should be either "louvain" or "leiden"')
        raise ValueError

    # Detect the core bins
    logger.info("Detect core bins:")
    (
        core_bins_contigs,
        core_bins_iterations,
    ) = detect_core_bins(output_partition, iterations)

    # Compute the Hamming distance between core bins.
    logger.info("Detect overlapping bins:")
    hamming_distance = get_hamming_distance(
        core_bins_iterations,
        iterations,
        threads,
    )

    # Define the overlapping bins according to the threshold
    overlapping_bins = defined_overlapping_bins(
        overlapping_parameter,
        hamming_distance,
        core_bins_contigs,
        core_bins_iterations,
    )

    # Update the contigs_data_file.
    logger.info("Extract bins:")
    contigs_data, contigs_data_file = update_contigs_data(
        contig_data_file,
        core_bins_contigs,
        overlapping_bins,
        outdir,
    )

    # Generate the fasta files of the bins
    generate_fasta(
        assembly,
        overlapping_bins,
        contigs_data,
        size,
        fasta_dir,
        temp_directory_bins,
    )

    if cluster_matrix:
        # Build the clustering matrix and save it.
        logger.info("Build clustering matrix")
        clustering_matrix = build_clustering_matrix(
            core_bins_contigs, hamming_distance, len(contigs_data.ID)
        )
        # Save the clustering matrix
        clustering_matrix_file = join(outdir, "clustering_matrix_partition")
        sparse.save_npz(clustering_matrix_file, clustering_matrix)
    else:
        clustering_matrix_file = None

    return clustering_matrix_file, contigs_data_file
def louvain_iterations_cpp(network_file, iterations, tmp_dir, louvain_path):
    """Use the original C++ implementation of Louvain to partition the
    network.

    Parameters:
    -----------
    network_file : str
        Path to the network computed previously. The file is a tab-separated
        table with three columns: the id of the first contig, the id of the
        second contig, and the weight of the edge (normalized or not).
    iterations : int
        Number of iterations of the Louvain algorithm.
    tmp_dir : str
        Path to the temporary directory.
    louvain_path : str
        Path to the directory with the Louvain binaries.

    Returns:
    --------
    dict:
        Dictionary with the id of the contig as key and the results of each
        iteration, separated by semicolons, as value.
    """
    # Check if the Louvain C++ binaries are available on the computer. If they
    # are not available, raise an error.
    if not mio.check_louvain_cpp(louvain_path):
        logger.error("Louvain implementation was not found.")
        logger.error(
            "You should have a LOUVAIN_PATH variable in your environment"
        )
        raise NameError

    # Define the temporary files and arguments for the louvain calls and the
    # paths of the binaries to call.
    network_bin = join(tmp_dir, "net_bin")
    network_weight = join(tmp_dir, "net_weight")
    network_tree = join(tmp_dir, "net_tree")
    network_labels = join(tmp_dir, "labels.txt")
    level_louvain = join(tmp_dir, "level.txt")
    output = join(tmp_dir, "output_louvain_")
    louvain = join(louvain_path, "louvain")
    convert = join(louvain_path, "convert")
    hierarchy = join(louvain_path, "hierarchy")
    output_louvain = dict()

    # Create a dictionary of all the arguments
    louvain_args = {
        "net_txt": network_file,
        "net_bin": network_bin,
        "net_weight": network_weight,
        "net_tree": network_tree,
        "net_labels": network_labels,
        "level_file": level_louvain,
        "output": output,
        "level": 0,
        "iteration": 0,
        "convert": convert,
        "louvain": louvain,
        "hierarchy": hierarchy,
    }

    # Convert the file into a binary file for the Louvain partitioning.
    cmd = (
        "{convert} -i {net_txt} -o {net_bin} -r {net_labels} -w {net_weight}"
    ).format(**louvain_args)
    process = sp.Popen(cmd, shell=True)
    out, err = process.communicate()

    # Create a dictionary of the Louvain labels and the original contig ids.
    labels = dict()
    with open(louvain_args["net_labels"]) as label_file:
        for label in label_file:
            label = label.split()
            labels[label[1]] = int(label[0])

    # Run the iterations of Louvain
    for i in range(iterations):
        logger.info("Iteration in progress: {0}".format(i))
        louvain_args["iteration"] = i

        # Partition with weights using louvain and compute the bin tree.
        cmd = (
            "{louvain} {net_bin} -l -1 -w {net_weight} > {net_tree}"
        ).format(**louvain_args)
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()
        cmd = ("{hierarchy} {net_tree} > {level_file}").format(**louvain_args)
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()
        level_file = open(level_louvain, "r")
        louvain_args["level"] = level_file.readlines()[-1][6]
        level_file.close()
        cmd = (
            "{hierarchy} {net_tree} -l {level} > {output}{iteration}.txt"
        ).format(**louvain_args)
        process = sp.Popen(cmd, shell=True)
        out, err = process.communicate()

        # Save the results in a dictionary
        if i == 0:
            with open(output + str(i) + ".txt", "r") as out:
                for line in out:
                    result = line.split(" ")
                    output_louvain[labels[result[0]]] = result[1][:-1]
        else:
            with open(output + str(i) + ".txt", "r") as out:
                for line in out:
                    result = line.split(" ")
                    output_louvain[labels[result[0]]] += ";" + result[1][:-1]
    return output_louvain
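# NOTE: hedged sketch, not part of the original module. It only illustrates
# the directory layout louvain_iterations_cpp() expects behind LOUVAIN_PATH:
# the three binaries of the C++ Louvain package ("convert", "louvain",
# "hierarchy") sitting in a single directory. The fallback path below is
# invented.
def _example_louvain_path_layout():
    """Show how the LOUVAIN_PATH environment variable is consumed."""
    import os
    from os.path import join as path_join

    louvain_path = os.environ.get("LOUVAIN_PATH", "/opt/louvain")  # hypothetical
    for binary in ("convert", "louvain", "hierarchy"):
        # These are the paths used to build the shell commands above.
        print(path_join(louvain_path, binary))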
def generate_fasta(
    assembly, overlapping_bins, contigs_data, size, output_dir, tmpdir
):
    """Generate the fasta file of each bin from the assembly.

    Parameters:
    -----------
    assembly : str
        Path to the fasta file of the original assembly.
    overlapping_bins : dict
        A dictionary with the id of the overlapping bins as keys and the list
        of ids of their contigs as values.
    contigs_data : pandas.core.frame.DataFrame
        Table with all the information on the contigs, including their
        assignment to the bins.
    size : int
        Threshold size chosen to write the bins.
    output_dir : str
        Path to the output directory where the fasta of all the bins will be
        written.
    tmpdir : str
        Path to the temporary directory where to write the temporary contig
        list files.
    """
    nb_bins = 0
    length_bins = 0
    # For each bin, create a list of its contigs and extract them from the
    # assembly to create a new fasta file with only the bin.
    for bin_id in overlapping_bins:
        # Extract the list of the contigs from the contigs data file.
        list_contigs_id = overlapping_bins[bin_id]
        list_contigs_name = []
        # Test if the bin is bigger than the size threshold given.
        length_bin = contigs_data.loc[
            list_contigs_id[0] - 1, "Overlapping_bin_size"
        ]
        if length_bin >= size:
            nb_bins += 1
            length_bins += length_bin
            for contig_id in list_contigs_id:
                list_contigs_name.append(
                    contigs_data.loc[contig_id - 1, "Name"]
                )
            # Define the output file.
            output_file = join(output_dir, "MetaTOR_{0}_0.fa".format(bin_id))
            # Create the fasta file.
            contigs_file = join(tmpdir, "MetaTOR_{0}_0.txt".format(bin_id))
            with open(contigs_file, "w") as f:
                for contig_name in list_contigs_name:
                    f.write("%s\n" % contig_name)
            cmd = "pyfastx extract {0} -l {1} > {2}".format(
                assembly, contigs_file, output_file
            )
            process = sp.Popen(cmd, shell=True)
            process.communicate()
    logger.info("{0} bins have been extracted".format(nb_bins))
    logger.info(
        "Total size of the extracted bins: {0}Mb".format(
            round(length_bins / 10 ** 6, 3)
        )
    )
def get_contact_pairs(
    for_in,
    rev_in,
    index,
    assembly,
    aligner,
    min_qual,
    start,
    depth_file,
    enzyme,
    out_dir,
    tmp_dir,
    n_cpu,
):
    """General function to do the whole alignment of both fastq.

    The function writes, in the output directory given as an argument, a tsv
    file of the aligned reads with 9 columns: ReadID, ContigA,
    Position_startA, Position_endA, StrandA, ContigB, Position_startB,
    Position_endB, StrandB. The name of the file will be alignment.txt. Two
    start stages are possible, from fastq or bam files.

    Parameters:
    -----------
    for_in : str
        Path to the input forward fastq or bam file to align. If multiple
        files are given, the paths are separated by a comma.
    rev_in : str
        Path to the input reverse fastq or bam file to align. If multiple
        files are given, the paths are separated by a comma.
    index : str
        Path to the bowtie2 index of the assembly.
    assembly : str
        The initial assembly path acting as the alignment file's reference
        assembly.
    aligner : str
        Either 'bowtie2' or 'bwa', the aligner used or to be used to map the
        reads.
    min_qual : int
        Minimum mapping quality required to keep Hi-C pairs.
    start : str
        Either "fastq" or "bam". Starting point for the pipeline.
    depth_file : str or None
        Path to the depth.txt file from jgi_summarize_bam_contig_depths from
        the Metabat2 software.
    enzyme : str or None
        String that contains the names of the enzymes separated by a comma.
    out_dir : str
        Path to the directory where to write the output file.
    tmp_dir : str
        Path where temporary files should be written.
    n_cpu : int
        The number of CPUs to use for the alignment.

    Returns:
    --------
    list of str:
        List of paths of the files with the table containing the alignment
        data of the pairs: ReadID, ContigA, Position_startA, Position_endA,
        StrandA, ContigB, Position_startB, Position_endB, StrandB.
    dict:
        Dictionary of all the contigs from the assembly. The contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still at 0 and needs to
        be updated later.
    dict:
        Dictionary of the hit information on each contig.
    """
    # Iterate on all the input files:
    for_list = for_in.split(",")
    rev_list = rev_in.split(",")
    out_file_list = []
    total_aligned_pairs = 0

    # Create the contig data dictionary and the hits from each alignment
    nb_alignment = len(for_list)
    contig_data, hit_data = mtn.create_contig_data(
        assembly, nb_alignment, depth_file, enzyme
    )

    for i in range(len(for_list)):
        for_in = for_list[i]
        try:
            rev_in = rev_list[i]
        except IndexError:
            rev_in = None
        name = "alignment_" + str(i)
        out_file = join(out_dir, "alignment_" + str(i) + ".pairs")
        out_file_list.append(out_file)

        # Align if necessary
        if start == "fastq":
            if aligner == "bowtie2":
                # Create files to save the alignments.
                alignment_for = join(out_dir, name + "_for.bam")
                alignment_rev = join(out_dir, name + "_rev.bam")

                # Align the forward reads
                logger.info("Alignment of %s:", for_in)
                align(for_in, index, aligner, alignment_for, n_cpu)

                # Align the reverse reads
                logger.info("Alignment of %s:", rev_in)
                align(rev_in, index, aligner, alignment_rev, n_cpu)
            elif aligner == "bwa":
                # Create a file to save the alignment.
                alignment = join(out_dir, name + ".bam")
                logger.info("Alignment of %s and %s:", for_in, rev_in)
                align(for_in, index, aligner, alignment, n_cpu, rev_in)

        elif start == "bam":
            if aligner == "bowtie2":
                logger.info("Processing %s and %s:", for_in, rev_in)
                alignment_for = for_in
                alignment_rev = rev_in
            elif aligner == "bwa":
                alignment = for_in

        else:
            logger.error("Start argument should be either 'fastq' or 'bam'.")
            raise ValueError

        if aligner == "bowtie2":
            # Create files to save the filtered alignments.
            alignment_temp_for = join(tmp_dir, name + "_for_temp.txt")
            alignment_temp_rev = join(tmp_dir, name + "_rev_temp.txt")

            # Filter the aligned and non-aligned reads from the forward and
            # reverse bam files.
            aligned_reads_for = process_bamfile(
                alignment_for, min_qual, alignment_temp_for
            )
            aligned_reads_rev = process_bamfile(
                alignment_rev, min_qual, alignment_temp_rev
            )
            logger.info(
                "%s forward reads aligned and %s reverse reads aligned",
                aligned_reads_for,
                aligned_reads_rev,
            )

            # Merge the alignments to create a pairs file
            logger.info("Merging the pairs:")
            n_pairs = merge_alignment(
                alignment_temp_for, alignment_temp_rev, contig_data, out_file
            )
            logger.info("%s pairs aligned.", n_pairs)
            total_aligned_pairs += n_pairs

        # Case where a bam file from bwa is given as input.
        if aligner == "bwa":
            n_pairs = process_bwa_bamfile(
                alignment, min_qual, contig_data, out_file
            )
            logger.info("%s pairs aligned.", n_pairs)
            total_aligned_pairs += n_pairs

    if len(out_file_list) > 1:
        logger.info("TOTAL PAIRS MAPPED: %s", total_aligned_pairs)

    return out_file_list, contig_data, hit_data
def alignment_to_contacts(
    alignment_files,
    contig_data,
    hit_data,
    out_dir,
    output_file_network,
    output_file_contig_data,
    tmp_dir,
    n_cpus,
    normalization,
    self_contacts,
):
    """Generates a network file (in edgelist form) from an alignment.

    Contigs are the network nodes and the edges are the contact counts. The
    network is in a strict barebone form so that it can be reused and imported
    quickly into other applications. Verbose information about every single
    node in the network is written in a 'contig data' file.

    Parameters:
    -----------
    alignment_files : list of str
        List of paths to the alignment file(s) used as input.
    contig_data : dict
        Dictionary of all the contigs from the assembly. The contig names are
        the keys to the contig data, available with the following keys: "id",
        "length", "GC", "hit", "coverage". Coverage is still at 0 and needs to
        be updated later.
    hit_data : dict
        Dictionary of the hit information on each contig.
    out_dir : str
        The output directory to write the network and chunk data into.
    output_file_network : str, optional
        The specific file name for the output network file. Default is
        'network.txt'.
    output_file_contig_data : str, optional
        The specific file name for the output chunk data file. Default is
        'idx_contig_length_GC_hit_cov.txt'.
    tmp_dir : str
        Path to the temporary directory. Default in the working directory.
    n_cpus : int
        Number of CPUs to use.
    normalization : str
        If "None", do not normalize the count of a contact by the geometric
        mean of the coverage of the contigs. Otherwise it's the type of
        normalization to apply.
    self_contacts : bool
        Whether to return a network with self contacts. Default is False.

    Returns:
    --------
    str:
        Path to the network file.
    str:
        Path to the verbose contig data file.
    """
    # Create the temporary and output files which will be necessary
    precompute_network_file = join(tmp_dir, "precompute_network_file.txt")
    pre_network_sorted_file = join(tmp_dir, "tmp_network_sorted.txt")
    network_file = join(out_dir, output_file_network)
    contig_data_file = join(out_dir, output_file_contig_data)
    hit_data_file = join(out_dir, "hit_data_alignment.txt")
    nb_alignment = len(alignment_files)

    logger.info("New time course network")

    # Create a contact file easily readable for counting the contacts.
    contig_data, out_files_list = precompute_network(
        alignment_files,
        contig_data,
        hit_data,
        precompute_network_file,
        tmp_dir,
        self_contacts,
    )

    # Compute the network
    compute_network(
        precompute_network_file,
        network_file,
        contig_data,
        tmp_dir,
        pre_network_sorted_file,
        n_cpus,
        normalization,
    )

    # Compute the per-sample networks
    for i, precompute_network_file_sample in enumerate(out_files_list):
        network_file_sample = join(out_dir, "network_{0}.txt".format(i))
        pre_network_sorted_file = join(
            tmp_dir, "tmp_network_sorted_{0}.txt".format(i)
        )
        compute_network(
            precompute_network_file_sample,
            network_file_sample,
            contig_data,
            tmp_dir,
            pre_network_sorted_file,
            n_cpus,
            normalization,
        )

    # Write the data from the contigs
    write_contig_data(contig_data, contig_data_file)
    if nb_alignment > 1:
        write_hit_data(hit_data, hit_data_file)

    return network_file, contig_data_file
def execute(self):
    # Define the temporary directory.
    if not self.args["--tmpdir"]:
        tmp_dir = mio.generate_temp_dir("./tmp")
    else:
        tmp_dir = self.args["--tmpdir"]
        os.makedirs(tmp_dir, exist_ok=True)

    # Define the output directory and output file names.
    if not self.args["--outdir"]:
        self.args["--outdir"] = "."
    os.makedirs(self.args["--outdir"], exist_ok=True)

    # Enable file logging
    now = time.strftime("%Y%m%d%H%M%S")
    log_file = join(
        self.args["--outdir"], ("metator_network_" + now + ".log")
    )
    mtl.set_file_handler(log_file)

    # Transform integer variables into integers.
    min_qual = int(self.args["--min-quality"])

    # Define the boolean variables:
    self_contacts = self.args["--self-contacts"]

    # Check if the forward and reverse arguments are given:
    if (
        self.args["--start"] == "fastq"
        or (
            self.args["--start"] == "bam"
            and self.args["--aligner"] == "bowtie2"
        )
    ) and not self.args["--reverse"]:
        logger.error(
            "Forward and reverse arguments are necessary for the %s start with the %s aligner.",
            self.args["--start"],
            self.args["--aligner"],
        )
        raise ValueError

    # Check that the normalization is in the list of possible normalizations.
    list_normalization = [
        "None",
        "abundance",
        "length",
        "RS",
        "empirical_hit",
        "theoritical_hit",
    ]
    if self.args["--normalization"] not in list_normalization:
        logger.error(
            'Normalization should be among this list: "None", "abundance", "length", "RS", "empirical_hit", "theoritical_hit"'
        )
        raise ValueError
    enzyme_required = ["RS", "theoritical_hit"]
    if (
        self.args["--normalization"] in enzyme_required
        and not self.args["--enzyme"]
    ):
        logger.error(
            'For "RS" and "theoritical_hit" normalization, enzyme is required.'
        )
        raise ValueError
    depth_required = ["abundance", "theoritical_hit"]
    if (
        self.args["--normalization"] in depth_required
        and not self.args["--depth"]
    ):
        logger.error(
            'For "abundance" and "theoritical_hit" normalization, depth is required.'
        )
        raise ValueError
    if self.args["--start"] not in ["fastq", "bam", "pair", "network"]:
        logger.error(
            "Start argument should be 'fastq', 'bam', 'pair' or 'network'."
        )
        raise ValueError

    # Extract the index and genome file
    assembly = self.args["--assembly"]
    # Check what the reference is. If a fasta is given, build the index. If a
    # bowtie2 index is given, retrieve the fasta.
    index = mio.check_fasta_index(assembly, mode=self.args["--aligner"])
    if index is None:
        if mio.check_is_fasta(assembly):
            fasta = assembly
            # If starting from bam, the index generation can be skipped.
            if self.args["--start"] == "fastq":
                index = mio.generate_fasta_index(
                    fasta, self.args["--aligner"], tmp_dir
                )
        else:
            logger.error(
                "Please give as assembly argument a %s index or a fasta.",
                self.args["--aligner"],
            )
            raise ValueError
    else:
        fasta = mio.retrieve_fasta(index, self.args["--aligner"], tmp_dir)

    # Print information about the workflow:
    logger.info("Aligner algorithm: %s", self.args["--aligner"])
    logger.info("Enzyme: %s", self.args["--enzyme"])
    logger.info("Normalization: %s", self.args["--normalization"])
    logger.info("Minimum mapping quality: %s", self.args["--min-quality"])

    # Do not align if the pipeline starts from the pairs
    if self.args["--start"] == "pair":
        alignment_files = self.args["--forward"].split(",")
        nb_alignment = len(alignment_files)
        contig_data, hit_data = mtn.create_contig_data(
            fasta,
            nb_alignment,
            self.args["--depth"],
            self.args["--enzyme"],
        )
    else:
        # Align the paired-end reads
        alignment_files, contig_data, hit_data = mta.get_contact_pairs(
            self.args["--forward"],
            self.args["--reverse"],
            index,
            fasta,
            self.args["--aligner"],
            min_qual,
            self.args["--start"],
            self.args["--depth"],
            self.args["--enzyme"],
            self.args["--outdir"],
            tmp_dir,
            self.args["--threads"],
        )

    # Build the network
    mtn.alignment_to_contacts(
        alignment_files,
        contig_data,
        hit_data,
        self.args["--outdir"],
        "network.txt",
        "contig_data_network.txt",
        tmp_dir,
        self.args["--threads"],
        self.args["--normalization"],
        self_contacts,
    )

    # Delete the temporary folder
    if not self.args["--no-clean-up"]:
        shutil.rmtree(tmp_dir)