def update_sanity_check_and_parse(params):
    # Provide sanity checks for outputs (not specific to a test) and return loaded data
    if not check_files(params["output_db_prefix"], ["ibf", "map", "tax", "gnn"]):
        return None

    res = {}
    # Sequence information from database to be updated
    if not params["update_complete"]:
        res["seq_info"] = parse_seq_info(params["db_prefix"] + ".seqinfo.txt")
    else:
        # Do not load it in case of update_complete, where all sequences must be provided
        res["seq_info"] = pd.DataFrame()

    # Parse in and out files
    if "seq_info_file" in params and params["seq_info_file"]:
        res["seq_info"] = res["seq_info"].append(
            parse_seq_info(params["seq_info_file"]), ignore_index=True)
    else:
        res["seq_info"] = res["seq_info"].append(
            parse_seq_info(params["output_db_prefix"] + ".seqinfo.txt"),
            ignore_index=True)

    res["gnn"] = Gnn(file=params["output_db_prefix"] + ".gnn")
    res["tax_pd"] = parse_tax(params["output_db_prefix"] + ".tax")
    res["map_pd"] = parse_map(params["output_db_prefix"] + ".map")
    res["bins_pd"] = parse_bins(Bins(taxsbp_ret=res["gnn"].bins))

    # Check number of bins
    if res["map_pd"].binid.unique().size != res["gnn"].number_of_bins:
        print("Number of bins do not match between .gnn and .map")
        return None

    # Check if all input accessions made it to the bins
    if not res["seq_info"]["seqid"].isin(res["bins_pd"]["seqid"]).all():
        print("Missing sequence accessions on bins")
        return None

    # Check if all taxids/assembly on .map appear on .tax
    if not res["tax_pd"]["taxid"].isin(res["map_pd"]["target"].drop_duplicates()).all():
        print("Inconsistent entries between taxonomy (.tax) and bin map (.map)")
        return None

    return res
def build_sanity_check_and_parse(params):
    # Provide sanity checks for outputs (not specific to a test) and return loaded data
    if not check_files(params["db_prefix"], ["ibf", "map", "tax", "gnn"]):
        return None

    res = {}
    # Parse in and out files
    if "seq_info_file" in params and params["seq_info_file"]:
        res["seq_info"] = parse_seq_info(params["seq_info_file"])
    else:
        res["seq_info"] = parse_seq_info(params["db_prefix"] + ".seqinfo.txt")

    res["gnn"] = Gnn(file=params["db_prefix"] + ".gnn")
    res["tax_pd"] = parse_tax(params["db_prefix"] + ".tax")
    res["map_pd"] = parse_map(params["db_prefix"] + ".map")
    res["bins_pd"] = parse_bins(
        Bins(taxsbp_ret=res["gnn"].bins,
             use_specialization=True if res["gnn"].specialization else False))

    # Check number of bins
    if res["map_pd"].binid.unique().size != res["gnn"].number_of_bins:
        print("Number of bins do not match between .gnn and .map")
        return None

    # Check if all input accessions made it to the bins
    if not res["seq_info"]["seqid"].isin(res["bins_pd"]["seqid"]).all():
        print("Missing sequence accessions on bins")
        return None

    # Check if all taxids/assembly on .map appear on .tax
    if not res["tax_pd"]["taxid"].isin(res["map_pd"]["target"].drop_duplicates()).all():
        print("Inconsistent entries between taxonomy (.tax) and bin map (.map)")
        return None

    return res
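# Illustrative only: a minimal, self-contained sketch of the .isin()/.all()
# consistency checks used by the two sanity-check helpers above, on synthetic
# frames. Column names mirror the parsed files; all values are made up.
def _demo_consistency_checks():  # hypothetical helper, not part of the test suite
    import pandas as pd
    seq_info = pd.DataFrame({"seqid": ["A1", "B2"]})
    bins_pd = pd.DataFrame({"seqid": ["A1", "B2", "B2"]})  # fragments may repeat
    tax_pd = pd.DataFrame({"taxid": ["562", "1280"]})
    map_pd = pd.DataFrame({"target": ["562", "562", "1280"]})
    # every input accession made it into a bin
    assert seq_info["seqid"].isin(bins_pd["seqid"]).all()
    # taxonomy (.tax) and bin map (.map) entries are consistent
    assert tax_pd["taxid"].isin(map_pd["target"].drop_duplicates()).all()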
def run_taxsbp(seqinfo, bin_length, fragment_length, overlap_length, rank,
               specialization, ncbi_nodes_file, ncbi_merged_file, verbose,
               bins: Bins = None):
    taxsbp_params = {}
    taxsbp_params["input_table"] = seqinfo.seqinfo
    if bins is not None:
        taxsbp_params["update_table"] = bins.bins
    taxsbp_params["nodes_file"] = ncbi_nodes_file
    if ncbi_merged_file:
        taxsbp_params["merged_file"] = ncbi_merged_file
    taxsbp_params["bin_len"] = bin_length
    if fragment_length:
        taxsbp_params["fragment_len"] = fragment_length
        taxsbp_params["overlap_len"] = overlap_length
    if specialization:
        taxsbp_params["specialization"] = specialization
        taxsbp_params["bin_exclusive"] = specialization
    else:
        # either species, genus, ... or "leaves"
        taxsbp_params["bin_exclusive"] = rank
    # if verbose: taxsbp_params["silent"] = False
    return Bins(taxsbp_ret=taxsbp.taxsbp.pack(**taxsbp_params),
                use_specialization=True if specialization else False)
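# Illustrative only (hypothetical paths and values): run_taxsbp is typically
# called with the clustering parameters stored in the .gnn file, e.g.:
#
#   bins = run_taxsbp(seqinfo,
#                     bin_length=1000000, fragment_length=0, overlap_length=0,
#                     rank="species", specialization=None,
#                     ncbi_nodes_file="taxdump/nodes.dmp",
#                     ncbi_merged_file="taxdump/merged.dmp",
#                     verbose=False)
#
# Passing bins= (as update() does below) makes TaxSBP extend the existing
# clustering instead of packing from scratch.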
def update(cfg):
    tx = time.time()

    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found", cfg.quiet)
        return False

    # Set db prefixes
    db_prefix = {prefix: cfg.db_prefix + "." + prefix
                 for prefix in ["ibf", "map", "tax", "gnn"]}

    # Set temporary working folder (current or new output)
    tmp_output_folder = cfg.output_db_prefix + "_tmp/" if cfg.output_db_prefix else cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder):
        return False

    # Load .gnn file
    gnn = Gnn(file=db_prefix["gnn"])

    # If specialization was set on database
    if gnn.specialization:
        # if not provided by user, use definition of database
        if not cfg.specialization:
            cfg.specialization = gnn.specialization
            print_log("Using --specialization " + cfg.specialization, cfg.quiet)
    else:
        if cfg.specialization:
            # If user defined specialization on update but database has none
            print_log("ERROR: not possible to update a database with --specialization if it was built without it", cfg.quiet)
            return False

    # load bins
    bins = Bins(taxsbp_ret=gnn.bins,
                use_specialization=True if cfg.specialization else False)

    # load seqinfo (file or seqids)
    seqinfo = load_seqinfo(cfg, input_files)

    # check sequences compared to bins
    added_seqids, removed_seqids, kept_seqids = check_updated_seqids(
        set(seqinfo.get_seqids()), set(bins.get_seqids()))
    # Ignore removed sequences if not doing complete update
    if cfg.update_complete:
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, removing " + str(len(removed_seqids)) +
                  " sequences, keeping " + str(len(kept_seqids)) + " sequences",
                  cfg.quiet)
    else:
        removed_seqids = []
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, ignoring " + str(len(kept_seqids)) +
                  " repeated sequences", cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids and not removed_seqids:
        print_log("Nothing to update", cfg.quiet)
        rm_tmp_folder(tmp_output_folder)
        return False

    if cfg.update_complete:
        # Remove already included seqids to just retrieve information for added sequences
        seqinfo.remove_seqids(kept_seqids | removed_seqids)
    else:
        # Remove seqids already present in the current version (repeated entries)
        seqinfo.remove_seqids(kept_seqids)

    # retrieve sequence information (after removing invalid seqids)
    if not cfg.seq_info_file:
        retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg)

    # Convert cols data types
    if cfg.specialization:
        replaced_spec = seqinfo.validate_specialization()
        if replaced_spec:
            print_log(str(replaced_spec) + " invalid specialization entries (sequence accession used instead)\n", cfg.quiet)

    if not cfg.seq_info_file and cfg.write_seq_info_file:
        seqinfo.write(cfg.output_db_prefix + ".seqinfo.txt")

    # save set of current binids
    previous_binids = set(bins.get_binids())
    # remove seqids from bins if performing update complete
    if cfg.update_complete and removed_seqids:
        bins.remove_seqids(removed_seqids)
    # save set of kept binids after removal
    kept_binids = set(bins.get_binids())

    # Set up taxonomy files
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    updated_bins = run_taxsbp(seqinfo, gnn.bin_length, gnn.fragment_length,
                              gnn.overlap_length, gnn.rank, cfg.specialization,
                              ncbi_nodes_file, ncbi_merged_file, cfg.verbose,
                              bins=bins)
    # bin statistics
    taxsbp_binids = set(updated_bins.get_binids())
    removed_binids = previous_binids.difference(kept_binids | taxsbp_binids)
    new_binids = taxsbp_binids.difference(previous_binids)
    updated_binids = taxsbp_binids.intersection(previous_binids)
    print_log(" - " + str(len(new_binids)) + " bins added, " +
              str(len(updated_binids)) + " bins updated, " +
              str(len(removed_binids)) + " bins removed", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating database files", cfg.quiet)
    # load new taxonomy
    print_log(" - " + (cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"]), cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    # Update and write .tax file
    # filter only used taxids
    tax.filter(updated_bins.get_taxids())
    # add specialization nodes
    if cfg.specialization:
        tax.add_nodes(updated_bins.get_specialization_taxid(), cfg.specialization)
    # Load old .tax file into new taxonomy
    tax.merge(Tax([db_prefix["tax"]]))
    # Write .tax file
    tax.write(cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"])
    # TODO - remove entries from .tax from removed entries of the db

    # merge updated and old bins together
    bins.merge(updated_bins)

    # Write .gnn file
    print_log(" - " + (cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"]), cfg.quiet)
    gnn.bins = bins.get_list()  # save updated bins
    gnn.number_of_bins = bins.get_number_of_bins()  # add new bins count
    # set new specialization to gnn
    gnn.specialization = cfg.specialization
    gnn.write(cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"])

    # Recreate .map file based on the new bins
    print_log(" - " + (cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"]), cfg.quiet)
    bins.write_map_file(cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"],
                        use_specialization=True if cfg.specialization else False)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating index (ganon-build)", cfg.quiet)
    # Write aux. file for ganon
    # This file has to contain all new sequences
    # in case of update_complete
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    if cfg.update_complete:
        # all sequences from the bins with added/removed sequences should be written
        bins.write_acc_bin_file(acc_bin_file, new_binids | updated_binids)
        # If all sequences of a bin were removed and no new sequence added,
        # insert a dummy entry for ganon-build to clear the bin
        if removed_binids:
            with open(acc_bin_file, "a") as abf:
                for b in removed_binids:
                    print(0, 0, 0, b, sep="\t", file=abf)
    else:
        # Only new sequences (updated_bins) either on old or new binids
        updated_bins.write_acc_bin_file(acc_bin_file)

    # Update with same values used for build
    kmer_size = gnn.kmer_size
    window_size = gnn.window_size
    hash_functions = gnn.hash_functions

    # Free memory for build
    del seqinfo
    del bins
    del updated_bins
    del tax
    del gnn

    # Temporary output filter
    tmp_db_prefix_ibf = tmp_output_folder + "ganon.ibf"
    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--update-filter-file " + db_prefix["ibf"],
        "--kmer-size " + str(kmer_size),
        "--window-size " + str(window_size) if window_size else "",
        "--count-hashes " if window_size else "",
        "--hash-functions " + str(hash_functions),
        "--seqid-bin-file " + acc_bin_file,
        "--output-filter-file " + tmp_db_prefix_ibf,
        "--threads " + str(cfg.threads),
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join(input_files) if input_files and not cfg.input_directory else "",
        "--directory-reference-files " + cfg.input_directory if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else "",
        "--update-complete" if cfg.update_complete else ""
    ])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # move IBF to final location
    shutil.move(tmp_db_prefix_ibf,
                cfg.output_db_prefix + ".ibf" if cfg.output_db_prefix else db_prefix["ibf"])

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
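# Illustrative only: the dummy entries appended above produce tab-separated
# lines "0\t0\t0\t<binid>" in acc_bin.txt, which signal ganon-build to clear
# bins whose sequences were all removed. A minimal reproduction:
def _demo_dummy_entries():  # hypothetical helper, not called anywhere
    import io
    abf = io.StringIO()
    for b in [5, 12]:  # made-up bin ids
        print(0, 0, 0, b, sep="\t", file=abf)
    assert abf.getvalue() == "0\t0\t0\t5\n0\t0\t0\t12\n"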
def update(cfg):
    tx = time.time()

    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found")
        return False

    # Set db prefixes
    db_prefix = {prefix: cfg.db_prefix + "." + prefix
                 for prefix in ["ibf", "map", "tax", "gnn"]}

    # Set temporary working folder (current or new output)
    tmp_output_folder = cfg.output_db_prefix + "_tmp/" if cfg.output_db_prefix else cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder):
        return False

    # Load .gnn file
    gnn = Gnn(file=db_prefix["gnn"])

    # Set assembly mode
    use_assembly = True if gnn.rank == "assembly" else False

    # load bins
    bins = Bins(taxsbp_ret=gnn.bins)

    # Load seqids and generate seqinfo
    if cfg.seq_info_file:
        seqinfo = load_seqids(seq_info_file=cfg.seq_info_file, quiet=cfg.quiet)
    else:
        seqinfo = load_seqids(files=input_files, quiet=cfg.quiet)

    # check sequences compared to bins
    added_seqids, removed_seqids, kept_seqids = check_updated_seqids(
        set(seqinfo.get_seqids()), set(bins.get_seqids()))
    # Ignore removed sequences if not doing complete update
    if cfg.update_complete:
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, removing " + str(len(removed_seqids)) +
                  " sequences, keeping " + str(len(kept_seqids)) + " sequences",
                  cfg.quiet)
    else:
        removed_seqids = []
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, ignoring " + str(len(kept_seqids)) +
                  " repeated sequences", cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids and not removed_seqids:
        print_log("ERROR: Nothing to update")
        return False

    if cfg.update_complete:
        # Remove already included seqids to just retrieve information for added sequences
        seqinfo.remove_seqids(kept_seqids | removed_seqids)
    else:
        # Remove seqids already present in the current version (repeated entries)
        seqinfo.remove_seqids(kept_seqids)

    # load seqinfo file with data (after removing ids)
    if not cfg.seq_info_file:
        load_seqinfo(tmp_output_folder, seqinfo, cfg.path_exec,
                     cfg.seq_info_mode, use_assembly, cfg.quiet)

    if cfg.write_seq_info_file:
        seqinfo.write(cfg.output_db_prefix + ".seqinfo.txt"
                      if cfg.output_db_prefix else cfg.db_prefix + ".seqinfo.txt")

    # save set of current binids
    previous_binids = set(bins.get_binids())
    # remove seqids from bins if performing update complete
    if cfg.update_complete and removed_seqids:
        bins.remove_seqids(removed_seqids)
    # save set of kept binids after removal
    kept_binids = set(bins.get_binids())

    # Set up taxonomy files
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    taxsbp_params = {}
    taxsbp_params["update_table"] = bins.get_csv()
    taxsbp_params["nodes_file"] = ncbi_nodes_file
    taxsbp_params["bin_len"] = gnn.bin_length
    if use_assembly:
        taxsbp_params["bin_exclusive"] = "assembly"
    elif gnn.rank == "taxid":
        taxsbp_params["bin_exclusive"] = "leaves"
    else:
        taxsbp_params["bin_exclusive"] = gnn.rank
    if ncbi_merged_file:
        taxsbp_params["merged_file"] = ncbi_merged_file
    if gnn.fragment_length:
        taxsbp_params["fragment_len"] = gnn.fragment_length
        taxsbp_params["overlap_len"] = gnn.overlap_length
    if use_assembly:
        taxsbp_params["specialization"] = "assembly"
    taxsbp_params["input_table"] = seqinfo.get_csv()
    updated_bins = Bins(taxsbp_ret=taxsbp.taxsbp.pack(**taxsbp_params))
    # bin statistics
    taxsbp_binids = set(updated_bins.get_binids())
    removed_binids = previous_binids.difference(kept_binids | taxsbp_binids)
    new_binids = taxsbp_binids.difference(previous_binids)
    updated_binids = taxsbp_binids.intersection(previous_binids)
    print_log(" - " + str(len(new_binids)) + " bins added, " +
              str(len(updated_binids)) + " bins updated, " +
              str(len(removed_binids)) + " bins removed", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating database files", cfg.quiet)
    # load new taxonomy
    print_log(" - " + (cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"]), cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    # Update and write .tax file
    tax.filter(updated_bins.get_taxids())  # filter only used taxids
    if use_assembly:
        tax.add_nodes(updated_bins.get_specialization_taxid(), "assembly")  # add assembly nodes
    # Load old .tax file into new taxonomy
    tax.merge(Tax([db_prefix["tax"]]))
    # Write .tax file
    tax.write(cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"])
    # TODO - remove entries from .tax from removed entries of the db

    # merge updated and old bins together
    bins.merge(updated_bins)

    # Write .gnn file
    print_log(" - " + (cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"]), cfg.quiet)
    gnn.bins = bins.get_list()  # save updated bins
    gnn.number_of_bins = bins.get_number_of_bins()  # add new bins count
    gnn.write(cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"])

    # Recreate .map file based on the new bins
    print_log(" - " + (cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"]), cfg.quiet)
    bins.write_map_file(cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"],
                        use_assembly)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating index (ganon-build)", cfg.quiet)
    # Write aux. file for ganon
    # This file has to contain all new sequences
    # in case of update_complete
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    if cfg.update_complete:
        # all sequences from the bins with added/removed sequences should be written
        bins.write_acc_bin_file(acc_bin_file, new_binids | updated_binids)
        # If all sequences of a bin were removed and no new sequence added,
        # insert a dummy entry for ganon-build to clear the bin
        if removed_binids:
            with open(acc_bin_file, "a") as abf:
                for b in removed_binids:
                    print(0, 0, 0, b, sep="\t", file=abf)
    else:
        # Only new sequences (updated_bins) either on old or new binids
        updated_bins.write_acc_bin_file(acc_bin_file)

    # Temporary output filter
    tmp_db_prefix_ibf = tmp_output_folder + "ganon.ibf"
    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--update-filter-file " + db_prefix["ibf"],
        "--seqid-bin-file " + acc_bin_file,
        "--output-filter-file " + tmp_db_prefix_ibf,
        "--threads " + str(cfg.threads),
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join(input_files) if input_files else "",
        "--directory-reference-files " + cfg.input_directory if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else "",
        "--update-complete" if cfg.update_complete else ""
    ])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # move IBF to final location
    shutil.move(tmp_db_prefix_ibf,
                cfg.output_db_prefix + ".ibf" if cfg.output_db_prefix else db_prefix["ibf"])

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
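# Illustrative only: both update() variants above assemble the ganon-build
# command by joining a list in which disabled options contribute empty
# strings (leaving extra spaces in the final command, which the shell then
# collapses). A sketch of the same pattern that drops empty parts instead,
# under made-up values:
def _demo_build_cmd():  # hypothetical helper, not called anywhere
    threads, verbose = 4, False
    parts = ["ganon-build",
             "--threads " + str(threads),
             "--verbose" if verbose else ""]
    assert " ".join(filter(None, parts)) == "ganon-build --threads 4"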
def build(cfg):
    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found")
        return False

    # Set db prefixes
    db_prefix = {prefix: cfg.db_prefix + "." + prefix
                 for prefix in ["ibf", "map", "tax", "gnn"]}

    # Set temporary working folder
    tmp_output_folder = cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder):
        return False

    # Set assembly mode
    use_assembly = True if cfg.rank == "assembly" else False

    # Set up taxonomy
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Parsing taxonomy", cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    # Load seqids and generate seqinfo
    if cfg.seq_info_file:
        seqinfo = load_seqids(seq_info_file=cfg.seq_info_file, quiet=cfg.quiet)
    else:
        seqinfo = load_seqids(files=input_files, quiet=cfg.quiet)
        load_seqinfo(tmp_output_folder, seqinfo, cfg.path_exec,
                     cfg.seq_info_mode, use_assembly, cfg.quiet)
    if cfg.write_seq_info_file:
        seqinfo.write(cfg.db_prefix + ".seqinfo.txt")

    # check sequences compared to bins
    added_seqids, _, _ = check_updated_seqids(set(seqinfo.get_seqids()), set())
    print_log("Build: adding " + str(len(added_seqids)) + " sequences", cfg.quiet)
    print_log("", cfg.quiet)

    # Set bin length
    if cfg.bin_length:  # user defined
        bin_length = cfg.bin_length
    else:
        tx = time.time()
        print_log("Calculating best bin length", cfg.quiet)
        bin_length, approx_size, n_bins = estimate_bin_len_size(cfg, seqinfo, tax, use_assembly)
        if bin_length <= 0:
            bin_length = 1000000
            print_log("WARNING: could not estimate bin length, using default of " + str(bin_length) + "bp")
        else:
            print_log(" - bin length: " + str(bin_length) + "bp (approx: " +
                      str(n_bins) + " bins / " +
                      str("{0:.2f}".format(approx_size)) + "MB)", cfg.quiet)
        print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    # Set fragment length
    if cfg.fragment_length == -1:  # if == -1, set default
        fragment_length = bin_length - cfg.overlap_length
    elif cfg.fragment_length == 0:  # if == 0, deactivate
        fragment_length = 0
    else:  # user input
        fragment_length = cfg.fragment_length - cfg.overlap_length

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    taxsbp_params = {}
    taxsbp_params["nodes_file"] = ncbi_nodes_file
    taxsbp_params["bin_len"] = bin_length
    if use_assembly:
        taxsbp_params["bin_exclusive"] = "assembly"
    elif cfg.rank == "taxid":
        taxsbp_params["bin_exclusive"] = "leaves"
    else:
        taxsbp_params["bin_exclusive"] = cfg.rank
    if ncbi_merged_file:
        taxsbp_params["merged_file"] = ncbi_merged_file
    if fragment_length:
        taxsbp_params["fragment_len"] = fragment_length
        taxsbp_params["overlap_len"] = cfg.overlap_length
    if use_assembly:
        taxsbp_params["specialization"] = "assembly"
    taxsbp_params["input_table"] = seqinfo.get_csv()
    bins = Bins(taxsbp_ret=taxsbp.taxsbp.pack(**taxsbp_params))
    del taxsbp_params

    # bin statistics
    actual_number_of_bins = bins.get_number_of_bins()
    optimal_number_of_bins = optimal_bins(actual_number_of_bins)
    max_length_bin = bins.get_max_bin_length()
    # approximate the number of unique k-mers by assuming they are all unique
    max_kmer_count = max_length_bin - cfg.kmer_size + 1
    print_log(" - " + str(actual_number_of_bins) + " bins created", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Building database files", cfg.quiet)
    # Write .map file
    print_log(" - " + db_prefix["map"], cfg.quiet)
    bins.write_map_file(db_prefix["map"], use_assembly)
    # Write .tax file
    print_log(" - " + db_prefix["tax"], cfg.quiet)
    tax.filter(bins.get_taxids())  # filter only used taxids
    if use_assembly:
        tax.add_nodes(bins.get_specialization_taxid(), "assembly")  # add assembly nodes
    tax.write(db_prefix["tax"])
    # Write .gnn file
    print_log(" - " + db_prefix["gnn"], cfg.quiet)
    gnn = Gnn(kmer_size=cfg.kmer_size,
              hash_functions=cfg.hash_functions,
              number_of_bins=actual_number_of_bins,
              rank=cfg.rank,
              bin_length=bin_length,
              fragment_length=fragment_length,
              overlap_length=cfg.overlap_length,
              bins=bins.get_list())
    gnn.write(db_prefix["gnn"])
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    print_log("Building index (ganon-build)", cfg.quiet)
    # define bloom filter size based on given false positive rate
    MBinBits = 8388608
    print_log(" - max unique " + str(cfg.kmer_size) + "-mers: " + str(max_kmer_count), cfg.quiet)
    if not cfg.fixed_bloom_size:
        bin_size_bits = math.ceil(-(1 / (
            (1 - cfg.max_fp**(1 / float(cfg.hash_functions)))**
            (1 / float(cfg.hash_functions * max_kmer_count)) - 1)))
        print_log(" - IBF calculated size with fp<=" + str(cfg.max_fp) + ": " +
                  str("{0:.2f}".format((bin_size_bits * optimal_number_of_bins) / MBinBits)) +
                  "MB (" + str(bin_size_bits) + " bits/bin * " +
                  str(optimal_number_of_bins) + " optimal bins [" +
                  str(actual_number_of_bins) + " real bins])", cfg.quiet)
    else:
        bin_size_bits = math.ceil((cfg.fixed_bloom_size * MBinBits) / optimal_number_of_bins)
        estimated_max_fp = (1 - ((1 - (1 / float(bin_size_bits)))**(
            cfg.hash_functions * max_kmer_count)))**cfg.hash_functions
        print_log(" - IBF calculated max. fp with size=" + str(cfg.fixed_bloom_size) + "MB: " +
                  str("{0:.2f}".format(estimated_max_fp)) + " (" +
                  str(optimal_number_of_bins) + " optimal bins [" +
                  str(actual_number_of_bins) + " real bins])", cfg.quiet)

    # Write aux. file for ganon
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    bins.write_acc_bin_file(acc_bin_file)

    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--seqid-bin-file " + acc_bin_file,
        "--filter-size-bits " + str(bin_size_bits * optimal_number_of_bins)
        if cfg.max_fp else "--filter-size " + str(cfg.fixed_bloom_size),
        "--kmer-size " + str(cfg.kmer_size),
        "--hash-functions " + str(cfg.hash_functions),
        "--threads " + str(cfg.threads),
        "--output-filter-file " + db_prefix["ibf"],
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join(input_files) if input_files else "",
        "--directory-reference-files " + cfg.input_directory if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else ""
    ])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True