Example 1
def update(cfg):
    tx = time.time()

    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found", cfg.quiet)
        return False

    # Set db prefixes
    db_prefix = {
        prefix: cfg.db_prefix + "." + prefix
        for prefix in ["ibf", "map", "tax", "gnn"]
    }

    # Set temporary working folder (current or new output)
    tmp_output_folder = cfg.output_db_prefix + "_tmp/" if cfg.output_db_prefix else cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder): return False

    # Load .gnn file
    gnn = Gnn(file=db_prefix["gnn"])

    # If specialization was set on database
    if gnn.specialization:
        # if not provided by the user, use the definition from the database
        if not cfg.specialization: cfg.specialization = gnn.specialization
        print_log("Using --specialization " + cfg.specialization, cfg.quiet)
    else:
        if cfg.specialization:
            # If user defined specialization on update but database has none
            print_log(
                "ERROR: not possible to update a database with --specialization if it was built without it",
                cfg.quiet)
            return False

    # load bins
    bins = Bins(taxsbp_ret=gnn.bins,
                use_specialization=bool(cfg.specialization))

    # load seqinfo (file or seqids)
    seqinfo = load_seqinfo(cfg, input_files)

    # check sequences compared to bins
    added_seqids, removed_seqids, kept_seqids = check_updated_seqids(
        set(seqinfo.get_seqids()), set(bins.get_seqids()))
    # Ignore removed sequences if not doing complete update
    if cfg.update_complete:
        print_log(
            "Update: adding " + str(len(added_seqids)) +
            " sequences, removing " + str(len(removed_seqids)) +
            " sequences, keeping " + str(len(kept_seqids)) + " sequences",
            cfg.quiet)
    else:
        removed_seqids = []
        print_log(
            "Update: adding " + str(len(added_seqids)) +
            " sequences, ignoring " + str(len(kept_seqids)) +
            " repeated sequences", cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids and not removed_seqids:
        print_log("Nothing to update", cfg.quiet)
        rm_tmp_folder(tmp_output_folder)
        return False

    if cfg.update_complete:
        # Remove already included seqids to just retrieve information for added sequences
        seqinfo.remove_seqids(kept_seqids | removed_seqids)
    else:
        # Remove seqids already present in the current version (repeated entries)
        seqinfo.remove_seqids(kept_seqids)

    # retrieve sequence information (after removing invalid seqids)
    if not cfg.seq_info_file:
        retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg)

    # Check for valid specialization
    if cfg.specialization:
        replaced_spec = seqinfo.validate_specialization()
        if replaced_spec:
            print_log(
                str(replaced_spec) +
                " invalid specialization entries (sequence accession used instead)\n",
                cfg.quiet)

    if not cfg.seq_info_file and cfg.write_seq_info_file:
        seqinfo.write(cfg.output_db_prefix + ".seqinfo.txt"
                      if cfg.output_db_prefix else cfg.db_prefix + ".seqinfo.txt")

    # save set of current binids
    previous_binids = set(bins.get_binids())
    # remove seqids from bins if performing update complete
    if cfg.update_complete and removed_seqids:
        bins.remove_seqids(removed_seqids)
    # save set of kept binids after removal
    kept_binids = set(bins.get_binids())

    # Set up taxonomy files
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    updated_bins = run_taxsbp(seqinfo,
                              gnn.bin_length,
                              gnn.fragment_length,
                              gnn.overlap_length,
                              gnn.rank,
                              cfg.specialization,
                              ncbi_nodes_file,
                              ncbi_merged_file,
                              cfg.verbose,
                              bins=bins)
    # bin statistics
    taxsbp_binids = set(updated_bins.get_binids())
    removed_binids = previous_binids.difference(kept_binids | taxsbp_binids)
    new_binids = taxsbp_binids.difference(previous_binids)
    updated_binids = taxsbp_binids.intersection(previous_binids)
    print_log(
        " - " + str(len(new_binids)) + " bins added, " +
        str(len(updated_binids)) + " bins updated, " +
        str(len(removed_binids)) + " bins removed", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    tx = time.time()
    print_log("Updating database files", cfg.quiet)
    # load new taxonomy
    print_log(
        " - " + (cfg.output_db_prefix + ".tax"
                 if cfg.output_db_prefix else db_prefix["tax"]), cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    # Update and write .tax file
    # filter only used taxids
    tax.filter(updated_bins.get_taxids())
    # add specialization nodes
    if cfg.specialization:
        tax.add_nodes(updated_bins.get_specialization_taxid(),
                      cfg.specialization)

    # Load old .tax file into new taxonomy
    tax.merge(Tax([db_prefix["tax"]]))
    # Write .tax file
    tax.write(cfg.output_db_prefix +
              ".tax" if cfg.output_db_prefix else db_prefix["tax"])
    # TODO - remove .tax entries for sequences removed from the db

    # merge updated and old bins together
    bins.merge(updated_bins)

    # Write .gnn file
    print_log(
        " - " + (cfg.output_db_prefix + ".gnn"
                 if cfg.output_db_prefix else db_prefix["gnn"]), cfg.quiet)
    gnn.bins = bins.get_list()  # save updated bins
    gnn.number_of_bins = bins.get_number_of_bins()  # add new bins count
    # set new specialization to gnn
    gnn.specialization = cfg.specialization
    gnn.write(cfg.output_db_prefix +
              ".gnn" if cfg.output_db_prefix else db_prefix["gnn"])

    # Recreate .map file based on the new bins
    print_log(
        " - " + (cfg.output_db_prefix + ".map"
                 if cfg.output_db_prefix else db_prefix["map"]), cfg.quiet)
    bins.write_map_file(
        cfg.output_db_prefix + ".map"
        if cfg.output_db_prefix else db_prefix["map"],
        use_specialization=bool(cfg.specialization))
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    tx = time.time()
    print_log("Updating index (ganon-build)", cfg.quiet)

    # Write aux. file for ganon
    # This file has to contain all new sequences
    # in case of update_complete
    acc_bin_file = tmp_output_folder + "acc_bin.txt"

    if cfg.update_complete:
        # all sequences from the bins with added/removed sequences should be written
        bins.write_acc_bin_file(acc_bin_file, new_binids | updated_binids)
        # If all sequences of a bin were removed and no new sequence added
        # insert a dummy entry for ganon-build to clear the bin
        if removed_binids:
            with open(acc_bin_file, "a") as abf:
                for b in removed_binids:
                    print(0, 0, 0, b, sep="\t", file=abf)
    else:
        # Only new sequences (updated_bins) either on old or new binids
        updated_bins.write_acc_bin_file(acc_bin_file)

    # Update with same values used for build
    kmer_size = gnn.kmer_size
    window_size = gnn.window_size
    hash_functions = gnn.hash_functions

    # Free memory for build
    del seqinfo
    del bins
    del updated_bins
    del tax
    del gnn

    # Temporary output filter
    tmp_db_prefix_ibf = tmp_output_folder + "ganon.ibf"
    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--update-filter-file " + db_prefix["ibf"],
        "--kmer-size " + str(kmer_size),
        "--window-size " + str(window_size) if window_size else "",
        "--count-hashes" if window_size else "",
        "--hash-functions " + str(hash_functions),
        "--seqid-bin-file " + acc_bin_file,
        "--output-filter-file " + tmp_db_prefix_ibf,
        "--threads " + str(cfg.threads),
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join(input_files)
        if input_files and not cfg.input_directory else "",
        "--directory-reference-files " + cfg.input_directory
        if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else "",
        "--update-complete" if cfg.update_complete else ""
    ])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # move IBF to final location
    shutil.move(
        tmp_db_prefix_ibf, cfg.output_db_prefix +
        ".ibf" if cfg.output_db_prefix else db_prefix["ibf"])

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
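
The bin bookkeeping in this example reduces to plain set arithmetic on bin ids. A minimal standalone sketch of the same expressions (the ids below are made up purely for illustration):

# Hypothetical bin ids, chosen only to illustrate the expressions in update()
previous_binids = {0, 1, 2, 3}  # bins in the database before the update
kept_binids = {0, 1, 3}         # bin 2 lost all of its sequences
taxsbp_binids = {1, 4}          # bins touched or created by TaxSBP

# Same expressions as in update()
removed_binids = previous_binids.difference(kept_binids | taxsbp_binids)
new_binids = taxsbp_binids.difference(previous_binids)
updated_binids = taxsbp_binids.intersection(previous_binids)

assert removed_binids == {2}   # emptied and untouched by TaxSBP
assert new_binids == {4}       # created by TaxSBP
assert updated_binids == {1}   # existed before and received new sequences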
Example 2
def update(cfg):
    tx = time.time()
    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found")
        return False

    # Set db prefixes
    db_prefix = {
        prefix: cfg.db_prefix + "." + prefix
        for prefix in ["ibf", "map", "tax", "gnn"]
    }

    # Set temporary working folder (current or new output)
    tmp_output_folder = cfg.output_db_prefix + "_tmp/" if cfg.output_db_prefix else cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder): return False

    # Load .gnn file
    gnn = Gnn(file=db_prefix["gnn"])
    # Set assembly mode
    use_assembly = gnn.rank == "assembly"

    # load bins
    bins = Bins(taxsbp_ret=gnn.bins)

    # Load seqids and generate seqinfo
    if cfg.seq_info_file:
        seqinfo = load_seqids(seq_info_file=cfg.seq_info_file, quiet=cfg.quiet)
    else:
        seqinfo = load_seqids(files=input_files, quiet=cfg.quiet)

    # check sequences compared to bins
    added_seqids, removed_seqids, kept_seqids = check_updated_seqids(
        set(seqinfo.get_seqids()), set(bins.get_seqids()))
    # Ignore removed sequences if not doing complete update
    if cfg.update_complete:
        print_log(
            "Update: adding " + str(len(added_seqids)) +
            " sequences, removing " + str(len(removed_seqids)) +
            " sequences, keeping " + str(len(kept_seqids)) + " sequences",
            cfg.quiet)
    else:
        removed_seqids = []
        print_log(
            "Update: adding " + str(len(added_seqids)) +
            " sequences, ignoring " + str(len(kept_seqids)) +
            " repeated sequences", cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids and not removed_seqids:
        print_log("ERROR: Nothing to update")
        rm_tmp_folder(tmp_output_folder)
        return False

    if cfg.update_complete:
        # Remove already included seqids to just retrieve information for added sequences
        seqinfo.remove_seqids(kept_seqids | removed_seqids)
    else:
        # Remove seqids already present in the current version (repeated entries)
        seqinfo.remove_seqids(kept_seqids)

    # load seqinfo file with data (after removing ids)
    if not cfg.seq_info_file:
        load_seqinfo(tmp_output_folder, seqinfo, cfg.path_exec,
                     cfg.seq_info_mode, use_assembly, cfg.quiet)
        if cfg.write_seq_info_file:
            seqinfo.write(cfg.output_db_prefix + ".seqinfo.txt"
                          if cfg.output_db_prefix else cfg.db_prefix + ".seqinfo.txt")

    # save set of current binids
    previous_binids = set(bins.get_binids())
    # remove seqids from bins if performing update complete
    if cfg.update_complete and removed_seqids:
        bins.remove_seqids(removed_seqids)
    # save set of kept binids after removal
    kept_binids = set(bins.get_binids())

    # Set up taxonomy files
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    taxsbp_params = {}
    taxsbp_params["update_table"] = bins.get_csv()
    taxsbp_params["nodes_file"] = ncbi_nodes_file
    taxsbp_params["bin_len"] = gnn.bin_length
    if use_assembly:
        taxsbp_params["bin_exclusive"] = "assembly"
    elif gnn.rank == "taxid":
        taxsbp_params["bin_exclusive"] = "leaves"
    else:
        taxsbp_params["bin_exclusive"] = gnn.rank
    if ncbi_merged_file: taxsbp_params["merged_file"] = ncbi_merged_file
    if gnn.fragment_length:
        taxsbp_params["fragment_len"] = gnn.fragment_length
        taxsbp_params["overlap_len"] = gnn.overlap_length
    if use_assembly: taxsbp_params["specialization"] = "assembly"
    taxsbp_params["input_table"] = seqinfo.get_csv()
    updated_bins = Bins(taxsbp_ret=taxsbp.taxsbp.pack(**taxsbp_params))
    # bin statistics
    taxsbp_binids = set(updated_bins.get_binids())
    removed_binids = previous_binids.difference(kept_binids | taxsbp_binids)
    new_binids = taxsbp_binids.difference(previous_binids)
    updated_binids = taxsbp_binids.intersection(previous_binids)
    print_log(
        " - " + str(len(new_binids)) + " bins added, " +
        str(len(updated_binids)) + " bins updated, " +
        str(len(removed_binids)) + " bins removed", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    tx = time.time()
    print_log("Updating database files", cfg.quiet)
    # load new taxonomy
    print_log(
        " - " + (cfg.output_db_prefix + ".tax"
                 if cfg.output_db_prefix else db_prefix["tax"]), cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    # Update and write .tax file
    tax.filter(updated_bins.get_taxids())  # filter only used taxids
    if use_assembly:
        tax.add_nodes(updated_bins.get_specialization_taxid(),
                      "assembly")  # add assembly nodes
    # Load old .tax file into new taxonomy
    tax.merge(Tax([db_prefix["tax"]]))
    # Write .tax file
    tax.write(cfg.output_db_prefix +
              ".tax" if cfg.output_db_prefix else db_prefix["tax"])
    # TODO - remove .tax entries for sequences removed from the db

    # merge updated and old bins together
    bins.merge(updated_bins)

    # Write .gnn file
    print_log(
        " - " + (cfg.output_db_prefix + ".gnn"
                 if cfg.output_db_prefix else db_prefix["gnn"]), cfg.quiet)
    gnn.bins = bins.get_list()  # save updated bins
    gnn.number_of_bins = bins.get_number_of_bins()  # add new bins count
    gnn.write(cfg.output_db_prefix +
              ".gnn" if cfg.output_db_prefix else db_prefix["gnn"])

    # Recreate .map file based on the new bins
    print_log(
        " - " + (cfg.output_db_prefix + ".map"
                 if cfg.output_db_prefix else db_prefix["map"]), cfg.quiet)
    bins.write_map_file(
        cfg.output_db_prefix +
        ".map" if cfg.output_db_prefix else db_prefix["map"], use_assembly)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    tx = time.time()
    print_log("Updating index (ganon-build)", cfg.quiet)

    # Write aux. file for ganon
    # This file has to contain all new sequences
    # in case of update_complete
    acc_bin_file = tmp_output_folder + "acc_bin.txt"

    if cfg.update_complete:
        # all sequences from the bins with added/removed sequences should be written
        bins.write_acc_bin_file(acc_bin_file, new_binids | updated_binids)
        # If all sequences of a bin were removed and no new sequence added
        # insert a dummy entry for ganon-build to clear the bin
        if removed_binids:
            with open(acc_bin_file, "a") as abf:
                for b in removed_binids:
                    print(0, 0, 0, b, sep="\t", file=abf)
    else:
        # Only new sequences (updated_bins) either on old or new binids
        updated_bins.write_acc_bin_file(acc_bin_file)

    # Temporary output filter
    tmp_db_prefix_ibf = tmp_output_folder + "ganon.ibf"
    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--update-filter-file " + db_prefix["ibf"],
        "--seqid-bin-file " + acc_bin_file,
        "--output-filter-file " + tmp_db_prefix_ibf,
        "--threads " + str(cfg.threads),
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join(input_files) if input_files else "",
        "--directory-reference-files " + cfg.input_directory
        if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else "",
        "--update-complete" if cfg.update_complete else ""
    ])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # move IBF to final location
    shutil.move(
        tmp_db_prefix_ibf, cfg.output_db_prefix +
        ".ibf" if cfg.output_db_prefix else db_prefix["ibf"])

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
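
check_updated_seqids itself is not shown on this page. Given how its three return values are used above, a plausible minimal implementation is plain set arithmetic over accessions (a sketch under that assumption, not the actual ganon code):

def check_updated_seqids(new_seqids, current_seqids):
    # added: in the input but not yet in the database
    # removed: in the database but missing from the input
    # kept: present in both
    added_seqids = new_seqids - current_seqids
    removed_seqids = current_seqids - new_seqids
    kept_seqids = new_seqids & current_seqids
    return added_seqids, removed_seqids, kept_seqids

# Example: one accession added, one removed, one kept
added, removed, kept = check_updated_seqids({"A1", "A2"}, {"A2", "A3"})
assert (added, removed, kept) == ({"A1"}, {"A3"}, {"A2"})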
Example 3
def build(cfg):
    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found", cfg.quiet)
        return False

    # Set db prefixes
    db_prefix = {
        prefix: cfg.db_prefix + "." + prefix
        for prefix in ["ibf", "map", "tax", "gnn"]
    }

    # Set temporary working folder
    tmp_output_folder = cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder): return False

    # Set up taxonomy
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    # Parse .tax
    tx = time.time()
    print_log("Parsing taxonomy", cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    # load seqinfo (file or seqids)
    seqinfo = load_seqinfo(cfg, input_files)

    # Retrieve sequence information
    if not cfg.seq_info_file:
        retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg)

    # Check for valid specialization
    if cfg.specialization:
        replaced_spec = seqinfo.validate_specialization()
        if replaced_spec:
            print_log(
                str(replaced_spec) +
                " invalid specialization entries (sequence accession used instead)\n",
                cfg.quiet)

    # Write seq-info-file
    if not cfg.seq_info_file and cfg.write_seq_info_file:
        seqinfo.write(cfg.db_prefix + ".seqinfo.txt")

    # check sequences (building from scratch, so all valid seqids count as added)
    added_seqids, _, _ = check_updated_seqids(set(seqinfo.get_seqids()), set())
    print_log("Build: adding " + str(len(added_seqids)) + " sequences",
              cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids:
        print_log("No valid seq. info to build", cfg.quiet)
        rm_tmp_folder(tmp_output_folder)
        return False

    # Set or calculate best --bin-length
    if cfg.bin_length:
        bin_length = cfg.bin_length
    else:
        tx = time.time()
        print_log("Simulating parameters", cfg.quiet)
        bin_length = estimate_bin_length(cfg, seqinfo, tax)
        print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
                  cfg.quiet)

    # Set fragment length
    if cfg.fragment_length == -1:  # if ==-1 set default
        fragment_length = bin_length - cfg.overlap_length
    elif cfg.fragment_length == 0:  # if ==0 deactivate
        fragment_length = 0
    else:  # user input
        fragment_length = cfg.fragment_length - cfg.overlap_length

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    bins = run_taxsbp(seqinfo, bin_length, fragment_length, cfg.overlap_length,
                      cfg.rank, cfg.specialization, ncbi_nodes_file,
                      ncbi_merged_file, cfg.verbose)
    # bin statistics
    actual_number_of_bins = bins.get_number_of_bins()
    optimal_number_of_bins = optimal_bins(actual_number_of_bins)
    max_length_bin = bins.get_max_bin_length()
    max_kmer_count = estimate_elements(max_length_bin, cfg.kmer_size,
                                       cfg.window_size)
    print_log(" - " + str(actual_number_of_bins) + " bins created", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    # Get optimal parameters from user input and taxsbp result
    if cfg.filter_size:
        optimal_params = derive_bf_params(
            max_kmer_count, 0,
            math.ceil(mb2bits(cfg.filter_size) / optimal_number_of_bins),
            cfg.hash_functions)
    else:
        optimal_params = derive_bf_params(max_kmer_count, cfg.max_fp, 0,
                                          cfg.hash_functions)

    # When fixed size is too small
    if optimal_params["hash_functions"] == 0:
        optimal_params["hash_functions"] = 3

    print_log("Optimal bins: " + str(optimal_number_of_bins), cfg.quiet)
    print_log(
        "Max. false positive: " +
        str("{0:.5f}".format(optimal_params["false_positive"])), cfg.quiet)
    print_log("Hash functions: " + str(optimal_params["hash_functions"]),
              cfg.quiet)
    if cfg.window_size:
        # Check lower bound for minimizers with estimated bin-length
        min_mini = math.floor(((max_length_bin - cfg.kmer_size + 1) /
                               (cfg.window_size - cfg.kmer_size + 1)))
        min_size_mini = derive_bf_params(min_mini,
                                         optimal_params["false_positive"], 0,
                                         optimal_params["hash_functions"])
        print_log(
            "Possible elements per bin: " + str(min_mini) + ".." +
            str(max_kmer_count), cfg.quiet)
        print_log(
            "Possible filter sizes: " + str("{0:.2f}".format(
                bits2mb(min_size_mini["size_bits"] * optimal_number_of_bins)))
            + "MB.." + str("{0:.2f}".format(
                bits2mb(optimal_params["size_bits"] * optimal_number_of_bins)))
            + "MB", cfg.quiet)
    else:
        print_log("Elements per bin: " + str(max_kmer_count), cfg.quiet)
        print_log(
            "Filter size: " + str("{0:.2f}".format(
                bits2mb(optimal_params["size_bits"] * optimal_number_of_bins)))
            + "MB", cfg.quiet)

    print_log("")

    # Build database files (map, tax, gnn)
    tx = time.time()
    print_log("Building database files", cfg.quiet)
    # Write .map file
    print_log(" - " + db_prefix["map"], cfg.quiet)
    bins.write_map_file(
        db_prefix["map"],
        use_specialization=bool(cfg.specialization))

    # Write .tax file
    print_log(" - " + db_prefix["tax"], cfg.quiet)
    # filter only used taxids
    tax.filter(bins.get_taxids())
    # add specialization nodes
    if cfg.specialization:
        tax.add_nodes(bins.get_specialization_taxid(), cfg.specialization)
    tax.write(db_prefix["tax"])

    if cfg.specialization and cfg.rank != "leaves":
        print_log(
            " - --rank is set to leaves when using specialization values",
            cfg.quiet)
        cfg.rank = "leaves"

    # Write .gnn file
    print_log(" - " + db_prefix["gnn"], cfg.quiet)
    gnn = Gnn(kmer_size=cfg.kmer_size,
              window_size=cfg.window_size,
              hash_functions=optimal_params["hash_functions"],
              number_of_bins=actual_number_of_bins,
              rank=cfg.rank,
              specialization=cfg.specialization,
              bin_length=bin_length,
              fragment_length=fragment_length,
              overlap_length=cfg.overlap_length,
              bins=bins.get_list())
    gnn.write(db_prefix["gnn"])
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    print_log("Building index (ganon-build)", cfg.quiet)
    # Write aux. file for ganon
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    bins.write_acc_bin_file(acc_bin_file)

    # Free memory for build
    del seqinfo
    del bins
    del tax
    del gnn

    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--seqid-bin-file " + acc_bin_file,
        "--bin-size-bits " + str(optimal_params["size_bits"])
        if cfg.filter_size else "--false-positive " + str(cfg.max_fp),
        "--kmer-size " + str(cfg.kmer_size),
        "--window-size " + str(cfg.window_size) if cfg.window_size else "",
        "--count-hashes" if cfg.window_size else "",
        "--hash-functions " + str(optimal_params["hash_functions"]),
        "--threads " + str(cfg.threads),
        "--output-filter-file " + db_prefix["ibf"],
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join(input_files)
        if input_files and not cfg.input_directory else "",
        "--directory-reference-files " + cfg.input_directory
        if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else ""
    ])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
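
The lower bound min_mini used in this example comes from the minimizer scheme: a bin of length L yields L - k + 1 k-mers, and every window of w - k + 1 consecutive k-mers contributes at least one minimizer. A standalone check with illustrative numbers (not from a real database):

import math

max_length_bin = 10_000_000  # longest bin in bp (illustrative)
kmer_size = 19
window_size = 32

max_kmers = max_length_bin - kmer_size + 1                  # 9999982
min_minimizers = math.floor(max_kmers /
                            (window_size - kmer_size + 1))  # 714284

# The true number of stored elements per bin falls somewhere in
# [min_minimizers, max_kmers], which is exactly the range printed
# as "Possible elements per bin" in build().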
Example 4
def build(cfg):
    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found")
        return False

    # Set db prefixes
    db_prefix = {
        prefix: cfg.db_prefix + "." + prefix
        for prefix in ["ibf", "map", "tax", "gnn"]
    }

    # Set temporary working folder
    tmp_output_folder = cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder): return False

    # Set assembly mode
    use_assembly = cfg.rank == "assembly"

    # Set up taxonomy
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Parsing taxonomy", cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    # Load seqids and generate seqinfo
    if cfg.seq_info_file:
        seqinfo = load_seqids(seq_info_file=cfg.seq_info_file, quiet=cfg.quiet)
    else:
        seqinfo = load_seqids(files=input_files, quiet=cfg.quiet)
        load_seqinfo(tmp_output_folder, seqinfo, cfg.path_exec,
                     cfg.seq_info_mode, use_assembly, cfg.quiet)
        if cfg.write_seq_info_file:
            seqinfo.write(cfg.db_prefix + ".seqinfo.txt")
    # check sequences (building from scratch, so all valid seqids count as added)
    added_seqids, _, _ = check_updated_seqids(set(seqinfo.get_seqids()), set())
    print_log("Build: adding " + str(len(added_seqids)) + " sequences",
              cfg.quiet)
    print_log("", cfg.quiet)

    # Set bin length
    if cfg.bin_length:  # user defined
        bin_length = cfg.bin_length
    else:
        tx = time.time()
        print_log("Calculating best bin length", cfg.quiet)
        bin_length, approx_size, n_bins = estimate_bin_len_size(
            cfg, seqinfo, tax, use_assembly)
        if bin_length <= 0:
            bin_length = 1000000
            print_log(
                "WARNING: could not estimate bin length, using default of " +
                str(bin_length) + "bp")
        else:
            print_log(
                " - bin length: " + str(bin_length) + "bp (approx: " +
                str(n_bins) + " bins / " + str("{0:.2f}".format(approx_size)) +
                "MB)", cfg.quiet)
        print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
                  cfg.quiet)

    # Set fragment length
    if cfg.fragment_length == -1:  # if ==-1 set default
        fragment_length = bin_length - cfg.overlap_length
    elif cfg.fragment_length == 0:  # if ==0 deactivate
        fragment_length = 0
    else:  # user input
        fragment_length = cfg.fragment_length - cfg.overlap_length

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    taxsbp_params = {}
    taxsbp_params["nodes_file"] = ncbi_nodes_file
    taxsbp_params["bin_len"] = bin_length
    if use_assembly:
        taxsbp_params["bin_exclusive"] = "assembly"
    elif cfg.rank == "taxid":
        taxsbp_params["bin_exclusive"] = "leaves"
    else:
        taxsbp_params["bin_exclusive"] = cfg.rank
    if ncbi_merged_file: taxsbp_params["merged_file"] = ncbi_merged_file
    if fragment_length:
        taxsbp_params["fragment_len"] = fragment_length
        taxsbp_params["overlap_len"] = cfg.overlap_length
    if use_assembly: taxsbp_params["specialization"] = "assembly"
    taxsbp_params["input_table"] = seqinfo.get_csv()
    bins = Bins(taxsbp_ret=taxsbp.taxsbp.pack(**taxsbp_params))
    del taxsbp_params
    # bin statistics
    actual_number_of_bins = bins.get_number_of_bins()
    optimal_number_of_bins = optimal_bins(actual_number_of_bins)
    max_length_bin = bins.get_max_bin_length()
    max_kmer_count = max_length_bin - cfg.kmer_size + 1  # approximate the number of unique k-mers by assuming they are all unique
    print_log(" - " + str(actual_number_of_bins) + " bins created", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    tx = time.time()
    print_log("Building database files", cfg.quiet)

    # Write .map file
    print_log(" - " + db_prefix["map"], cfg.quiet)
    bins.write_map_file(db_prefix["map"], use_assembly)

    # Write .tax file
    print_log(" - " + db_prefix["tax"], cfg.quiet)
    tax.filter(bins.get_taxids())  # filter only used taxids
    if use_assembly:
        tax.add_nodes(bins.get_specialization_taxid(),
                      "assembly")  # add assembly nodes
    tax.write(db_prefix["tax"])

    # Write .gnn file
    print_log(" - " + db_prefix["gnn"], cfg.quiet)
    gnn = Gnn(kmer_size=cfg.kmer_size,
              hash_functions=cfg.hash_functions,
              number_of_bins=actual_number_of_bins,
              rank=cfg.rank,
              bin_length=bin_length,
              fragment_length=fragment_length,
              overlap_length=cfg.overlap_length,
              bins=bins.get_list())
    gnn.write(db_prefix["gnn"])
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n",
              cfg.quiet)

    print_log("Building index (ganon-build)", cfg.quiet)
    # define bloom filter size based on given false positive
    MBinBits = 8388608  # bits in one MB (8 * 1024 * 1024)
    print_log(
        " - max unique " + str(cfg.kmer_size) + "-mers: " +
        str(max_kmer_count), cfg.quiet)
    if not cfg.fixed_bloom_size:
        bin_size_bits = math.ceil(-(1 / (
            (1 - cfg.max_fp**(1 / float(cfg.hash_functions)))**
            (1 / float(cfg.hash_functions * max_kmer_count)) - 1)))
        print_log(
            " - IBF calculated size with fp<=" + str(cfg.max_fp) + ": " +
            str("{0:.2f}".format(
                (bin_size_bits * optimal_number_of_bins) / MBinBits)) +
            "MB (" + str(bin_size_bits) + " bits/bin * " +
            str(optimal_number_of_bins) + " optimal bins [" +
            str(actual_number_of_bins) + " real bins])", cfg.quiet)
    else:
        bin_size_bits = math.ceil(
            (cfg.fixed_bloom_size * MBinBits) / optimal_number_of_bins)
        estimated_max_fp = (1 - ((1 - (1 / float(bin_size_bits)))**(
            cfg.hash_functions * max_kmer_count)))**cfg.hash_functions
        print_log(
            " - IBF calculated max. fp with size=" +
            str(cfg.fixed_bloom_size) + "MB: " +
            str("{0:.2f}".format(estimated_max_fp) + " (" +
                str(optimal_number_of_bins) + " optimal bins [" +
                str(actual_number_of_bins) + " real bins])"), cfg.quiet)

    # Write aux. file for ganon
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    bins.write_acc_bin_file(acc_bin_file)

    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--seqid-bin-file " + acc_bin_file,
        "--filter-size-bits " + str(bin_size_bits * optimal_number_of_bins)
        if cfg.max_fp else "--filter-size " + str(cfg.fixed_bloom_size),
        "--kmer-size " + str(cfg.kmer_size),
        "--hash-functions " + str(cfg.hash_functions),
        "--threads " + str(cfg.threads),
        "--output-filter-file " + db_prefix["ibf"],
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join(input_files) if input_files else "",
        "--directory-reference-files " + cfg.input_directory
        if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else ""
    ])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
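
Both sizing branches in this example are the standard Bloom filter formulas, solved in opposite directions: for h hash functions and n elements per bin, a target false-positive rate p requires m = ceil(-1 / ((1 - p^(1/h))^(1/(h*n)) - 1)) bits per bin, while a fixed m bits yields p = (1 - (1 - 1/m)^(h*n))^h. A standalone round-trip check with illustrative numbers:

import math

max_kmer_count = 1_000_000  # elements in the fullest bin (illustrative)
hash_functions = 3
max_fp = 0.05

# bits per bin needed to stay at or below the target false positive rate
bin_size_bits = math.ceil(-(1 / (
    (1 - max_fp**(1 / hash_functions))**
    (1 / (hash_functions * max_kmer_count)) - 1)))

# plugging the size back in recovers (approximately) the target rate
estimated_fp = (1 - (1 - 1 / bin_size_bits)**
                (hash_functions * max_kmer_count))**hash_functions

print(bin_size_bits)           # roughly 6.5 million bits per bin
print(round(estimated_fp, 3))  # ~0.05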