Example #1
def annotate_genes_slave(args):
    """
    https://github.com/czbiohub/iggtools/wiki
    """

    violation = "Please do not call build_pangenome_slave directly.  Violation"
    assert args.zzz_slave_mode, f"{violation}:  Missing --zzz_slave_mode arg."
    assert os.path.isfile(
        args.zzz_slave_toc
    ), f"{violation}: File does not exist: {args.zzz_slave_toc}"

    db = UHGG(args.zzz_slave_toc)
    species_for_genome = db.genomes

    genome_id = args.genomes
    species_id = species_for_genome[genome_id]

    last_output = f"{genome_id}.fna.lz4"
    dest_file = annotations_file(genome_id, species_id, last_output)
    command(f"aws s3 rm --recursive {dest_file.rsplit('/', 1)[0]}")
    output_files = annotate_genome(genome_id, species_id)
    upload_tasks = []
    for o in output_files:
        olz = o + ".lz4"
        if olz != last_output:
            upload_tasks.append((o, annotations_file(genome_id, species_id,
                                                     olz)))

    multithreading_map(upload_star, upload_tasks)

    # Upload this last because it indicates all other work has succeeded.
    upload(drop_lz4(last_output),
           annotations_file(genome_id, species_id, last_output))
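
Several of these examples fan out (local_path, s3_path) pairs through multithreading_map(upload_star, upload_tasks), and this one also strips the .lz4 suffix via drop_lz4 before the final upload. Neither helper appears in the listing; below are minimal sketches, assuming upload(src, dest) is the repo's S3 upload helper and that the names mean what they suggest.

def upload_star(args):
    # Unpack one (local_path, s3_path) task so a single-argument map
    # function can drive the two-argument upload helper.
    src, dest = args
    return upload(src, dest)


def drop_lz4(filename):
    # Strip the trailing ".lz4" so the uncompressed local file can be
    # paired with its compressed destination name.
    assert filename.endswith(".lz4")
    return filename[:-len(".lz4")]
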
def build_marker_genes_slave(args):
    """
    https://github.com/czbiohub/iggtools/wiki
    """

    violation = "Please do not call build_merker_genes_slave directly.  Violation"
    assert args.zzz_slave_mode, f"{violation}:  Missing --zzz_slave_mode arg."
    assert os.path.isfile(args.zzz_slave_toc), f"{violation}: File does not exist: {args.zzz_slave_toc}"
    assert os.path.isfile(args.zzz_slave_marker_genes_hmm), f"{violation}: Marker genes HMM model file does not exist: {args.zzz_slave_marker_genes_hmm}"

    db = UHGG(args.zzz_slave_toc)
    species_for_genome = db.genomes

    genome_id = args.genomes
    species_id = species_for_genome[genome_id]
    marker_genes_hmm = args.zzz_slave_marker_genes_hmm

    output_files = identify_marker_genes(genome_id, species_id, marker_genes_hmm)

    # Upload to S3
    upload_tasks = []
    for o in output_files[:-1]:
        upload_tasks.append((o, destpath(genome_id, species_id, o)))
    multithreading_map(upload_star, upload_tasks)

    # Upload this last because it indicates all other work has succeeded.
    upload(output_files[-1], destpath(genome_id, species_id, output_files[-1]))
Example #3
def build_pangenome_master(args):

    # Fetch table of contents from s3.
    # This will be read separately by each species build subcommand, so we make a local copy.
    local_toc = os.path.basename(outputs.genomes)
    command(f"rm -f {local_toc}")
    command(f"aws s3 cp --only-show-errors {outputs.genomes} {local_toc}")

    db = UHGG(local_toc)
    species = db.species

    def species_work(species_id):
        assert species_id in species, f"Species {species_id} is not in the database."
        species_genomes = species[species_id]

        def destpath(src):
            return pangenome_file(species_id, src + ".lz4")

        # The species build will upload this file last, after everything else is successfully uploaded.
        # Therefore, if this file exists in s3, there is no need to redo the species build.
        dest_file = destpath("gene_info.txt")
        msg = f"Building pangenome for species {species_id} with {len(species_genomes)} total genomes."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for species {species_id} pangenome already exists.  Specify --force to overwrite."
                )
                return
            msg = msg.replace("Building", "Rebuilding")

        with CONCURRENT_SPECIES_BUILDS:
            tsprint(msg)
            slave_log = "pangenome_build.log"
            slave_subdir = str(species_id)
            if not args.debug:
                command(f"rm -rf {slave_subdir}")
            if not os.path.isdir(slave_subdir):
                command(f"mkdir {slave_subdir}")
            # Recursive call via subcommand.  Use subdir, redirect logs.
            slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_pangenome -s {species_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {'--debug' if args.debug else ''} &>> {slave_log}"
            with open(f"{slave_subdir}/{slave_log}", "w") as slog:
                slog.write(msg + "\n")
                slog.write(slave_cmd + "\n")
            try:
                command(slave_cmd)
            finally:
                # Cleanup should not raise exceptions of its own, so as not to interfere with any
                # prior exceptions that may be more informative.  Hence check=False.
                upload(f"{slave_subdir}/{slave_log}",
                       destpath(slave_log),
                       check=False)
                if not args.debug:
                    command(f"rm -rf {slave_subdir}", check=False)

    # Check for destination presence in s3 with up to 10-way concurrency.
    # If destination is absent, commence build with up to 3-way concurrency as constrained by CONCURRENT_SPECIES_BUILDS.
    species_id_list = decode_species_arg(args, species)
    multithreading_map(species_work, species_id_list, num_threads=10)
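
CONCURRENT_SPECIES_BUILDS is only ever used as a context manager above: up to 10 threads probe S3 for existing results, but the expensive builds themselves are admitted through the `with` block. The name and the comment suggest a counting semaphore; a minimal sketch, assuming the 3-way limit stated in the comment:

import threading

# Hypothetical definition: a bounded semaphore doubles as a context
# manager, so "with CONCURRENT_SPECIES_BUILDS:" admits at most three
# concurrent species builds even while ten prober threads run.
CONCURRENT_SPECIES_BUILDS = threading.BoundedSemaphore(3)
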
Example #4
def import_uhgg_master(args):

    # Fetch table of contents from s3.
    # This will be read separately by each species build subcommand, so we make a local copy.
    local_toc = os.path.basename(outputs.genomes)
    command(f"rm -f {local_toc}")
    command(f"aws s3 cp --only-show-errors {outputs.genomes} {local_toc}")

    db = UHGG(local_toc)
    species_for_genome = db.genomes

    def genome_work(genome_id):
        assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
        species_id = species_for_genome[genome_id]

        dest_file = imported_genome_file(genome_id, species_id,
                                         f"{genome_id}.fna.lz4")
        msg = f"Importing genome {genome_id} from species {species_id}."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for genome {genome_id} already exists.  Specify --force to overwrite."
                )
                return
            msg = msg.replace("Importing", "Reimporting")

        tsprint(msg)
        slave_log = "import_uhgg.log"
        slave_subdir = f"{species_id}__{genome_id}"
        if not args.debug:
            command(f"rm -rf {slave_subdir}")
        if not os.path.isdir(slave_subdir):
            command(f"mkdir {slave_subdir}")
        # Recursive call via subcommand.  Use subdir, redirect logs.
        slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools import_uhgg --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {'--debug' if args.debug else ''} &>> {slave_log}"
        with open(f"{slave_subdir}/{slave_log}", "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")
        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere with any
            # prior exceptions that may be more informative.  Hence check=False.
            upload(f"{slave_subdir}/{slave_log}",
                   imported_genome_file(genome_id, species_id,
                                        slave_log + ".lz4"),
                   check=False)
            if not args.debug:
                command(f"rm -rf {slave_subdir}", check=False)

    genome_id_list = decode_genomes_arg(args, species_for_genome)
    multithreading_map(genome_work,
                       genome_id_list,
                       num_threads=CONCURRENT_GENOME_IMPORTS)
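
decode_genomes_arg is not part of the listing. Here is a plausible sketch of how it might expand the --genomes argument into a validated id list; the comma-separated syntax and the "all" shortcut are assumptions, not confirmed behavior.

def decode_genomes_arg(args, species_for_genome):
    # Hypothetical sketch: accept the literal "all" or a comma-separated
    # list of genome ids, validating each against the table of contents.
    if args.genomes == "all":
        return list(species_for_genome.keys())
    genome_ids = args.genomes.split(",")
    for gid in genome_ids:
        assert gid in species_for_genome, f"Unknown genome {gid}"
    return genome_ids
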
Example #5
def midas_run_species(args):

    tempdir = f"{args.outdir}/species/temp/"

    command(f"rm -rf {tempdir}")
    command(f"mkdir -p {tempdir}")

    markers_db_files = multithreading_map(
        download_reference,
        [f"s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.fa{ext}.lz4"
         for ext in ["", ".bwt", ".header", ".sa", ".sequence"]]
        + ["s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4"])

    db = UHGG()
    species_info = db.species

    marker_info = read_marker_info_repgenomes(markers_db_files[-1])

    with TimedSection("aligning reads to marker-genes database"):
        m8_file = map_reads_hsblast(tempdir, args.r1, args.r2, args.word_size, markers_db_files[0], args.max_reads)

    with InputStream(params.inputs.marker_genes_hmm_cutoffs) as cutoff_params:
        marker_cutoffs = dict(select_from_tsv(cutoff_params, selected_columns={"marker_id": str, "marker_cutoff": float}))

    with TimedSection("classifying reads"):
        best_hits = find_best_hits(args, marker_info, m8_file, marker_cutoffs)
        unique_alns = assign_unique(best_hits, species_info, marker_info)
        species_alns = assign_non_unique(best_hits, unique_alns, marker_info)

    with TimedSection("estimating species abundance"):
        total_gene_length = sum_marker_gene_lengths(marker_info)
        species_abundance = normalize_counts(species_alns, total_gene_length)

    write_abundance(args.outdir, species_abundance)
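
The marker_cutoffs lookup above is built by select_from_tsv, which evidently yields (marker_id, cutoff) pairs restricted to the selected columns, with values coerced to the declared types. A rough standard-library equivalent, ignoring the lz4 decompression that InputStream presumably handles:

import csv

def load_marker_cutoffs(path):
    # Approximation of the select_from_tsv call: read a headered TSV
    # and map marker_id -> float cutoff.
    with open(path) as stream:
        reader = csv.DictReader(stream, delimiter="\t")
        return {row["marker_id"]: float(row["marker_cutoff"])
                for row in reader}
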
def build_marker_genes_master(args):

    # Fetch table of contents and marker genes HMM model from s3.
    # This will be read separately by each species build subcommand, so we make a local copy.
    local_toc, marker_genes_hmm = multithreading_map(download_reference, [outputs.genomes, inputs.marker_genes_hmm])

    db = UHGG(local_toc)
    species_for_genome = db.genomes

    def genome_work(genome_id):
        assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
        species_id = species_for_genome[genome_id]

        dest_file = destpath(genome_id, species_id, lastoutput(genome_id))
        msg = f"Running HMMsearch for genome {genome_id} from species {species_id}."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(f"Destination {dest_file} for genome {genome_id} already exists.  Specify --force to overwrite.")
                return
            msg = msg.replace("Running", "Rerunning")

        tsprint(msg)
        slave_log = "build_marker_genes.log"
        slave_subdir = f"{species_id}__{genome_id}"
        if not args.debug:
            command(f"rm -rf {slave_subdir}")
        if not os.path.isdir(slave_subdir):
            command(f"mkdir {slave_subdir}")

        # Recursive call via subcommand.  Use subdir, redirect logs.
        slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_marker_genes --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} --zzz_slave_marker_genes_hmm {os.path.abspath(marker_genes_hmm)} {'--debug' if args.debug else ''} &>> {slave_log}"
        with open(f"{slave_subdir}/{slave_log}", "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")
        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere with any
            # prior exceptions that may be more informative.  Hence check=False.
            upload(f"{slave_subdir}/{slave_log}", destpath(genome_id, species_id, slave_log), check=False)
            if not args.debug:
                command(f"rm -rf {slave_subdir}", check=False)

    genome_id_list = decode_genomes_arg(args, species_for_genome)
    multithreading_map(genome_work, genome_id_list, num_threads=CONCURRENT_MARKER_GENES_IDENTIFY)
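
find_files_with_retry is the masters' idempotency probe: because each slave uploads its sentinel file only after everything else has succeeded, the presence of that file in S3 means the work can be skipped unless --force is given. A minimal sketch, assuming it wraps `aws s3 ls` with simple backoff (hypothetical details):

import subprocess
import time

def find_files_with_retry(s3_path, attempts=3):
    # Hypothetical sketch: probe S3 for the destination, retrying with
    # exponential backoff; "aws s3 ls" prints matches on stdout and
    # exits nonzero when nothing is found.
    for attempt in range(attempts):
        result = subprocess.run(["aws", "s3", "ls", s3_path],
                                capture_output=True, text=True)
        if result.returncode == 0:
            return bool(result.stdout.strip())
        time.sleep(2 ** attempt)
    return False
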
Example #7
def build_pangenome_slave(args):
    """
    Input spec:  https://github.com/czbiohub/iggtools/wiki#gene-annotations
    Output spec: https://github.com/czbiohub/iggtools/wiki#pan-genomes
    """

    violation = "Please do not call build_pangenome_slave directly.  Violation"
    assert args.zzz_slave_mode, f"{violation}:  Missing --zzz_slave_mode arg."
    assert os.path.isfile(
        args.zzz_slave_toc
    ), f"{violation}: File does not exist: {args.zzz_slave_toc}"
    assert os.path.basename(
        os.getcwd()
    ) == args.species, f"{violation}: {os.path.basename(os.getcwd())} != {args.species}"

    db = UHGG(args.zzz_slave_toc)
    species = db.species
    species_id = args.species

    assert species_id in species, f"{violation}: Species {species_id} is not in the database."

    species_genomes = species[species_id]
    species_genomes_ids = species_genomes.keys()

    def destpath(src):
        return pangenome_file(species_id, src + ".lz4")

    command(f"aws s3 rm --recursive {pangenome_file(species_id, '')}")

    cleaned = multiprocessing_map(clean_genes,
                                  ((species_id, genome_id)
                                   for genome_id in species_genomes_ids))

    command("rm -f genes.ffn genes.len")

    for temp_files in split(cleaned, 20):  # keep "cat" commands short
        ffn_files, len_files = transpose(temp_files)
        command("cat " + " ".join(ffn_files) + " >> genes.ffn")
        command("cat " + " ".join(len_files) + " >> genes.len")

    # The initial clustering to max_percent takes longest.
    max_percent, lower_percents = CLUSTERING_PERCENTS[0], CLUSTERING_PERCENTS[1:]
    cluster_files = {max_percent: vsearch(max_percent, "genes.ffn")}

    # Reclustering of the max_percent centroids is usually quick, and can proceed in parallel.
    recluster = lambda percent_id: vsearch(percent_id, cluster_files[max_percent][0])
    cluster_files.update(multithreading_hashmap(recluster, lower_percents))

    xref(cluster_files, "gene_info.txt")

    # Create list of (source, dest) pairs for uploading.
    # Note that centroids.{max_percent}.ffn is uploaded to 2 different destinations.
    upload_tasks = [
        ("genes.ffn", destpath("genes.ffn")),
        ("genes.len", destpath("genes.len")),
        (f"centroids.{max_percent}.ffn", destpath("centroids.ffn")
         )  # no percent in dest, per spec
    ]
    for src in flatten(cluster_files.values()):
        upload_tasks.append((src, destpath("temp/" + src)))

    # Upload in parallel.
    multithreading_map(upload_star, upload_tasks)

    # Leave this upload for last, so the presence of this file in s3 would indicate the entire species build has succeeded.
    upload("gene_info.txt", destpath("gene_info.txt"))
def collate_repgenome_markers(args):

    db = UHGG()
    species = db.species
    representatives = db.representatives

    collate_log = "collate_repgenome_markers.log"
    collate_subdir = f"collate_repgenome_markers"

    dest_file = destpath(collate_log)
    msg = f"Collating marker genes sequences."
    if find_files_with_retry(dest_file):
        if not args.force:
            tsprint(
                f"Destination {dest_file} already exists.  Specify --force to overwrite."
            )
            return
        msg = msg.replace(msg.split(" ")[0], "Re-" + msg.split(" ")[0])

    tsprint(msg)
    if not args.debug:
        command(f"rm -rf {collate_subdir}")
    if not os.path.isdir(collate_subdir):
        command(f"mkdir {collate_subdir}")
    with open(f"{collate_subdir}/{collate_log}", "w") as slog:
        slog.write(msg + "\n")

    # Download
    download_seq_tasks = []
    download_map_tasks = []
    for species_id in species.keys():
        rep_id = representatives[species_id]
        remote_path_seq = input_marker_genes_file(rep_id, species_id,
                                                  f"{rep_id}.markers.fa.lz4")
        remote_path_map = input_marker_genes_file(rep_id, species_id,
                                                  f"{rep_id}.markers.map.lz4")
        download_seq_tasks.append((remote_path_seq, collate_subdir))
        download_map_tasks.append((remote_path_map, collate_subdir))
    downloaded_marker_seqs = multithreading_map(
        download_reference,
        download_seq_tasks,
        num_threads=CONCURRENT_MARKER_GENES_DOWNLOAD)
    downloaded_marker_maps = multithreading_map(
        download_reference,
        download_map_tasks,
        num_threads=CONCURRENT_MARKER_GENES_DOWNLOAD)

    ## Collate
    collated_rep_marker_seqs = output_all_rep_marker_genes("fa")
    collated_genes = os.path.basename(collated_rep_marker_seqs)
    for marker_fa_files in split(downloaded_marker_seqs, 20):
        command("cat " + " ".join(marker_fa_files) +
                f" >> {collate_subdir}/{collated_genes}")

    collated_rep_marker_maps = output_all_rep_marker_genes("map")
    collated_maps = os.path.basename(collated_rep_marker_maps)
    for marker_map_files in split(downloaded_marker_maps, 20):
        command("cat " + " ".join(marker_map_files) +
                f" >> {collate_subdir}/{collated_maps}")

    ## Index
    cmd_index = f"cd {collate_subdir}; hs-blastn index {collated_genes} &>> {collate_log}"
    with open(f"{collate_subdir}/{collate_log}", "a") as slog:
        slog.write(cmd_index + "\n")
    command(cmd_index)
    index_suffix = ["fa", "map", "fa.bwt", "fa.header", "fa.sa", "fa.sequence"]
    output_files = [
        f"{collate_subdir}/{inputs.marker_set}.{isuffix}"
        for isuffix in index_suffix
    ]

    ## Upload
    upload_tasks = []
    for o in output_files:
        upload_tasks.append((o, destpath(os.path.basename(o))))
    multithreading_map(upload_star, upload_tasks)

    # Upload the log file last.
    upload(f"{collate_subdir}/{collate_log}",
           destpath(collate_log),
           check=False)

    ## Clean up
    if not args.debug:
        command(f"rm -rf {collate_subdir}", check=False)