Code Example #1
def import_uhgg_slave(args):
    """
    https://github.com/czbiohub/iggtools/wiki
    """

    violation = "Please do not call build_pangenome_slave directly.  Violation"
    assert args.zzz_slave_mode, f"{violation}:  Missing --zzz_slave_mode arg."
    assert os.path.isfile(
        args.zzz_slave_toc
    ), f"{violation}: File does not exist: {args.zzz_slave_toc}"

    db = UHGG(args.zzz_slave_toc)
    representatives = db.representatives
    species_for_genome = db.genomes

    genome_id = args.genomes
    species_id = species_for_genome[genome_id]
    representative_id = representatives[species_id]

    dest = imported_genome_file(genome_id, species_id, f"{genome_id}.fna.lz4")
    command(
        f"aws s3 rm --recursive {imported_genome_file(genome_id, species_id, '')}"
    )
    cleaned = clean_genome(genome_id, representative_id)
    upload(cleaned, dest)
Code Example #2
def identify_marker_genes(genome_id, species_id, marker_genes_hmm):

    command(f"aws s3 rm --recursive {output_marker_genes_file(genome_id, species_id, '')}")

    hmmsearch_file = hmmsearch(genome_id, species_id, marker_genes_hmm, num_threads=1)

    annotated_genes_s3_path = input_annotations_file(genome_id, species_id, f"{genome_id}.ffn.lz4")
    genes = fetch_genes(annotated_genes_s3_path)

    # Parse local hmmsearch file
    hmmsearch_seq = f"{genome_id}.markers.fa"
    hmmsearch_map = f"{genome_id}.markers.map"

    with open(hmmsearch_seq, "w") as o_seq, open(hmmsearch_map, "w") as o_map:
        for rec in find_hits(hmmsearch_file):
            marker_gene = genes[rec["query"]].upper()
            marker_info = [species_id, genome_id, rec["query"], len(marker_gene), rec["target"]]
            o_map.write('\t'.join(str(mi) for mi in marker_info) + '\n')
            o_seq.write('>%s\n%s\n' % (rec['query'], marker_gene))

    output_files = [hmmsearch_file, hmmsearch_seq, hmmsearch_map]
    # Make hmmsearch_map the last output, because its presence indicates that all other files were written successfully.
    assert output_files[-1] == lastoutput(genome_id)

    return output_files
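
The parsing loop above relies on a find_hits helper that is not shown in this snippet. Purely as an illustration, a minimal domtblout parser might look like the sketch below; the column mapping (the snippet appears to use "query" for the gene sequence and "target" for the marker HMM, which is the reverse of HMMER's own naming) is an assumption, and find_hits_sketch is a hypothetical name, not the project's implementation.

def find_hits_sketch(hmmsearch_file):
    """Hypothetical sketch: yield one record per data row of a hmmsearch --domtblout file."""
    with open(hmmsearch_file) as f:
        for line in f:
            if line.startswith("#"):
                continue  # skip comment/header lines
            cols = line.split()
            # domtblout column 0 is the matched sequence (the gene), column 3 is the HMM name;
            # map them to the field names the snippet above expects.
            yield {"query": cols[0], "target": cols[3]}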
Code Example #3
def midas_run_species(args):

    tempdir = f"{args.outdir}/species/temp/"

    command(f"rm -rf {tempdir}")
    command(f"mkdir -p {tempdir}")

    markers_db_files = multithreading_map(
        download_reference,
        [f"s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.fa{ext}.lz4"
         for ext in ["", ".bwt", ".header", ".sa", ".sequence"]]
        + ["s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4"])

    db = UHGG()
    species_info = db.species

    marker_info = read_marker_info_repgenomes(markers_db_files[-1])

    with TimedSection("aligning reads to marker-genes database"):
        m8_file = map_reads_hsblast(tempdir, args.r1, args.r2, args.word_size, markers_db_files[0], args.max_reads)

    with InputStream(params.inputs.marker_genes_hmm_cutoffs) as cutoff_params:
        marker_cutoffs = dict(select_from_tsv(cutoff_params, selected_columns={"marker_id": str, "marker_cutoff": float}))

    with TimedSection("classifying reads"):
        best_hits = find_best_hits(args, marker_info, m8_file, marker_cutoffs)
        unique_alns = assign_unique(best_hits, species_info, marker_info)
        species_alns = assign_non_unique(best_hits, unique_alns, marker_info)

    with TimedSection("estimating species abundance"):
        total_gene_length = sum_marker_gene_lengths(marker_info)
        species_abundance = normalize_counts(species_alns, total_gene_length)

    write_abundance(args.outdir, species_abundance)
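
The final abundance step is not expanded in this example. As a hedged sketch only (the real normalize_counts may differ), a common way to turn per-species marker alignments into relative abundances is to divide each species' aligned bases by its total marker gene length and then normalize across species; the field names and the assumption that total_gene_length is a per-species mapping are illustrative.

def normalize_counts_sketch(species_alns, total_gene_length):
    """Hypothetical sketch: coverage = aligned bp / total marker length, then relative abundance."""
    coverage = {}
    for species_id, alns in species_alns.items():
        aligned_bp = sum(aln["aln"] for aln in alns)  # assumed structure: list of dicts with aligned length
        coverage[species_id] = aligned_bp / total_gene_length[species_id]
    total = sum(coverage.values()) or 1.0
    return {sp: {"coverage": cov, "rel_abundance": cov / total}
            for sp, cov in coverage.items()}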
Code Example #4
def init_nvme(args):
    # TODO:  Generalize the magic numbers 838 and 1715518 (those are for AWS instance type r5.12xlarge).  # pylint: disable=fixme
    # https://github.com/czbiohub/iggtools/issues/17
    if nvme_size_str() != '1715518':
        # Raid, format, and mount the NVME drives attached to this instance.
        tsprint("Initializing instance NVME storage.")
        try:
            command(
                """set -o pipefail; lsblk | grep 838 | awk '{print "/dev/"$1}' | xargs -n 10 s3mi raid nvme"""
            )
        except Exception as e:
            try:
                # Sometimes the drives were already formatted in a prior incarnation, but the mountpoint
                # is not visible inside the container to tell us so.  In those cases we can just try to mount it.
                command("""mount /dev/md0 /mnt/nvme""")
            except:
                raise e
        assert nvme_size_str() == '1715518', "Failed to initialize and mount instance NVME storage."
    else:
        tsprint("Instance NVME storage previously initialized.")
        if args.force:
            tsprint(
                "Ignoring --force argument.  It is usually unnecessary to reinitialize AWS instance storage."
            )
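
nvme_size_str() is not shown here. One plausible, purely illustrative implementation returns the size column that df reports for the RAID mountpoint, which is what the '1715518' comparison above suggests; the helper name, the mountpoint, and the use of df are all assumptions.

import subprocess

def nvme_size_str_sketch(mountpoint="/mnt/nvme"):
    """Hypothetical sketch: report the 1M-block size of the NVME mountpoint, or '' if unavailable."""
    try:
        out = subprocess.check_output(["df", "-m", mountpoint], text=True)
        return out.splitlines()[1].split()[1]  # second column of the data row: total size in MB
    except subprocess.CalledProcessError:
        return ""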
Code Example #5
File: annotate_genes.py  Project: bsmith89/iggtools
def annotate_genes_slave(args):
    """
    https://github.com/czbiohub/iggtools/wiki
    """

    violation = "Please do not call build_pangenome_slave directly.  Violation"
    assert args.zzz_slave_mode, f"{violation}:  Missing --zzz_slave_mode arg."
    assert os.path.isfile(
        args.zzz_slave_toc
    ), f"{violation}: File does not exist: {args.zzz_slave_toc}"

    db = UHGG(args.zzz_slave_toc)
    species_for_genome = db.genomes

    genome_id = args.genomes
    species_id = species_for_genome[genome_id]

    last_output = f"{genome_id}.fna.lz4"
    dest_file = annotations_file(genome_id, species_id, last_output)
    command(f"aws s3 rm --recursive {dest_file.rsplit('/', 1)[0]}")
    output_files = annotate_genome(genome_id, species_id)
    upload_tasks = []
    for o in output_files:
        olz = o + ".lz4"
        if olz != last_output:
            upload_tasks.append((o, annotations_file(genome_id, species_id,
                                                     olz)))

    multithreading_map(upload_star, upload_tasks)

    # Upload this last because it indicates all other work has succeeded.
    upload(drop_lz4(last_output),
           annotations_file(genome_id, species_id, last_output))
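
upload_star, used here and in later examples, is paired with multithreading_map, which passes each task as a single element. A minimal sketch of such a wrapper (assuming upload(src, dest) is the underlying call; the _sketch name is hypothetical) simply unpacks the tuple:

def upload_star_sketch(task):
    """Hypothetical sketch: unpack a (local_path, s3_path) tuple for multithreading_map."""
    local_path, s3_path = task
    return upload(local_path, s3_path)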
Code Example #6
File: build_pangenome.py  Project: bsmith89/iggtools
def build_pangenome_master(args):

    # Fetch table of contents from s3.
    # This will be read separately by each species build subcommand, so we make a local copy.
    local_toc = os.path.basename(outputs.genomes)
    command(f"rm -f {local_toc}")
    command(f"aws s3 cp --only-show-errors {outputs.genomes} {local_toc}")

    db = UHGG(local_toc)
    species = db.species

    def species_work(species_id):
        assert species_id in species, f"Species {species_id} is not in the database."
        species_genomes = species[species_id]

        def destpath(src):
            return pangenome_file(species_id, src + ".lz4")

        # The species build will upload this file last, after everything else is successfully uploaded.
        # Therefore, if this file exists in s3, there is no need to redo the species build.
        dest_file = destpath("gene_info.txt")
        msg = f"Building pangenome for species {species_id} with {len(species_genomes)} total genomes."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for species {species_id} pangenome already exists.  Specify --force to overwrite."
                )
                return
            msg = msg.replace("Building", "Rebuilding")

        with CONCURRENT_SPECIES_BUILDS:
            tsprint(msg)
            slave_log = "pangenome_build.log"
            slave_subdir = str(species_id)
            if not args.debug:
                command(f"rm -rf {slave_subdir}")
            if not os.path.isdir(slave_subdir):
                command(f"mkdir {slave_subdir}")
            # Recursive call via subcommand.  Use subdir, redirect logs.
            slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_pangenome -s {species_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {'--debug' if args.debug else ''} &>> {slave_log}"
            with open(f"{slave_subdir}/{slave_log}", "w") as slog:
                slog.write(msg + "\n")
                slog.write(slave_cmd + "\n")
            try:
                command(slave_cmd)
            finally:
                # Cleanup should not raise exceptions of its own, so as not to interfere with any
                # prior exceptions that may be more informative.  Hence check=False.
                upload(f"{slave_subdir}/{slave_log}",
                       destpath(slave_log),
                       check=False)
                if not args.debug:
                    command(f"rm -rf {slave_subdir}", check=False)

    # Check for destination presence in s3 with up to 10-way concurrency.
    # If destination is absent, commence build with up to 3-way concurrency as constrained by CONCURRENT_SPECIES_BUILDS.
    species_id_list = decode_species_arg(args, species)
    multithreading_map(species_work, species_id_list, num_threads=10)
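
Here the 10-way multithreading_map only probes s3 for existing outputs, while the expensive builds are throttled by CONCURRENT_SPECIES_BUILDS. A semaphore used as a context manager would behave exactly as the `with` block above requires; whether the project defines it this way is an assumption.

import threading

# Hypothetical definition: allow at most 3 species builds to run concurrently.
CONCURRENT_SPECIES_BUILDS = threading.BoundedSemaphore(3)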
Code Example #7
def midas_run_snps(args):

    tempdir = f"{args.outdir}/snps/temp_sc{args.species_cov}"
    if args.debug and os.path.exists(tempdir):
        tsprint(
            f"INFO:  Reusing existing temp data in {tempdir} according to --debug flag."
        )
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    outputdir = f"{args.outdir}/snps/output_sc{args.species_cov}"
    if not os.path.exists(outputdir):
        command(f"mkdir -p {outputdir}")

    try:
        # The full species profile must exist -- it is output by run_midas_species.
        # Restrict to species above requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile,
                                         args.species_cov)

        local_toc = download_reference(outputs.genomes)
        db = UHGG(local_toc)
        representatives = db.representatives

        def download_contigs(species_id):
            return download_reference(
                imported_genome_file(representatives[species_id], species_id,
                                     "fna.lz4"), f"{tempdir}/{species_id}")

        # Download repgenome_id.fna for every species in the restricted species profile.
        contigs_files = multithreading_hashmap(download_contigs,
                                               species_profile.keys(),
                                               num_threads=20)

        # Use Bowtie2 to map reads to representative genomes
        bt2_db_name = "repgenomes"
        build_bowtie2_db(tempdir, bt2_db_name, contigs_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=True)

        # Use mpileup to identify SNPs
        samtools_index(args, tempdir, bt2_db_name)
        species_pileup_stats = pysam_pileup(args, list(species_profile.keys()),
                                            tempdir, outputdir, contigs_files)

        write_snps_summary(
            species_pileup_stats,
            f"{args.outdir}/snps/output_sc{args.species_cov}/summary.txt")

    except:
        if not args.debug:
            tsprint(
                "Deleting untrustworthy outputs due to error. Specify --debug flag to keep."
            )
            command(f"rm -rf {tempdir}", check=False)
            command(f"rm -rf {outputdir}", check=False)
Code Example #8
def import_uhgg_master(args):

    # Fetch table of contents from s3.
    # This will be read separately by each species build subcommand, so we make a local copy.
    local_toc = os.path.basename(outputs.genomes)
    command(f"rm -f {local_toc}")
    command(f"aws s3 cp --only-show-errors {outputs.genomes} {local_toc}")

    db = UHGG(local_toc)
    species_for_genome = db.genomes

    def genome_work(genome_id):
        assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
        species_id = species_for_genome[genome_id]

        dest_file = imported_genome_file(genome_id, species_id,
                                         f"{genome_id}.fna.lz4")
        msg = f"Importing genome {genome_id} from species {species_id}."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for genome {genome_id} already exists.  Specify --force to overwrite."
                )
                return
            msg = msg.replace("Importing", "Reimporting")

        tsprint(msg)
        slave_log = "import_uhgg.log"
        slave_subdir = f"{species_id}__{genome_id}"
        if not args.debug:
            command(f"rm -rf {slave_subdir}")
        if not os.path.isdir(slave_subdir):
            command(f"mkdir {slave_subdir}")
        # Recursive call via subcommand.  Use subdir, redirect logs.
        slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools import_uhgg --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {'--debug' if args.debug else ''} &>> {slave_log}"
        with open(f"{slave_subdir}/{slave_log}", "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")
        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere with any
            # prior exceptions that may be more informative.  Hence check=False.
            upload(f"{slave_subdir}/{slave_log}",
                   imported_genome_file(genome_id, species_id,
                                        slave_log + ".lz4"),
                   check=False)
            if not args.debug:
                command(f"rm -rf {slave_subdir}", check=False)

    genome_id_list = decode_genomes_arg(args, species_for_genome)
    multithreading_map(genome_work,
                       genome_id_list,
                       num_threads=CONCURRENT_GENOME_IMPORTS)
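
decode_genomes_arg is not shown; it turns the --genomes argument into the list of genome ids to process. A purely illustrative sketch, assuming the argument is either "all" or a comma-separated list (the real parser may support richer syntax):

def decode_genomes_arg_sketch(args, species_for_genome):
    """Hypothetical sketch: expand --genomes into a validated list of genome ids."""
    if args.genomes == "all":
        return sorted(species_for_genome.keys())
    requested = [g.strip() for g in args.genomes.split(",") if g.strip()]
    for g in requested:
        assert g in species_for_genome, f"Genome {g} is not in the database."
    return requested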
Code Example #9
    def genome_work(genome_id):
        assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
        species_id = species_for_genome[genome_id]

        dest_file = destpath(genome_id, species_id, lastoutput(genome_id))
        msg = f"Running HMMsearch for genome {genome_id} from species {species_id}."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(f"Destination {dest_file} for genome {genome_id} already exists.  Specify --force to overwrite.")
                return
            msg = msg.replace("Running", "Rerunning")

        tsprint(msg)
        slave_log = "build_marker_genes.log"
        slave_subdir = f"{species_id}__{genome_id}"
        if not args.debug:
            command(f"rm -rf {slave_subdir}")
        if not os.path.isdir(slave_subdir):
            command(f"mkdir {slave_subdir}")

        # Recursive call via subcommand.  Use subdir, redirect logs.
        slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_marker_genes --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} --zzz_slave_marker_genes_hmm {os.path.abspath(marker_genes_hmm)} {'--debug' if args.debug else ''} &>> {slave_log}"
        with open(f"{slave_subdir}/{slave_log}", "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")
        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere with any
            # prior exceptions that may be more informative.  Hence check=False.
            upload(f"{slave_subdir}/{slave_log}", destpath(genome_id, species_id, slave_log), check=False)
            if not args.debug:
                command(f"rm -rf {slave_subdir}", check=False)
Code Example #10
File: bowtie2.py  Project: bsmith89/iggtools
def samtools_index(args, bt2_db_dir, bt2_db_name):
    if args.debug and os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.bam.bai"):
        tsprint(
            f"Skipping samtools index in debug mode as temporary data exists: {bt2_db_dir}/{bt2_db_name}.bam"
        )
        return

    try:
        command(
            f"samtools index -@ {num_physical_cores} {bt2_db_dir}/{bt2_db_name}.bam"
        )
    except:
        command(f"rm -f {bt2_db_dir}/{bt2_db_name}.bam.bai")
        raise
Code Example #11
def hmmsearch(genome_id, species_id, marker_genes_hmm, num_threads=1):
    # Input
    annotated_genes_s3_path = input_annotations_file(genome_id, species_id, f"{genome_id}.faa.lz4")
    annotated_genes = download_reference(annotated_genes_s3_path)

    # Output
    hmmsearch_file = f"{genome_id}.hmmsearch"

    # Command
    if find_files(hmmsearch_file):
        # This only happens in debug mode, where we can reuse a pre-existing file.
        tsprint(f"Found hmmsearch results for genome {genome_id} from prior run.")
    else:
        try:
            command(f"hmmsearch --noali --cpu {num_threads} --domtblout {hmmsearch_file} {marker_genes_hmm} {annotated_genes}")
        except:
            # Do not keep bogus zero-length files;  those are harmful if we rerun in place.
            command(f"mv {hmmsearch_file} {hmmsearch_file}.bogus", check=False)
            raise

    return hmmsearch_file
Code Example #12
File: midas_run_genes.py  Project: bsmith89/iggtools
def write_results(outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage):
    if not os.path.exists(f"{outdir}/genes/output"):
        command(f"mkdir -p {outdir}/genes/output")
    # open outfiles for each species_id
    header = ['gene_id', 'count_reads', 'coverage', 'copy_number']
    for species_id, species_genes in species.items():
        path = f"{outdir}/genes/output/{species_id}.genes.lz4"
        with OutputStream(path) as sp_out:
            sp_out.write('\t'.join(header) + '\n')
            for gene_id, gene in species_genes.items():
                if gene["depth"] == 0:
                    # Sparse by default here.  You can get the pangenome_size from the summary file, emitted below.
                    continue
                values = [gene_id, str(gene["mapped_reads"]), format(gene["depth"], DECIMALS), format(gene["copies"], DECIMALS)]
                sp_out.write('\t'.join(values) + '\n')
    # summary stats
    header = ['species_id', 'pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage', 'aligned_reads', 'mapped_reads']
    path = f"{outdir}/genes/summary.txt"
    with OutputStream(path) as file:
        file.write('\t'.join(header) + '\n')
        for species_id, species_genes in species.items():
            # No sparsity here -- should be extremely rare for a species row to be all 0.
            aligned_reads = sum(g["aligned_reads"] for g in species_genes.values())
            mapped_reads = sum(g["mapped_reads"] for g in species_genes.values())
            pangenome_size = len(species_genes)
            values = [
                species_id,
                str(pangenome_size),
                str(num_covered_genes[species_id]),
                format(num_covered_genes[species_id] / pangenome_size, DECIMALS),
                format(species_mean_coverage[species_id], DECIMALS),
                format(species_markers_coverage[species_id], DECIMALS),
                str(aligned_reads),
                str(mapped_reads)
            ]
            file.write('\t'.join(values) + '\n')
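
The .lz4 output path implies that OutputStream compresses on the fly. A rough sketch of such a context manager, assuming it shells out to lz4 much like the aws-copy helpers elsewhere in these examples; the name and details are hypothetical, not the project's actual class.

import subprocess
from contextlib import contextmanager

@contextmanager
def output_stream_sketch(path):
    """Hypothetical sketch: stream text through `lz4 -c` into path when it ends with .lz4."""
    if path.endswith(".lz4"):
        with open(path, "wb") as dest:
            proc = subprocess.Popen(["lz4", "-c"], stdin=subprocess.PIPE, stdout=dest, text=True)
            try:
                yield proc.stdin
            finally:
                proc.stdin.close()
                assert proc.wait() == 0, f"lz4 compression failed for {path}"
    else:
        with open(path, "w") as f:
            yield f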
Code Example #13
File: build_pangenome.py  Project: bsmith89/iggtools
    def species_work(species_id):
        assert species_id in species, f"Species {species_id} is not in the database."
        species_genomes = species[species_id]

        def destpath(src):
            return pangenome_file(species_id, src + ".lz4")

        # The species build will upload this file last, after everything else is successfully uploaded.
        # Therefore, if this file exists in s3, there is no need to redo the species build.
        dest_file = destpath("gene_info.txt")
        msg = f"Building pangenome for species {species_id} with {len(species_genomes)} total genomes."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for species {species_id} pangenome already exists.  Specify --force to overwrite."
                )
                return
            msg = msg.replace("Building", "Rebuilding")

        with CONCURRENT_SPECIES_BUILDS:
            tsprint(msg)
            slave_log = "pangenome_build.log"
            slave_subdir = str(species_id)
            if not args.debug:
                command(f"rm -rf {slave_subdir}")
            if not os.path.isdir(slave_subdir):
                command(f"mkdir {slave_subdir}")
            # Recursive call via subcommand.  Use subdir, redirect logs.
            slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_pangenome -s {species_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {'--debug' if args.debug else ''} &>> {slave_log}"
            with open(f"{slave_subdir}/{slave_log}", "w") as slog:
                slog.write(msg + "\n")
                slog.write(slave_cmd + "\n")
            try:
                command(slave_cmd)
            finally:
                # Cleanup should not raise exceptions of its own, so as not to interfere with any
                # prior exceptions that may be more informative.  Hence check=False.
                upload(f"{slave_subdir}/{slave_log}",
                       destpath(slave_log),
                       check=False)
                if not args.debug:
                    command(f"rm -rf {slave_subdir}", check=False)
Code Example #14
File: bowtie2.py  Project: bsmith89/iggtools
def build_bowtie2_db(bt2_db_dir, bt2_db_name, downloaded_files):
    """
    Build Bowtie2 database of representative genomes or centroid genes
    for the species present in the sample, e.g. repgenomes OR pangenomes
    """
    bt2_db_suffixes = [
        "1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"
    ]
    if all(
            os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.{ext}")
            for ext in bt2_db_suffixes):
        tsprint("Skipping bowtie2-build as database files appear to exist.")
        return
    command(f"rm -f {bt2_db_dir}/{bt2_db_name}.fa")
    command(f"touch {bt2_db_dir}/{bt2_db_name}.fa")

    for files in split(downloaded_files.values(),
                       20):  # keep "cat" commands short
        command("cat " + " ".join(files) +
                f" >> {bt2_db_dir}/{bt2_db_name}.fa")

    command(
        f"bowtie2-build --threads {num_physical_cores} {bt2_db_dir}/{bt2_db_name}.fa {bt2_db_dir}/{bt2_db_name} > {bt2_db_dir}/bowtie2-build.log"
    )
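
The split helper keeps each `cat` invocation down to 20 files. A minimal chunking sketch (the name and exact behavior are assumptions):

def split_sketch(items, chunk_size):
    """Hypothetical sketch: yield successive chunks of at most chunk_size items."""
    items = list(items)
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]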
Code Example #15
File: build_pangenome.py  Project: bsmith89/iggtools
def vsearch(percent_id, genes, num_threads=num_vcpu):
    centroids = f"centroids.{percent_id}.ffn"
    uclust = f"uclust.{percent_id}.txt"
    # log = f"uclust.{percent_id}.log"
    if find_files(centroids) and find_files(uclust):
        tsprint(
            f"Found vsearch results at percent identity {percent_id} from prior run."
        )
    else:
        try:
            command(
                f"vsearch --quiet --cluster_fast {genes} --id {percent_id/100.0} --threads {num_threads} --centroids {centroids} --uc {uclust}"
            )
        except:
            # Do not keep bogus zero-length files;  those are harmful if we rerun in place.
            command(f"mv {centroids} {centroids}.bogus", check=False)
            command(f"mv {uclust} {uclust}.bogus", check=False)
            raise
    return centroids, uclust  #, log
Code Example #16
File: bowtie2.py  Project: bsmith89/iggtools
def bowtie2_align(args, bt2_db_dir, bt2_db_name, sort_aln=False):
    """
    Use Bowtie2 to map reads to specified representative genomes or
    collections of centroid genes for the pangenome flow.
    """

    if args.debug and os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.bam"):
        tsprint(
            f"Skipping Bowtie2 alignment in debug mode as temporary data exists: {bt2_db_dir}/{bt2_db_name}.bam"
        )
        return

    # Construct bowtie2 align input arguments
    max_reads = f"-u {args.max_reads}" if args.max_reads else ""
    aln_mode = "local" if args.aln_mode == "local" else "end-to-end"
    aln_speed = args.aln_speed if aln_mode == "end-to-end" else args.aln_speed + "-local"
    r2 = ""
    if args.r2:
        r1 = f"-1 {args.r1}"
        r2 = f"-2 {args.r2}"
    elif args.aln_interleaved:
        r1 = f"--interleaved {args.r1}"
    else:
        r1 = f"-U {args.r1}"

    try:
        bt2_command = f"bowtie2 --no-unal -x {bt2_db_dir}/{bt2_db_name} {max_reads} --{aln_mode} --{aln_speed} --threads {num_physical_cores} -q {r1} {r2}"
        if sort_aln:
            command(f"set -o pipefail; {bt2_command} | \
                    samtools view --threads {num_physical_cores} -b - | \
                    samtools sort --threads {num_physical_cores} -o {bt2_db_dir}/{bt2_db_name}.bam"
                    )
        else:
            command(f"set -o pipefail; {bt2_command} | \
                    samtools view --threads {num_physical_cores} -b - > {bt2_db_dir}/{bt2_db_name}.bam"
                    )
    except:
        tsprint(
            f"Bowtie2 alignment to {bt2_db_dir}/{bt2_db_name}.bam ran into an error")
        command(f"rm -f {bt2_db_dir}/{bt2_db_name}.bam")
        raise
Code Example #17
File: midas_run_genes.py  Project: bsmith89/iggtools
def midas_run_genes(args):

    tempdir = f"{args.outdir}/genes/temp_sc{args.species_cov}"

    if args.debug and os.path.exists(tempdir):
        tsprint(f"INFO:  Reusing existing temp data in {tempdir} according to --debug flag.")
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    try:
        # The full species profile must exist -- it is output by run_midas_species.
        # Restrict to species above requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile, args.species_cov)

        def download_centroid(species_id):
            return download_reference(pangenome_file(species_id, "centroids.ffn.lz4"), f"{tempdir}/{species_id}")  # TODO colocate samples to overlap reference downloads

        # Download centroids.ffn for every species in the restricted species profile.
        centroids_files = multithreading_hashmap(download_centroid, species_profile.keys(), num_threads=20)

        # Perhaps avoid this giant conglomerated file, fetching instead submaps for each species.
        # Also colocate/cache/download in master for multiple slave subcommand invocations.
        bt2_db_name = "pangenomes"
        build_bowtie2_db(tempdir, bt2_db_name, centroids_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=False)

        # Compute coverage of pangenome for each present species and write results to disk
        marker_genes_map = "s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4"
        species, genes = scan_centroids(centroids_files)
        num_covered_genes, species_mean_coverage, covered_genes = count_mapped_bp(args, tempdir, genes)
        markers = scan_markers(genes, marker_genes_map)
        species_markers_coverage = normalize(genes, covered_genes, markers)

        write_results(args.outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage)
    except:
        if not args.debug:
            tsprint("Deleting untrustworthy outputs due to error.  Specify --debug flag to keep.")
            command(f"rm -rf {tempdir}", check=False)
Code Example #18
File: annotate_genes.py  Project: bsmith89/iggtools
def annotate_genome(genome_id, species_id):
    # Prokka will crash if installed <6 months ago.  It's a feature.  See tbl2asn.
    cleaned_genome = imported_genome_file(genome_id, species_id,
                                          f"{genome_id}.fna.lz4")
    ugid = unified_genome_id(genome_id)

    download_genome(genome_id, cleaned_genome)

    subdir = "prokka_dir"
    command(f"rm -rf {subdir}")

    output_files = [
        f"{genome_id}.faa", f"{genome_id}.ffn", f"{genome_id}.fna",
        f"{genome_id}.gff", f"{genome_id}.tsv"
    ]
    command(
        f"prokka --kingdom Bacteria --outdir {subdir} --cpus 8 --prefix {genome_id} --locustag {ugid} --compliant {genome_id}.fasta"
    )
    for o in output_files:
        command(f"mv {subdir}/{o} .")

    return output_files
Code Example #19
File: annotate_genes.py  Project: bsmith89/iggtools
def download_genome(genome_id, cleaned_genome):
    command(f"rm -f {genome_id}.fasta")
    command(
        f"aws s3 cp --only-show-errors {cleaned_genome} - | lz4 -dc > {genome_id}.fasta"
    )
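
This is the download half of the lz4-over-s3 convention used throughout these examples. The matching upload direction presumably compresses before copying; a hedged sketch (assuming command() and the check keyword behave as in the snippets above, and that upload works roughly this way):

def upload_sketch(local_path, s3_path, check=True):
    """Hypothetical sketch of the reverse direction: lz4-compress and copy to s3."""
    command(f"set -o pipefail; lz4 -c {local_path} | aws s3 cp --only-show-errors - {s3_path}",
            check=check)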
Code Example #20
def collate_repgenome_markers(args):

    db = UHGG()
    species = db.species
    representatives = db.representatives

    collate_log = "collate_repgenome_markers.log"
    collate_subdir = f"collate_repgenome_markers"

    dest_file = destpath(collate_log)
    msg = f"Collating marker genes sequences."
    if find_files_with_retry(dest_file):
        if not args.force:
            tsprint(
                f"Destination {dest_file} already exists.  Specify --force to overwrite."
            )
            return
        msg = msg.replace(msg.split(" ")[0], "Re-" + msg.split(" ")[0])

    tsprint(msg)
    if not args.debug:
        command(f"rm -rf {collate_subdir}")
    if not os.path.isdir(collate_subdir):
        command(f"mkdir {collate_subdir}")
    with open(f"{collate_subdir}/{collate_log}", "w") as slog:
        slog.write(msg + "\n")

    # Download
    download_seq_tasks = []
    download_map_tasks = []
    for species_id in species.keys():
        rep_id = representatives[species_id]
        remote_path_seq = input_marker_genes_file(rep_id, species_id,
                                                  f"{rep_id}.markers.fa.lz4")
        remote_path_map = input_marker_genes_file(rep_id, species_id,
                                                  f"{rep_id}.markers.map.lz4")
        download_seq_tasks.append((remote_path_seq, collate_subdir))
        download_map_tasks.append((remote_path_map, collate_subdir))
    downloaded_marker_seqs = multithreading_map(
        download_reference,
        download_seq_tasks,
        num_threads=CONCURRENT_MARKER_GENES_DOWNLOAD)
    downloaded_marker_maps = multithreading_map(
        download_reference,
        download_map_tasks,
        num_threads=CONCURRENT_MARKER_GENES_DOWNLOAD)

    ## Collate
    collated_rep_marker_seqs = output_all_rep_marker_genes("fa")
    collated_genes = os.path.basename(collated_rep_marker_seqs)
    for marker_fa_files in split(downloaded_marker_seqs, 20):
        command("cat " + " ".join(marker_fa_files) +
                f" >> {collate_subdir}/{collated_genes}")

    collated_rep_marker_maps = output_all_rep_marker_genes("map")
    collated_maps = os.path.basename(collated_rep_marker_maps)
    for marker_map_files in split(downloaded_marker_maps, 20):
        command("cat " + " ".join(marker_map_files) +
                f" >> {collate_subdir}/{collated_maps}")

    ## Index
    cmd_index = f"cd {collate_subdir}; hs-blastn index {collated_genes} &>> {collate_log}"
    with open(f"{collate_subdir}/{collate_log}", "a") as slog:
        slog.write(cmd_index + "\n")
    command(cmd_index)
    index_suffix = ["fa", "map", "fa.bwt", "fa.header", "fa.sa", "fa.sequence"]
    output_files = [
        f"{collate_subdir}/{inputs.marker_set}.{isuffix}"
        for isuffix in index_suffix
    ]

    ## Upload
    upload_tasks = []
    for o in output_files:
        upload_tasks.append((o, destpath(os.path.basename(o))))
    multithreading_map(upload_star, upload_tasks)

    # Upload the log file last; its presence indicates the collation has completed.
    upload(f"{collate_subdir}/{collate_log}",
           destpath(collate_log),
           check=False)

    ## Clean up
    if not args.debug:
        command(f"rm -rf {collate_subdir}", check=False)
Code Example #21
File: build_pangenome.py  Project: bsmith89/iggtools
def build_pangenome_slave(args):
    """
    Input spec:  https://github.com/czbiohub/iggtools/wiki#gene-annotations
    Output spec: https://github.com/czbiohub/iggtools/wiki#pan-genomes
    """

    violation = "Please do not call build_pangenome_slave directly.  Violation"
    assert args.zzz_slave_mode, f"{violation}:  Missing --zzz_slave_mode arg."
    assert os.path.isfile(
        args.zzz_slave_toc
    ), f"{violation}: File does not exist: {args.zzz_slave_toc}"
    assert os.path.basename(
        os.getcwd()
    ) == args.species, f"{violation}: {os.path.basename(os.getcwd())} != {args.species}"

    db = UHGG(args.zzz_slave_toc)
    species = db.species
    species_id = args.species

    assert species_id in species, f"{violation}: Species {species_id} is not in the database."

    species_genomes = species[species_id]
    species_genomes_ids = species_genomes.keys()

    def destpath(src):
        return pangenome_file(species_id, src + ".lz4")

    command(f"aws s3 rm --recursive {pangenome_file(species_id, '')}")

    cleaned = multiprocessing_map(clean_genes,
                                  ((species_id, genome_id)
                                   for genome_id in species_genomes_ids))

    command("rm -f genes.ffn genes.len")

    for temp_files in split(cleaned, 20):  # keep "cat" commands short
        ffn_files, len_files = transpose(temp_files)
        command("cat " + " ".join(ffn_files) + " >> genes.ffn")
        command("cat " + " ".join(len_files) + " >> genes.len")

    # The initial clustering to max_percent takes longest.
    max_percent, lower_percents = CLUSTERING_PERCENTS[0], CLUSTERING_PERCENTS[
        1:]
    cluster_files = {max_percent: vsearch(max_percent, "genes.ffn")}

    # Reclustering of the max_percent centroids is usually quick, and can proceed in parallel.
    recluster = lambda percent_id: vsearch(percent_id, cluster_files[
        max_percent][0])
    cluster_files.update(multithreading_hashmap(recluster, lower_percents))

    xref(cluster_files, "gene_info.txt")

    # Create list of (source, dest) pairs for uploading.
    # Note that centroids.{max_percent}.ffn is uploaded to 2 different destinations.
    upload_tasks = [
        ("genes.ffn", destpath("genes.ffn")),
        ("genes.len", destpath("genes.len")),
        (f"centroids.{max_percent}.ffn", destpath("centroids.ffn")
         )  # no percent in dest, per spec
    ]
    for src in flatten(cluster_files.values()):
        upload_tasks.append((src, destpath("temp/" + src)))

    # Upload in parallel.
    multithreading_map(upload_star, upload_tasks)

    # Leave this upload for last, so the presence of this file in s3 would indicate the entire species build has succeeded.
    upload("gene_info.txt", destpath("gene_info.txt"))