Example #1
def aws_batch_submit(args):
    """Submit given command to AWS Batch and log timestamped event under s3://operations/... folder in json format."""
    assert_have_aegea()
    # Replace anything that's not alphanumeric in batch_command with '_'
    name = ''.join(c if c.isalnum() else '_' for c in args.batch_command)
    cmd = f"""aegea batch submit --name {name} --ecr-image {args.batch_ecr_image} --memory {args.batch_memory} --vcpus {args.batch_vcpus} --queue {args.batch_queue} --privileged --command="pip3 install 'git+https://github.com/czbiohub/iggtools.git@{args.batch_branch}' --upgrade ; iggtools --version ; aws s3 cp s3://microbiome-igg/2.0/README.TXT - ; iggtools aws_batch_init ; cd /mnt/nvme ; {args.batch_command} ; echo DONE" """
    tsprint(
        f"Submitting to AWS Batch queue {args.batch_queue}:  {args.batch_command}"
    )
    aegea_output_json = backtick(cmd)
    ao = json.loads(aegea_output_json)
    job_id = ao['jobId']
    t_submit = int(time.time())
    datestamp, timestamp = datecode(t_submit).split("__")
    # timestamp is a string, and that's good, because JSON can lose resolution for large integers
    event = {
        "unix_timestamp": timestamp,
        "utc_date": datestamp,
        "type": "aws_batch_submit",
        "job_id": job_id,
        "job_target": args.batch_command,
        "aegea_command": cmd,
    }
    eventpath = f"{opsdir}/events/{datestamp}/{timestamp}__aws_batch_submit__{job_id}.json"
    with OutputStream(eventpath) as e:
        e.write(json.dumps(event))
    tsprint("You may watch the job with the command\n" +
            f"aegea batch watch {job_id}")
Example #2
    def genome_work(genome_id):
        assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
        species_id = species_for_genome[genome_id]

        dest_file = destpath(genome_id, species_id, lastoutput(genome_id))
        msg = f"Running HMMsearch for genome {genome_id} from species {species_id}."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(f"Destination {dest_file} for genome {genome_id} already exists.  Specify --force to overwrite.")
                return
            msg = msg.replace("Running", "Rerunning")

        tsprint(msg)
        slave_log = "build_marker_genes.log"
        slave_subdir = f"{species_id}__{genome_id}"
        if not args.debug:
            command(f"rm -rf {slave_subdir}")
        if not os.path.isdir(slave_subdir):
            command(f"mkdir {slave_subdir}")

        # Recursive call via subcommand.  Use subdir, redirect logs.
        slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_marker_genes --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} --zzz_slave_marker_genes_hmm {os.path.abspath(marker_genes_hmm)} {'--debug' if args.debug else ''} &>> {slave_log}"
        with open(f"{slave_subdir}/{slave_log}", "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")
        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere with any
            # prior exceptions that may be more informative.  Hence check=False.
            upload(f"{slave_subdir}/{slave_log}", destpath(genome_id, species_id, slave_log), check=False)
            if not args.debug:
                command(f"rm -rf {slave_subdir}", check=False)
Example #3
def main(args):
    tsprint(
        f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}."
    )
    uname = backtick("uname")
    assert uname == "Linux", f"Operating system {uname} is not Linux."
    init_nvme(args)
Example #4
def decode_species_arg(args, species):
    selected_species = set()
    try:  # pylint: disable=too-many-nested-blocks
        if args.species.upper() == "ALL":
            selected_species = set(species)
        else:
            for s in args.species.split(","):
                if ":" not in s:
                    assert str(int(s)) == s, f"Species id is not an integer: {s}"
                    selected_species.add(s)
                else:
                    i, n = s.split(":")
                    i = int(i)
                    n = int(n)
                    assert 0 <= i < n, f"Species class and modulus make no sense: {i}, {n}"
                    for sid in species:
                        if int(sid) % n == i:
                            selected_species.add(sid)
    except:
        tsprint(
            f"ERROR:  Species argument is not a list of species ids or slices: {s}"
        )
        raise
    return sorted(selected_species)
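A quick usage sketch (the Namespace and species ids below are hypothetical): the slice syntax i:n selects every species id congruent to i modulo n, which is handy for sharding work across jobs.
from argparse import Namespace

species = {"100001": [], "100002": [], "100003": []}  # hypothetical species_id -> genomes map
decode_species_arg(Namespace(species="ALL"), species)     # ['100001', '100002', '100003']
decode_species_arg(Namespace(species="100002"), species)  # ['100002']
decode_species_arg(Namespace(species="1:2"), species)     # ['100001', '100003'] (ids congruent to 1 mod 2)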
Example #5
def midas_run_snps(args):

    tempdir = f"{args.outdir}/snps/temp_sc{args.species_cov}"
    if args.debug and os.path.exists(tempdir):
        tsprint(
            f"INFO:  Reusing existing temp data in {tempdir} according to --debug flag."
        )
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    outputdir = f"{args.outdir}/snps/output_sc{args.species_cov}"
    if not os.path.exists(outputdir):
        command(f"mkdir -p {outputdir}")

    try:
        # The full species profile must exist -- it is output by run_midas_species.
        # Restrict to species above requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile,
                                         args.species_cov)

        local_toc = download_reference(outputs.genomes)
        db = UHGG(local_toc)
        representatives = db.representatives

        def download_contigs(species_id):
            return download_reference(
                imported_genome_file(representatives[species_id], species_id,
                                     "fna.lz4"), f"{tempdir}/{species_id}")

        # Download repgenome_id.fna for every species in the restricted species profile.
        contigs_files = multithreading_hashmap(download_contigs,
                                               species_profile.keys(),
                                               num_threads=20)

        # Use Bowtie2 to map reads to representative genomes
        bt2_db_name = "repgenomes"
        build_bowtie2_db(tempdir, bt2_db_name, contigs_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=True)

        # Use mpileup to identify SNPs
        samtools_index(args, tempdir, bt2_db_name)
        species_pileup_stats = pysam_pileup(args, list(species_profile.keys()),
                                            tempdir, outputdir, contigs_files)

        write_snps_summary(
            species_pileup_stats,
            f"{args.outdir}/snps/output_sc{args.species_cov}/summary.txt")

    except:
        if not args.debug:
            tsprint(
                "Deleting untrustworthy outputs due to error.  Specify --debug flag to keep."
            )
            command(f"rm -rf {tempdir}", check=False)
            command(f"rm -rf {outputdir}", check=False)
        raise
Example #6
def samtools_index(args, bt2_db_dir, bt2_db_name):
    if args.debug and os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.bam.bai"):
        tsprint(
            f"Skipping samtools index in debug mode as temporary data exists: {bt2_db_dir}/{bt2_db_name}.bam"
        )
        return

    try:
        command(
            f"samtools index -@ {num_physical_cores} {bt2_db_dir}/{bt2_db_name}.bam"
        )
    except:
        command(f"rm -f {bt2_db_dir}/{bt2_db_name}.bam.bai")
        raise
Example #7
def assign_unique(alns, species_info, marker_info):
    """ Count the number of uniquely mapped reads to each genome species """
    unique_alns = {si: [] for si in species_info}
    unique = 0
    non_unique = 0
    for aln in alns:
        if len(aln) == 1:
            unique += 1
            species_id = marker_info[aln[0]['target']]['species_id']
            unique_alns[species_id].append(aln[0])
        else:
            non_unique += 1
    tsprint(f"  uniquely mapped reads: {unique}")
    tsprint(f"  ambiguously mapped reads: {non_unique}")
    return unique_alns
Example #8
def parse_reads(filename, max_reads=None):
    if not filename:
        return
    read_count_filter = None
    if max_reads is not None:
        read_count_filter = f"head -n {4 * max_reads}"
    read_count = 0
    with InputStream(filename, read_count_filter) as fp:
        for name, seq, _ in readfq(fp):
            read_count += 1
            new_name = construct_queryid(name, len(seq))  # We need to encode the length in the query id to be able to recover it from hs-blastn output
            yield (new_name, seq)
        if read_count_filter:
            fp.ignore_errors()
    tsprint(f"Parsed {read_count} reads from {filename}")
Example #9
def vsearch(percent_id, genes, num_threads=num_vcpu):
    centroids = f"centroids.{percent_id}.ffn"
    uclust = f"uclust.{percent_id}.txt"
    # log = f"uclust.{percent_id}.log"
    if find_files(centroids) and find_files(uclust):
        tsprint(
            f"Found vsearch results at percent identity {percent_id} from prior run."
        )
    else:
        try:
            command(
                f"vsearch --quiet --cluster_fast {genes} --id {percent_id/100.0} --threads {num_threads} --centroids {centroids} --uc {uclust}"
            )
        except:
            # Do not keep bogus zero-length files;  those are harmful if we rerun in place.
            command(f"mv {centroids} {centroids}.bogus", check=False)
            command(f"mv {uclust} {uclust}.bogus", check=False)
            raise
    return centroids, uclust  #, log
Example #10
    def species_work(species_id):
        assert species_id in species, f"Species {species_id} is not in the database."
        species_genomes = species[species_id]

        def destpath(src):
            return pangenome_file(species_id, src + ".lz4")

        # The species build will upload this file last, after everything else is successfully uploaded.
        # Therefore, if this file exists in s3, there is no need to redo the species build.
        dest_file = destpath("gene_info.txt")
        msg = f"Building pangenome for species {species_id} with {len(species_genomes)} total genomes."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for species {species_id} pangenome already exists.  Specify --force to overwrite."
                )
                return
            msg = msg.replace("Building", "Rebuilding")

        with CONCURRENT_SPECIES_BUILDS:
            tsprint(msg)
            slave_log = "pangenome_build.log"
            slave_subdir = str(species_id)
            if not args.debug:
                command(f"rm -rf {slave_subdir}")
            if not os.path.isdir(slave_subdir):
                command(f"mkdir {slave_subdir}")
            # Recursive call via subcommand.  Use subdir, redirect logs.
            slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_pangenome -s {species_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {'--debug' if args.debug else ''} &>> {slave_log}"
            with open(f"{slave_subdir}/{slave_log}", "w") as slog:
                slog.write(msg + "\n")
                slog.write(slave_cmd + "\n")
            try:
                command(slave_cmd)
            finally:
                # Cleanup should not raise exceptions of its own, so as not to interfere with any
                # prior exceptions that may be more informative.  Hence check=False.
                upload(f"{slave_subdir}/{slave_log}",
                       destpath(slave_log),
                       check=False)
                if not args.debug:
                    command(f"rm -rf {slave_subdir}", check=False)
Example #11
def assert_have_aegea(min_version="3.2.1"):
    try:
        # Assert that aegea is installed and at least the minimum supported version.
        #
        # Tooling installed in the Dockerfile does not require these types of checks.
        #
        # A few of the iggtools admin subcommands (including this one) are supported to run
        # directly on a laptop or dev server, outside of docker, and (for extra operational
        # lightness and flexibility) even without being installed by a package manager --
        # with the downside of having to perform this check.  We should keep these checks
        # to a minimum.  If more creep up, we will require docker and erase these checks.
        #
        aegea, version = backtick("aegea --version | head -1").split()
        assert aegea == "aegea"
        vvv = tuple(int(v) for v in version.split("."))
        uuu = tuple(int(u) for u in min_version.split("."))
        assert vvv >= uuu, f"Aegea {version} is too old, please upgrade to {min_version} or above."
    except:
        tsprint("SUGGESTION:  Please 'pip3 install --upgrade aegea'")
        raise
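The integer-tuple comparison is what makes this check robust for multi-digit version components, where plain string comparison goes wrong:
assert "3.10.0" < "3.2.1"  # lexicographic string comparison is misleading
assert tuple(int(v) for v in "3.10.0".split(".")) > tuple(int(u) for u in "3.2.1".split("."))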
Example #12
def normalize_counts(species_alns, total_gene_length):
    """ Normalize counts by gene length and sum contrain """
    # norm by gene length, compute cov
    species_abundance = {}
    for species_id, alns in species_alns.items():
        # compute coverage
        if alns:
            bp = sum(aln['aln'] for aln in alns)
            cov = float(bp)/total_gene_length[species_id]
        else:
            cov = 0.0
        # TODO:  Use NamedTuple instead of dict
        species_abundance[species_id] = {'count':len(alns), 'cov':cov, 'rel_abun': 0.0}
    # compute relative abundance
    total_cov = sum(sav['cov'] for sav in species_abundance.values())
    if total_cov > 0:
        for sav in species_abundance.values():
            sav['rel_abun'] = sav['cov'] / total_cov
    tsprint(f"  total marker-gene coverage {total_cov:.3f}")
    return species_abundance
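A tiny worked example with made-up numbers, assuming each aln dict carries an 'aln' field holding aligned base pairs, as the code above implies:
species_alns = {"A": [{'aln': 300}, {'aln': 200}], "B": []}  # 500 bp aligned to A, none to B
total_gene_length = {"A": 1000, "B": 5000}
abundance = normalize_counts(species_alns, total_gene_length)
# A: cov = 500 / 1000 = 0.5; total_cov = 0.5, so rel_abun = 0.5 / 0.5 = 1.0
# B: cov = 0.0 and rel_abun = 0.0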
Example #13
def bowtie2_align(args, bt2_db_dir, bt2_db_name, sort_aln=False):
    """
    Use Bowtie2 to map reads to specified representative genomes or
    collections of centroids genes for the pangenome flow.
    """

    if args.debug and os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.bam"):
        tsprint(
            f"Skipping Bowtie2 alignment in debug mode as temporary data exists: {bt2_db_dir}/{bt2_db_name}.bam"
        )
        return

    # Construct bowtie2 align input arguments
    max_reads = f"-u {args.max_reads}" if args.max_reads else ""
    aln_mode = "local" if args.aln_mode == "local" else "end-to-end"
    aln_speed = args.aln_speed if aln_mode == "end_to_end" else args.aln_speed + "-local"
    r2 = ""
    if args.r2:
        r1 = f"-1 {args.r1}"
        r2 = f"-2 {args.r2}"
    elif args.aln_interleaved:
        r1 = f"--interleaved {args.r1}"
    else:
        r1 = f"-U {args.r1}"

    try:
        bt2_command = f"bowtie2 --no-unal -x {bt2_db_dir}/{bt2_db_name} {max_reads} --{aln_mode} --{aln_speed} --threads {num_physical_cores} -q {r1} {r2}"
        if sort_aln:
            command(f"set -o pipefail; {bt2_command} | \
                    samtools view --threads {num_physical_cores} -b - | \
                    samtools sort --threads {num_physical_cores} -o {bt2_db_dir}/{bt2_db_name}.bam"
                    )
        else:
            command(f"set -o pipefail; {bt2_command} | \
                    samtools view --threads {num_physical_cores} -b - > {bt2_db_dir}/{bt2_db_name}.bam"
                    )
    except:
        tsprint(
            f"Bowtie2 alignment to {bt2_db_dir}/{bt2_db_name}.bam ran into an error")
        command(f"rm -f {bt2_db_dir}/{bt2_db_name}.bam")
        raise
Example #14
def hmmsearch(genome_id, species_id, marker_genes_hmm, num_threads=1):
    # Input
    annotated_genes_s3_path = input_annotations_file(genome_id, species_id, f"{genome_id}.faa.lz4")
    annotated_genes = download_reference(annotated_genes_s3_path)

    # Output
    hmmsearch_file = f"{genome_id}.hmmsearch"

    # Command
    if find_files(hmmsearch_file):
        # This only happens in debug mode, where we can use pre-existing file.
        tsprint(f"Found hmmsearch results for genome {genome_id} from prior run.")
    else:
        try:
            command(f"hmmsearch --noali --cpu {num_threads} --domtblout {hmmsearch_file} {marker_genes_hmm} {annotated_genes}")
        except:
            # Do not keep bogus zero-length files;  those are harmful if we rerun in place.
            command(f"mv {hmmsearch_file} {hmmsearch_file}.bogus", check=False)
            raise

    return hmmsearch_file
Example #15
def midas_run_genes(args):

    tempdir = f"{args.outdir}/genes/temp_sc{args.species_cov}"

    if args.debug and os.path.exists(tempdir):
        tsprint(f"INFO:  Reusing existing temp data in {tempdir} according to --debug flag.")
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    try:
        # The full species profile must exist -- it is output by run_midas_species.
        # Restrict to species above requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile, args.species_cov)

        def download_centroid(species_id):
            return download_reference(pangenome_file(species_id, "centroids.ffn.lz4"), f"{tempdir}/{species_id}")  # TODO colocate samples to overlap reference downloads

        # Download centroids.ffn for every species in the restricted species profile.
        centroids_files = multithreading_hashmap(download_centroid, species_profile.keys(), num_threads=20)

        # Perhaps avoid this giant conglomerated file, fetching instead submaps for each species.
        # Also colocate/cache/download in master for multiple slave subcommand invocations.
        bt2_db_name = "pangenomes"
        build_bowtie2_db(tempdir, bt2_db_name, centroids_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=False)

        # Compute coverage of pangenome for each present species and write results to disk
        marker_genes_map = "s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4"
        species, genes = scan_centroids(centroids_files)
        num_covered_genes, species_mean_coverage, covered_genes = count_mapped_bp(args, tempdir, genes)
        markers = scan_markers(genes, marker_genes_map)
        species_markers_coverage = normalize(genes, covered_genes, markers)

        write_results(args.outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage)
    except:
        if not args.debug:
            tsprint("Deleting untrustworthy outputs due to error.  Specify --debug flag to keep.")
            command(f"rm -rf {tempdir}", check=False)
        raise
Example #16
def init_nvme(args):
    # TODO:  Generalize the magic numbers 838 and 1715518 (those are for AWS instance type r5.12xlarge).  # pylint: disable=fixme
    # https://github.com/czbiohub/iggtools/issues/17
    if nvme_size_str() != '1715518':
        # Raid, format, and mount the NVME drives attached to this instance.
        tsprint("Initializing instance NVME storage.")
        try:
            command(
                """set -o pipefail; lsblk | grep 838 | awk '{print "/dev/"$1}' | xargs -n 10 s3mi raid nvme"""
            )
        except Exception as e:
            try:
                # Sometimes we've formatted it in a prior incarnation but the mountpoint can't exist in the container to tell us.
                # In those cases we can just try to mount it.
                command("""mount /dev/md0 /mnt/nvme""")
            except:
                raise e
        assert nvme_size_str() == '1715518', "Failed to initialize and mount instance NVME storage."
    else:
        tsprint("Instance NVME storage previously initialized.")
        if args.force:
            tsprint(
                "Ignoring --force argument.  It is usually unnecessary to reinitialize AWS instance storage."
            )
Example #17
def decode_genomes_arg(args, genomes):
    selected_genomes = set()
    try:  # pylint: disable=too-many-nested-blocks
        if args.genomes.upper() == "ALL":
            selected_genomes = set(genomes)
        else:
            for g in args.genomes.split(","):
                if ":" not in g:
                    selected_genomes.add(g)
                else:
                    i, n = g.split(":")
                    i = int(i)
                    n = int(n)
                    assert 0 <= i < n, f"Genome class and modulus make no sense: {i}, {n}"
                    for gid in genomes:
                        gid_int = int(gid.replace("GUT_GENOME", ""))
                        if gid_int % n == i:
                            selected_genomes.add(gid)
    except:
        tsprint(f"ERROR:  Genomes argument is not a list of genome ids or slices: {g}")
        raise
    return sorted(selected_genomes)
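The same slice syntax shards a large genome set disjointly across parallel jobs: submitting n jobs with arguments 0:n through n-1:n covers every genome exactly once (hypothetical ids below).
from argparse import Namespace

genomes = {"GUT_GENOME000001": None, "GUT_GENOME000002": None, "GUT_GENOME000003": None}
shards = [decode_genomes_arg(Namespace(genomes=f"{i}:3"), genomes) for i in range(3)]
# shard 0 -> ['GUT_GENOME000003'] (3 % 3 == 0)
# shard 1 -> ['GUT_GENOME000001'], shard 2 -> ['GUT_GENOME000002']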
Example #18
def count_mapped_bp(args, tempdir, genes):
    """ Count number of bp mapped to each gene across pangenomes.
    Return number covered genes and average gene depth per species.
    Result contains only covered species, but being a defaultdict,
    would yield 0 for any uncovered species, which is appropriate.
    """
    bam_path = f"{tempdir}/pangenomes.bam"
    bamfile = AlignmentFile(bam_path, "rb")
    covered_genes = {}

    # loop over alignments, sum values per gene
    for aln in bamfile.fetch(until_eof=True):
        gene_id = bamfile.getrname(aln.reference_id)
        gene = genes[gene_id]
        gene["aligned_reads"] += 1
        if keep_read(aln, args.aln_mapid, args.aln_readq, args.aln_mapq, args.aln_cov):
            gene["mapped_reads"] += 1
            gene["depth"] += len(aln.query_alignment_sequence) / float(gene["length"])
            covered_genes[gene_id] = gene

    tsprint("Pangenome count_mapped_bp:  total aligned reads: %s" % sum(g["aligned_reads"] for g in genes.values()))
    tsprint("Pangenome count_mapped_bp:  total mapped reads: %s" % sum(g["mapped_reads"] for g in genes.values()))

    # Filter to genes with non-zero depth, then group by species
    nonzero_gene_depths = defaultdict(list)
    for g in covered_genes.values():
        gene_depth = g["depth"]
        if gene_depth > 0:  # This should always pass, because args.aln_cov is always > 0.
            species_id = g["species_id"]
            nonzero_gene_depths[species_id].append(gene_depth)

    # Compute number of covered genes per species, and average gene depth.
    num_covered_genes = defaultdict(int)
    mean_coverage = defaultdict(float)
    for species_id, non_zero_depths in nonzero_gene_depths.items():
        num_covered_genes[species_id] = len(non_zero_depths)
        mean_coverage[species_id] = np.mean(non_zero_depths)

    return num_covered_genes, mean_coverage, covered_genes
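keep_read is defined elsewhere in the module; what follows is only a plausible sketch of such a pysam alignment filter, assuming the four thresholds mean percent identity, mean base quality, mapping quality, and aligned fraction of the read:
import numpy as np

def keep_read_sketch(aln, min_pid, min_readq, min_mapq, min_aln_cov):
    # Hypothetical filter -- the real keep_read may differ in detail.
    align_len = len(aln.query_alignment_sequence)
    pid = 100.0 * (align_len - aln.get_tag('NM')) / align_len  # percent identity via edit distance
    if pid < min_pid:
        return False
    if np.mean(aln.query_qualities) < min_readq:  # mean base quality
        return False
    if aln.mapping_quality < min_mapq:  # bowtie2 MAPQ
        return False
    if align_len / float(aln.query_length) < min_aln_cov:  # aligned fraction of the read
        return False
    return True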
Example #19
def find_best_hits(args, marker_info, m8_file, marker_cutoffs):
    """ Find top scoring alignment for each read """
    best_hits = {}
    i = 0
    with InputStream(m8_file) as m8_stream:
        for aln in select_from_tsv(m8_stream, schema=BLAST_M8_SCHEMA, result_structure=dict):
            i += 1
            cutoff = args.aln_mapid
            if cutoff is None:
                marker_id = marker_info[aln['target']]['marker_id'] # get gene family from marker_info
                cutoff = marker_cutoffs[marker_id]
            if aln['pid'] < cutoff: # does not meet marker cutoff
                continue
            if query_coverage(aln) < args.aln_cov: # filter local alignments
                continue
            if aln['query'] not in best_hits: # record aln
                best_hits[aln['query']] = [aln]
            elif best_hits[aln['query']][0]['score'] == aln['score']: # add aln
                best_hits[aln['query']] += [aln]
            elif best_hits[aln['query']][0]['score'] < aln['score']: # update aln
                best_hits[aln['query']] = [aln]
    tsprint(f"  total alignments: {i}")
    return list(best_hits.values())
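query_coverage is defined elsewhere; given that parse_reads (Example #8) encodes the read length into the query id, a plausible sketch is:
def query_coverage_sketch(aln):
    # Hypothetical -- assumes construct_queryid appends the read length after '_'.
    read_length = int(aln['query'].split('_')[-1])
    return aln['aln'] / float(read_length)  # fraction of the read that aligned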
Example #20
def build_bowtie2_db(bt2_db_dir, bt2_db_name, downloaded_files):
    """
    Build Bowtie2 database of representative genomes or centroid genes
    for the species present in the sample, e.g. repgenomes OR pangenomes
    """
    bt2_db_suffixes = [
        "1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"
    ]
    if all(
            os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.{ext}")
            for ext in bt2_db_suffixes):
        tsprint("Skipping bowtie2-build as database files appear to exist.")
        return
    command(f"rm -f {bt2_db_dir}/{bt2_db_name}.fa")
    command(f"touch {bt2_db_dir}/{bt2_db_name}.fa")

    for files in split(downloaded_files.values(),
                       20):  # keep "cat" commands short
        command("cat " + " ".join(files) +
                f" >> {bt2_db_dir}/{bt2_db_name}.fa")

    command(
        f"bowtie2-build --threads {num_physical_cores} {bt2_db_dir}/{bt2_db_name}.fa {bt2_db_dir}/{bt2_db_name} > {bt2_db_dir}/bowtie2-build.log"
    )
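split is a small helper defined elsewhere; a sketch of the chunking behavior assumed here, yielding groups of at most chunk_size paths so each cat command line stays short:
def split_sketch(iterable, chunk_size):
    # Yield successive lists of up to chunk_size items.
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk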
Example #21
def init(args):
    """
    Input spec: https://github.com/czbiohub/iggtools/wiki#inputs
    Output spec: https://github.com/czbiohub/iggtools/wiki#target-layout-in-s3
    """

    msg = f"Building {outputs.genomes}."
    if find_files(outputs.genomes):
        if not args.force:
            tsprint(
                f"Destination {outputs.genomes} already exists.  Specify --force to overwrite."
            )
            return
        msg = f"Rebuilding {outputs.genomes}."
    tsprint(msg)

    id_remap = {}
    with InputStream(inputs.alt_species_ids) as ids:
        for row in select_from_tsv(
                ids, selected_columns=["alt_species_id", "species_id"]):
            new_id, old_id = row
            id_remap[old_id] = new_id

    seen_genomes, seen_species = set(), set()
    with OutputStream(outputs.genomes) as out:

        target_columns = [
            "genome", "species", "representative", "genome_is_representative"
        ]
        out.write("\t".join(target_columns) + "\n")

        with InputStream(inputs.genomes2species) as g2s:
            for row in select_from_tsv(
                    g2s, selected_columns=["MAG_code", "Species_id"]):
                genome, representative = row
                species = id_remap[representative]
                genome_is_representative = str(int(genome == representative))
                target_row = [
                    genome, species, representative, genome_is_representative
                ]
                out.write("\t".join(target_row) + "\n")
                seen_genomes.add(genome)
                seen_species.add(species)

    tsprint(
        f"Emitted {len(seen_genomes)} genomes and {len(seen_species)} species to {outputs.genomes}."
    )
Example #22
def pysam_pileup(args, species_ids, tempdir, outputdir, contigs_files):
    "Counting alleles and run pileups per species in parallel"

    # Update alignment stats for species
    species_pileup_stats = {}
    contigs_db_stats = {
        'species_counts': 0,
        'total_seqs': 0,
        'total_length': 0
    }

    mp = multiprocessing.Pool(num_physical_cores)
    argument_list = [(sp_id, args, tempdir, outputdir, contigs_files[sp_id],
                      contigs_db_stats) for sp_id in species_ids]

    for species_id, aln_stats in mp.starmap(species_pileup, argument_list):
        sp_stats = {
            "genome_length": int(aln_stats['genome_length']),
            "covered_bases": int(aln_stats['covered_bases']),
            "total_depth": int(aln_stats['total_depth']),
            "aligned_reads": int(aln_stats['aligned_reads']),
            "mapped_reads": int(aln_stats['mapped_reads']),
            "fraction_covered": 0.0,
            "mean_coverage": 0.0,
        }

        if sp_stats["genome_length"] > 0:
            sp_stats["fraction_covered"] = format(
                sp_stats["covered_bases"] / sp_stats["genome_length"],
                DECIMALS)

        if sp_stats["covered_bases"] > 0:
            sp_stats["mean_coverage"] = format(
                sp_stats["total_depth"] / sp_stats["covered_bases"], DECIMALS)

        species_pileup_stats[species_id] = sp_stats

    tsprint(
        f"contigs_db_stats - total genomes: {contigs_db_stats['species_counts']}"
    )
    tsprint(
        f"contigs_db_stats - total contigs: {contigs_db_stats['total_seqs']}")
    tsprint(
        f"contigs_db_stats - total base-pairs: {contigs_db_stats['total_length']}"
    )

    return species_pileup_stats
Example #23
def species_pileup(species_id, args, tempdir, outputdir, contig_file,
                   contigs_db_stats):
    # Read in contigs information for current species_id

    contigs = {}
    # NOTE: with multiprocessing, each pool worker mutates its own copy of
    # contigs_db_stats, so these counts never propagate back to the parent.
    contigs_db_stats['species_counts'] += 1

    with InputStream(contig_file) as file:
        for rec in Bio.SeqIO.parse(file, 'fasta'):
            contigs[rec.id] = {
                "species_id": species_id,
                "contig_len": int(len(rec.seq)),
                "contig_seq": str(rec.seq),
            }
            contigs_db_stats['total_length'] += contigs[rec.id]["contig_len"]
            contigs_db_stats['total_seqs'] += 1

    # Summary statistics
    aln_stats = {
        "genome_length": 0,
        "total_depth": 0,
        "covered_bases": 0,
        "aligned_reads": 0,
        "mapped_reads": 0,
    }

    def keep_read(x):
        return keep_read_worker(x, args, aln_stats)

    header = [
        'ref_id', 'ref_pos', 'ref_allele', 'depth', 'count_a', 'count_c',
        'count_g', 'count_t'
    ]
    path = f"{outputdir}/{species_id}.snps.lz4"

    with OutputStream(path) as file:

        file.write('\t'.join(header) + '\n')
        zero_rows_allowed = not args.sparse

        # Loop over alignment for current species's contigs
        with AlignmentFile(f"{tempdir}/repgenomes.bam") as bamfile:
            for contig_id in sorted(contigs.keys()):  # sorted for deterministic output order
                contig = contigs[contig_id]
                counts = bamfile.count_coverage(
                    contig_id,
                    start=0,
                    end=contig["contig_len"],
                    quality_threshold=args.aln_baseq,
                    read_callback=keep_read)

                for ref_pos in range(0, contig["contig_len"]):
                    ref_allele = contig["contig_seq"][ref_pos]
                    depth = sum([counts[nt][ref_pos] for nt in range(4)])
                    count_a = counts[0][ref_pos]
                    count_c = counts[1][ref_pos]
                    count_g = counts[2][ref_pos]
                    count_t = counts[3][ref_pos]
                    values = [
                        contig_id, ref_pos + 1, ref_allele, depth, count_a,
                        count_c, count_g, count_t
                    ]

                    if depth > 0 or zero_rows_allowed:
                        file.write('\t'.join(str(val)
                                             for val in values) + '\n')

                    aln_stats['genome_length'] += 1
                    aln_stats['total_depth'] += depth
                    if depth > 0:
                        aln_stats['covered_bases'] += 1

    tsprint(json.dumps({species_id: aln_stats}, indent=4))
    return (species_id, {k: str(v) for k, v in aln_stats.items()})
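As the note inside species_pileup says, each pool worker mutates its own copy of contigs_db_stats, so the parent's counters never advance. One possible fix, sketched with a hypothetical species_pileup_v2: have workers return their counters and merge them in pysam_pileup after starmap.
def species_pileup_v2(species_id, args, tempdir, outputdir, contig_file):
    local_db_stats = {'species_counts': 1, 'total_seqs': 0, 'total_length': 0}
    # ... same contig parsing and pileup logic as above, updating local_db_stats ...
    aln_stats = {}  # filled in by the pileup loop, as above
    return species_id, aln_stats, local_db_stats

# In pysam_pileup, after dropping contigs_db_stats from argument_list:
#   for species_id, aln_stats, local_db_stats in mp.starmap(species_pileup_v2, argument_list):
#       for key, val in local_db_stats.items():
#           contigs_db_stats[key] += val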
Example #24
def main(args):
    tsprint(
        f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}."
    )
    aws_batch_submit(args)
Example #25
def main(args):
    tsprint(
        f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}."
    )
    build_pangenome(args)
Example #26
def main(args):
    tsprint(
        f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}."
    )
    init(args)
Example #27
def main(args):
    tsprint(
        f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}."
    )
    annotate_genes(args)
Example #28
def main(args):
    tsprint(f"Doing important work in subcommand {args.subcommand} with args {vars(args)}")
Example #29
def main(args):
    tsprint(f"Doing important work in subcommand {args.subcommand} with args\n{json.dumps(vars(args), indent=4)}")
    midas_run_species(args)
Example #30
def main(args):
    tsprint(f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}.")
    build_marker_genes(args)