def aws_batch_submit(args):
    """Submit given command to AWS Batch and log a timestamped event under the s3://operations/... folder in JSON format."""
    assert_have_aegea()
    # Replace anything that's not alphanumeric in batch_command with '_'
    name = "".join(c if c.isalnum() else '_' for c in args.batch_command)
    cmd = f"""aegea batch submit --name {name} --ecr-image {args.batch_ecr_image} --memory {args.batch_memory} --vcpus {args.batch_vcpus} --queue {args.batch_queue} --privileged --command="pip3 install 'git+https://github.com/czbiohub/iggtools.git@{args.batch_branch}' --upgrade ; iggtools --version ; aws s3 cp s3://microbiome-igg/2.0/README.TXT - ; iggtools aws_batch_init ; cd /mnt/nvme ; {args.batch_command} ; echo DONE" """
    tsprint(f"Submitting to AWS Batch queue {args.batch_queue}: {args.batch_command}")
    aegea_output_json = backtick(cmd)
    ao = json.loads(aegea_output_json)
    job_id = ao['jobId']
    t_submit = int(time.time())
    datestamp, timestamp = datecode(t_submit).split("__")
    # timestamp is a string, and that's good, because JSON can lose resolution for large integers
    event = {
        "unix_timestamp": timestamp,
        "utc_date": datestamp,
        "type": "aws_batch_submit",
        "job_id": job_id,
        "job_target": args.batch_command,
        "aegea_command": cmd,
    }
    eventpath = f"{opsdir}/events/{datestamp}/{timestamp}__aws_batch_submit__{job_id}.json"
    with OutputStream(eventpath) as e:
        e.write(json.dumps(event))
    tsprint("You may watch the job with the command\n" + f"aegea batch watch {job_id}")

def genome_work(genome_id):
    assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
    species_id = species_for_genome[genome_id]

    dest_file = destpath(genome_id, species_id, lastoutput(genome_id))
    msg = f"Running HMMsearch for genome {genome_id} from species {species_id}."
    if find_files_with_retry(dest_file):
        if not args.force:
            tsprint(f"Destination {dest_file} for genome {genome_id} already exists.  Specify --force to overwrite.")
            return
        msg = msg.replace("Running", "Rerunning")

    tsprint(msg)
    slave_log = "build_marker_genes.log"
    slave_subdir = f"{species_id}__{genome_id}"
    if not args.debug:
        command(f"rm -rf {slave_subdir}")
    if not os.path.isdir(slave_subdir):
        command(f"mkdir {slave_subdir}")
    # Recursive call via subcommand.  Use subdir, redirect logs.
    slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_marker_genes --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} --zzz_slave_marker_genes_hmm {os.path.abspath(marker_genes_hmm)} {'--debug' if args.debug else ''} &>> {slave_log}"
    with open(f"{slave_subdir}/{slave_log}", "w") as slog:
        slog.write(msg + "\n")
        slog.write(slave_cmd + "\n")
    try:
        command(slave_cmd)
    finally:
        # Cleanup should not raise exceptions of its own, so as not to interfere with any
        # prior exceptions that may be more informative.  Hence check=False.
        upload(f"{slave_subdir}/{slave_log}", destpath(genome_id, species_id, slave_log), check=False)
        if not args.debug:
            command(f"rm -rf {slave_subdir}", check=False)

def main(args):
    tsprint(f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}.")
    uname = backtick("uname")
    assert uname == "Linux", f"Operating system {uname} is not Linux."
    init_nvme(args)

def decode_species_arg(args, species):
    selected_species = set()
    try:  # pylint: disable=too-many-nested-blocks
        if args.species.upper() == "ALL":
            selected_species = set(species)
        else:
            for s in args.species.split(","):
                if ":" not in s:
                    assert str(int(s)) == s, f"Species id is not an integer: {s}"
                    selected_species.add(s)
                else:
                    i, n = s.split(":")
                    i = int(i)
                    n = int(n)
                    assert 0 <= i < n, f"Species class and modulus make no sense: {i}, {n}"
                    for sid in species:
                        if int(sid) % n == i:
                            selected_species.add(sid)
    except:
        tsprint(f"ERROR: Species argument is not a list of species ids or slices: {s}")
        raise
    return sorted(selected_species)

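# Illustration (editor's sketch, not part of the pipeline): the "i:n" slice
# syntax selects every species whose integer id is congruent to i modulo n, so
# n workers can partition the species list without coordination.  With made-up
# ids:
#
#     >>> species = ["100", "101", "102", "103"]
#     >>> sorted(sid for sid in species if int(sid) % 2 == 0)  # slice "0:2"
#     ['100', '102']
#     >>> sorted(sid for sid in species if int(sid) % 2 == 1)  # slice "1:2"
#     ['101', '103']
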
def midas_run_snps(args):

    tempdir = f"{args.outdir}/snps/temp_sc{args.species_cov}"
    if args.debug and os.path.exists(tempdir):
        tsprint(f"INFO: Reusing existing temp data in {tempdir} according to --debug flag.")
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    outputdir = f"{args.outdir}/snps/output_sc{args.species_cov}"
    if not os.path.exists(outputdir):
        command(f"mkdir -p {outputdir}")

    try:
        # The full species profile must exist -- it is output by midas_run_species.
        # Restrict to species above the requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile, args.species_cov)

        local_toc = download_reference(outputs.genomes)
        db = UHGG(local_toc)
        representatives = db.representatives

        def download_contigs(species_id):
            return download_reference(imported_genome_file(representatives[species_id], species_id, "fna.lz4"), f"{tempdir}/{species_id}")

        # Download repgenome_id.fna for every species in the restricted species profile.
        contigs_files = multithreading_hashmap(download_contigs, species_profile.keys(), num_threads=20)

        # Use Bowtie2 to map reads to the representative genomes.
        bt2_db_name = "repgenomes"
        build_bowtie2_db(tempdir, bt2_db_name, contigs_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=True)

        # Use mpileup to identify SNPs.
        samtools_index(args, tempdir, bt2_db_name)
        species_pileup_stats = pysam_pileup(args, list(species_profile.keys()), tempdir, outputdir, contigs_files)

        write_snps_summary(species_pileup_stats, f"{args.outdir}/snps/output_sc{args.species_cov}/summary.txt")
    except:
        if not args.debug:
            tsprint("Deleting untrustworthy outputs due to error.  Specify --debug flag to keep.")
            command(f"rm -rf {tempdir}", check=False)
            command(f"rm -rf {outputdir}", check=False)

def samtools_index(args, bt2_db_dir, bt2_db_name):
    if args.debug and os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.bam.bai"):
        tsprint(f"Skipping samtools index in debug mode as temporary data exists: {bt2_db_dir}/{bt2_db_name}.bam")
        return
    try:
        command(f"samtools index -@ {num_physical_cores} {bt2_db_dir}/{bt2_db_name}.bam")
    except:
        command(f"rm -f {bt2_db_dir}/{bt2_db_name}.bam.bai")
        raise

def assign_unique(alns, species_info, marker_info):
    """ Count the number of reads uniquely mapped to each species """
    unique_alns = {si: [] for si in species_info}
    unique = 0
    non_unique = 0
    for aln in alns:
        if len(aln) == 1:
            unique += 1
            species_id = marker_info[aln[0]['target']]['species_id']
            unique_alns[species_id].append(aln[0])
        else:
            non_unique += 1
    tsprint(f"  uniquely mapped reads: {unique}")
    tsprint(f"  ambiguously mapped reads: {non_unique}")
    return unique_alns

def parse_reads(filename, max_reads=None):
    if not filename:
        return
    read_count_filter = None
    if max_reads is not None:
        read_count_filter = f"head -n {4 * max_reads}"
    read_count = 0
    with InputStream(filename, read_count_filter) as fp:
        for name, seq, _ in readfq(fp):
            read_count += 1
            # We need to encode the length in the query id to be able to recover it from hs-blastn output.
            new_name = construct_queryid(name, len(seq))
            yield (new_name, seq)
        if read_count_filter:
            fp.ignore_errors()
    tsprint(f"Parsed {read_count} reads from {filename}")

def vsearch(percent_id, genes, num_threads=num_vcpu):
    centroids = f"centroids.{percent_id}.ffn"
    uclust = f"uclust.{percent_id}.txt"
    # log = f"uclust.{percent_id}.log"
    if find_files(centroids) and find_files(uclust):
        tsprint(f"Found vsearch results at percent identity {percent_id} from prior run.")
    else:
        try:
            command(f"vsearch --quiet --cluster_fast {genes} --id {percent_id/100.0} --threads {num_threads} --centroids {centroids} --uc {uclust}")
        except:
            # Do not keep bogus zero-length files; those are harmful if we rerun in place.
            command(f"mv {centroids} {centroids}.bogus", check=False)
            command(f"mv {uclust} {uclust}.bogus", check=False)
            raise
    return centroids, uclust  # , log

def species_work(species_id):
    assert species_id in species, f"Species {species_id} is not in the database."
    species_genomes = species[species_id]

    def destpath(src):
        return pangenome_file(species_id, src + ".lz4")

    # The species build will upload this file last, after everything else is successfully uploaded.
    # Therefore, if this file exists in s3, there is no need to redo the species build.
    dest_file = destpath("gene_info.txt")
    msg = f"Building pangenome for species {species_id} with {len(species_genomes)} total genomes."
    if find_files_with_retry(dest_file):
        if not args.force:
            tsprint(f"Destination {dest_file} for species {species_id} pangenome already exists.  Specify --force to overwrite.")
            return
        msg = msg.replace("Building", "Rebuilding")

    with CONCURRENT_SPECIES_BUILDS:
        tsprint(msg)
        slave_log = "pangenome_build.log"
        slave_subdir = str(species_id)
        if not args.debug:
            command(f"rm -rf {slave_subdir}")
        if not os.path.isdir(slave_subdir):
            command(f"mkdir {slave_subdir}")
        # Recursive call via subcommand.  Use subdir, redirect logs.
        slave_cmd = f"cd {slave_subdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_pangenome -s {species_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {'--debug' if args.debug else ''} &>> {slave_log}"
        with open(f"{slave_subdir}/{slave_log}", "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")
        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere with any
            # prior exceptions that may be more informative.  Hence check=False.
            upload(f"{slave_subdir}/{slave_log}", destpath(slave_log), check=False)
            if not args.debug:
                command(f"rm -rf {slave_subdir}", check=False)

def assert_have_aegea(min_version="3.2.1"):
    try:
        # Assert that aegea is installed and at least the minimum supported version.
        #
        # Tooling installed in the Dockerfile does not require these types of checks.
        #
        # A few of the iggtools admin subcommands (including this one) are supported to run
        # directly on a laptop or dev server, outside of docker, and (for extra operational
        # lightness and flexibility) even without being installed by a package manager --
        # with the downside of having to perform this check.  We should keep these checks
        # to a minimum.  If more creep up, we will require docker and erase these checks.
        #
        aegea, version = backtick("aegea --version | head -1").split()
        assert aegea == "aegea"
        vvv = tuple(int(v) for v in version.split("."))
        uuu = tuple(int(u) for u in min_version.split("."))
        assert vvv >= uuu, f"Aegea {version} is too old, please upgrade to {min_version} or above."
    except:
        tsprint("SUGGESTION: Please 'pip3 install --upgrade aegea'")
        raise

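# Illustration (editor's sketch, not part of the pipeline): the version check
# relies on lexicographic tuple comparison, which handles multi-digit version
# components correctly where a plain string comparison would not:
#
#     >>> tuple(int(v) for v in "3.10.1".split("."))
#     (3, 10, 1)
#     >>> (3, 10, 1) >= (3, 2, 1)
#     True
#     >>> "3.10.1" >= "3.2.1"   # naive string comparison gets this wrong
#     False
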
def normalize_counts(species_alns, total_gene_length):
    """ Normalize counts by gene length and sum constraint """
    species_abundance = {}
    for species_id, alns in species_alns.items():
        # Normalize by gene length to compute coverage.
        if alns:
            bp = sum(aln['aln'] for aln in alns)
            cov = float(bp) / total_gene_length[species_id]
        else:
            cov = 0.0
        # TODO: Use NamedTuple instead of dict
        species_abundance[species_id] = {'count': len(alns), 'cov': cov, 'rel_abun': 0.0}
    # Compute relative abundance.
    total_cov = sum(sav['cov'] for sav in species_abundance.values())
    if total_cov > 0:
        for sav in species_abundance.values():
            sav['rel_abun'] = sav['cov'] / total_cov
    tsprint(f"  total marker-gene coverage {total_cov:.3f}")
    return species_abundance

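# Worked example (illustrative values, not real data): with two species at
# coverages 4.0 and 1.0, total coverage is 5.0, so relative abundances come out
# to 4.0/5.0 = 0.8 and 1.0/5.0 = 0.2, summing to 1 across covered species:
#
#     >>> covs = {"sp1": 4.0, "sp2": 1.0}
#     >>> total = sum(covs.values())
#     >>> {sp: c / total for sp, c in covs.items()}
#     {'sp1': 0.8, 'sp2': 0.2}
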
def bowtie2_align(args, bt2_db_dir, bt2_db_name, sort_aln=False):
    """
    Use Bowtie2 to map reads to the specified representative genomes, or to the
    collections of centroid genes for the pangenome flow.
    """
    if args.debug and os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.bam"):
        tsprint(f"Skipping Bowtie2 alignment in debug mode as temporary data exists: {bt2_db_dir}/{bt2_db_name}.bam")
        return

    # Construct bowtie2 align input arguments.
    max_reads = f"-u {args.max_reads}" if args.max_reads else ""
    aln_mode = "local" if args.aln_mode == "local" else "end-to-end"
    # Bowtie2 speed presets carry a "-local" suffix in local alignment mode.
    aln_speed = args.aln_speed if aln_mode == "end-to-end" else args.aln_speed + "-local"
    r2 = ""
    if args.r2:
        r1 = f"-1 {args.r1}"
        r2 = f"-2 {args.r2}"
    elif args.aln_interleaved:
        r1 = f"--interleaved {args.r1}"
    else:
        r1 = f"-U {args.r1}"

    try:
        bt2_command = f"bowtie2 --no-unal -x {bt2_db_dir}/{bt2_db_name} {max_reads} --{aln_mode} --{aln_speed} --threads {num_physical_cores} -q {r1} {r2}"
        if sort_aln:
            command(f"set -o pipefail; {bt2_command} | \
                    samtools view --threads {num_physical_cores} -b - | \
                    samtools sort --threads {num_physical_cores} -o {bt2_db_dir}/{bt2_db_name}.bam")
        else:
            command(f"set -o pipefail; {bt2_command} | \
                    samtools view --threads {num_physical_cores} -b - > {bt2_db_dir}/{bt2_db_name}.bam")
    except:
        tsprint(f"Bowtie2 align to {bt2_db_dir}/{bt2_db_name}.bam ran into an error.")
        command(f"rm -f {bt2_db_dir}/{bt2_db_name}.bam")
        raise

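# Illustration (editor's note, assuming aln_speed "very-sensitive"): Bowtie2
# names its speed presets differently in the two alignment modes, which is why
# the "-local" suffix is appended above:
#
#     aln_mode "end-to-end"  ->  --end-to-end --very-sensitive
#     aln_mode "local"       ->  --local --very-sensitive-local
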
def hmmsearch(genome_id, species_id, marker_genes_hmm, num_threads=1):
    # Input
    annotated_genes_s3_path = input_annotations_file(genome_id, species_id, f"{genome_id}.faa.lz4")
    annotated_genes = download_reference(annotated_genes_s3_path)

    # Output
    hmmsearch_file = f"{genome_id}.hmmsearch"

    # Command
    if find_files(hmmsearch_file):
        # This only happens in debug mode, where we can use a pre-existing file.
        tsprint(f"Found hmmsearch results for genome {genome_id} from prior run.")
    else:
        try:
            command(f"hmmsearch --noali --cpu {num_threads} --domtblout {hmmsearch_file} {marker_genes_hmm} {annotated_genes}")
        except:
            # Do not keep bogus zero-length files; those are harmful if we rerun in place.
            command(f"mv {hmmsearch_file} {hmmsearch_file}.bogus", check=False)
            raise
    return hmmsearch_file

def midas_run_genes(args):

    tempdir = f"{args.outdir}/genes/temp_sc{args.species_cov}"

    if args.debug and os.path.exists(tempdir):
        tsprint(f"INFO: Reusing existing temp data in {tempdir} according to --debug flag.")
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    try:
        # The full species profile must exist -- it is output by midas_run_species.
        # Restrict to species above the requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile, args.species_cov)

        def download_centroid(species_id):
            return download_reference(pangenome_file(species_id, "centroids.ffn.lz4"), f"{tempdir}/{species_id}")  # TODO colocate samples to overlap reference downloads

        # Download centroids.ffn for every species in the restricted species profile.
        centroids_files = multithreading_hashmap(download_centroid, species_profile.keys(), num_threads=20)

        # Perhaps avoid this giant conglomerated file, fetching instead submaps for each species.
        # Also colocate/cache/download in master for multiple slave subcommand invocations.
        bt2_db_name = "pangenomes"
        build_bowtie2_db(tempdir, bt2_db_name, centroids_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=False)

        # Compute pangenome coverage for each present species and write results to disk.
        marker_genes_map = "s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4"
        species, genes = scan_centroids(centroids_files)
        num_covered_genes, species_mean_coverage, covered_genes = count_mapped_bp(args, tempdir, genes)
        markers = scan_markers(genes, marker_genes_map)
        species_markers_coverage = normalize(genes, covered_genes, markers)
        write_results(args.outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage)
    except:
        if not args.debug:
            tsprint("Deleting untrustworthy outputs due to error.  Specify --debug flag to keep.")
            command(f"rm -rf {tempdir}", check=False)

def init_nvme(args):
    # TODO: Generalize the magic numbers 838 and 1715518 (those are for AWS instance type r5.12xlarge).  pylint: disable=fixme
    # https://github.com/czbiohub/iggtools/issues/17
    if nvme_size_str() != '1715518':
        # Raid, format, and mount the NVME drives attached to this instance.
        tsprint("Initializing instance NVME storage.")
        try:
            command("""set -o pipefail; lsblk | grep 838 | awk '{print "/dev/"$1}' | xargs -n 10 s3mi raid nvme""")
        except Exception as e:
            try:
                # Sometimes we've formatted it in a prior incarnation, but the mountpoint can't exist in the container to tell us.
                # In those cases we can just try to mount it.
                command("""mount /dev/md0 /mnt/nvme""")
            except:
                raise e
        assert nvme_size_str() == '1715518', "Failed to initialize and mount instance NVME storage."
    else:
        tsprint("Instance NVME storage previously initialized.")
        if args.force:
            tsprint("Ignoring --force argument.  It is usually unnecessary to reinitialize AWS instance storage.")

def decode_genomes_arg(args, genomes):
    selected_genomes = set()
    try:  # pylint: disable=too-many-nested-blocks
        if args.genomes.upper() == "ALL":
            selected_genomes = set(genomes)
        else:
            for g in args.genomes.split(","):
                if ":" not in g:
                    selected_genomes.add(g)
                else:
                    i, n = g.split(":")
                    i = int(i)
                    n = int(n)
                    assert 0 <= i < n, f"Genome class and modulus make no sense: {i}, {n}"
                    for gid in genomes:
                        gid_int = int(gid.replace("GUT_GENOME", ""))
                        if gid_int % n == i:
                            selected_genomes.add(gid)
    except:
        tsprint(f"ERROR: Genomes argument is not a list of genome ids or slices: {g}")
        raise
    return sorted(selected_genomes)

def count_mapped_bp(args, tempdir, genes):
    """
    Count the number of bp mapped to each gene across pangenomes.  Return the
    number of covered genes and average gene depth per species.  The result
    contains only covered species, but being a defaultdict, it would yield 0
    for any uncovered species, which is appropriate.
    """
    bam_path = f"{tempdir}/pangenomes.bam"
    bamfile = AlignmentFile(bam_path, "rb")
    covered_genes = {}

    # Loop over alignments, sum values per gene.
    for aln in bamfile.fetch(until_eof=True):
        gene_id = bamfile.getrname(aln.reference_id)
        gene = genes[gene_id]
        gene["aligned_reads"] += 1
        if keep_read(aln, args.aln_mapid, args.aln_readq, args.aln_mapq, args.aln_cov):
            gene["mapped_reads"] += 1
            gene["depth"] += len(aln.query_alignment_sequence) / float(gene["length"])
            covered_genes[gene_id] = gene

    tsprint("Pangenome count_mapped_bp: total aligned reads: %s" % sum(g["aligned_reads"] for g in genes.values()))
    tsprint("Pangenome count_mapped_bp: total mapped reads: %s" % sum(g["mapped_reads"] for g in genes.values()))

    # Filter to genes with non-zero depth, then group by species.
    nonzero_gene_depths = defaultdict(list)
    for g in covered_genes.values():
        gene_depth = g["depth"]
        if gene_depth > 0:  # This should always pass, because args.aln_cov is always > 0.
            species_id = g["species_id"]
            nonzero_gene_depths[species_id].append(gene_depth)

    # Compute the number of covered genes per species, and average gene depth.
    num_covered_genes = defaultdict(int)
    mean_coverage = defaultdict(float)
    for species_id, non_zero_depths in nonzero_gene_depths.items():
        num_covered_genes[species_id] = len(non_zero_depths)
        mean_coverage[species_id] = np.mean(non_zero_depths)

    return num_covered_genes, mean_coverage, covered_genes

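# Worked example (illustrative values, not real data): a gene of length 1000 bp
# hit by two kept reads whose aligned portions are 100 bp and 150 bp accumulates
# depth (100 + 150) / 1000 = 0.25.
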
def find_best_hits(args, marker_info, m8_file, marker_cutoffs):
    """ Find the top scoring alignment for each read """
    best_hits = {}
    i = 0
    with InputStream(m8_file) as m8_stream:
        for aln in select_from_tsv(m8_stream, schema=BLAST_M8_SCHEMA, result_structure=dict):
            i += 1
            cutoff = args.aln_mapid
            if cutoff is None:
                marker_id = marker_info[aln['target']]['marker_id']  # get gene family from marker_info
                cutoff = marker_cutoffs[marker_id]
            if aln['pid'] < cutoff:  # does not meet marker cutoff
                continue
            if query_coverage(aln) < args.aln_cov:  # filter local alignments
                continue
            if aln['query'] not in best_hits:  # record aln
                best_hits[aln['query']] = [aln]
            elif best_hits[aln['query']][0]['score'] == aln['score']:  # add aln
                best_hits[aln['query']] += [aln]
            elif best_hits[aln['query']][0]['score'] < aln['score']:  # update aln
                best_hits[aln['query']] = [aln]
    tsprint(f"  total alignments: {i}")
    return list(best_hits.values())

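# Illustration (editor's sketch with made-up alignments): reads that tie on the
# top score keep all tied alignments, so assign_unique later counts them as
# ambiguous rather than unique:
#
#     >>> hits = {}
#     >>> for aln in ({'query': 'r1', 'score': 50}, {'query': 'r1', 'score': 50}):
#     ...     if aln['query'] not in hits:
#     ...         hits[aln['query']] = [aln]
#     ...     elif hits[aln['query']][0]['score'] == aln['score']:
#     ...         hits[aln['query']] += [aln]
#     >>> len(hits['r1'])
#     2
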
def build_bowtie2_db(bt2_db_dir, bt2_db_name, downloaded_files):
    """
    Build a Bowtie2 database of representative genomes or centroid genes for the
    species present in the sample, e.g. repgenomes OR pangenomes.
    """
    bt2_db_suffixes = ["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"]
    if all(os.path.exists(f"{bt2_db_dir}/{bt2_db_name}.{ext}") for ext in bt2_db_suffixes):
        tsprint("Skipping bowtie2-build as database files appear to exist.")
        return
    command(f"rm -f {bt2_db_dir}/{bt2_db_name}.fa")
    command(f"touch {bt2_db_dir}/{bt2_db_name}.fa")
    for files in split(downloaded_files.values(), 20):  # keep "cat" commands short
        command("cat " + " ".join(files) + f" >> {bt2_db_dir}/{bt2_db_name}.fa")
    command(f"bowtie2-build --threads {num_physical_cores} {bt2_db_dir}/{bt2_db_name}.fa {bt2_db_dir}/{bt2_db_name} > {bt2_db_dir}/bowtie2-build.log")

def init(args):
    """
    Input spec:  https://github.com/czbiohub/iggtools/wiki#inputs
    Output spec: https://github.com/czbiohub/iggtools/wiki#target-layout-in-s3
    """

    msg = f"Building {outputs.genomes}."
    if find_files(outputs.genomes):
        if not args.force:
            tsprint(f"Destination {outputs.genomes} already exists.  Specify --force to overwrite.")
            return
        msg = f"Rebuilding {outputs.genomes}."
    tsprint(msg)

    id_remap = {}
    with InputStream(inputs.alt_species_ids) as ids:
        for row in select_from_tsv(ids, selected_columns=["alt_species_id", "species_id"]):
            new_id, old_id = row
            id_remap[old_id] = new_id

    seen_genomes, seen_species = set(), set()
    with OutputStream(outputs.genomes) as out:

        target_columns = ["genome", "species", "representative", "genome_is_representative"]
        out.write("\t".join(target_columns) + "\n")

        with InputStream(inputs.genomes2species) as g2s:
            for row in select_from_tsv(g2s, selected_columns=["MAG_code", "Species_id"]):
                genome, representative = row
                species = id_remap[representative]
                genome_is_representative = str(int(genome == representative))
                target_row = [genome, species, representative, genome_is_representative]
                out.write("\t".join(target_row) + "\n")
                seen_genomes.add(genome)
                seen_species.add(species)

    tsprint(f"Emitted {len(seen_genomes)} genomes and {len(seen_species)} species to {outputs.genomes}.")

def pysam_pileup(args, species_ids, tempdir, outputdir, contigs_files):
    """ Count alleles and run pileups per species in parallel """

    # Update alignment stats for species.
    species_pileup_stats = defaultdict()
    contigs_db_stats = {'species_counts': 0, 'total_seqs': 0, 'total_length': 0}

    mp = multiprocessing.Pool(num_physical_cores)
    argument_list = [(sp_id, args, tempdir, outputdir, contigs_files[sp_id], contigs_db_stats) for sp_id in species_ids]

    for species_id, aln_stats in mp.starmap(species_pileup, argument_list):
        sp_stats = {
            "genome_length": int(aln_stats['genome_length']),
            "covered_bases": int(aln_stats['covered_bases']),
            "total_depth": int(aln_stats['total_depth']),
            "aligned_reads": int(aln_stats['aligned_reads']),
            "mapped_reads": int(aln_stats['mapped_reads']),
            "fraction_covered": 0.0,
            "mean_coverage": 0.0,
        }
        if sp_stats["genome_length"] > 0:
            sp_stats["fraction_covered"] = format(sp_stats["covered_bases"] / sp_stats["genome_length"], DECIMALS)
        if sp_stats["covered_bases"] > 0:
            sp_stats["mean_coverage"] = format(sp_stats["total_depth"] / sp_stats["covered_bases"], DECIMALS)
        species_pileup_stats[species_id] = sp_stats

    tsprint(f"contigs_db_stats - total genomes: {contigs_db_stats['species_counts']}")
    tsprint(f"contigs_db_stats - total contigs: {contigs_db_stats['total_seqs']}")
    tsprint(f"contigs_db_stats - total base-pairs: {contigs_db_stats['total_length']}")

    return species_pileup_stats

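# Worked example (illustrative values, not real data): a genome_length of 1000
# with 250 covered bases and total depth 500 gives fraction_covered
# 250 / 1000 = 0.25 and mean_coverage 500 / 250 = 2.0.
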
def species_pileup(species_id, args, tempdir, outputdir, contig_file, contigs_db_stats):
    # Read in contigs information for the current species_id.
    contigs = {}
    contigs_db_stats['species_counts'] += 1  # not being updated and passed as expected
    with InputStream(contig_file) as file:
        for rec in Bio.SeqIO.parse(file, 'fasta'):
            contigs[rec.id] = {
                "species_id": species_id,
                "contig_len": int(len(rec.seq)),
                "contig_seq": str(rec.seq),
            }
            contigs_db_stats['total_length'] += contigs[rec.id]["contig_len"]
            contigs_db_stats['total_seqs'] += 1

    # Summary statistics
    aln_stats = {
        "genome_length": 0,
        "total_depth": 0,
        "covered_bases": 0,
        "aligned_reads": 0,
        "mapped_reads": 0,
    }

    def keep_read(x):
        return keep_read_worker(x, args, aln_stats)

    header = ['ref_id', 'ref_pos', 'ref_allele', 'depth', 'count_a', 'count_c', 'count_g', 'count_t']

    path = f"{outputdir}/{species_id}.snps.lz4"
    with OutputStream(path) as file:
        file.write('\t'.join(header) + '\n')
        zero_rows_allowed = not args.sparse

        # Loop over alignments for the current species's contigs.
        with AlignmentFile(f"{tempdir}/repgenomes.bam") as bamfile:
            for contig_id in sorted(list(contigs.keys())):  # why need to sort?
                contig = contigs[contig_id]
                counts = bamfile.count_coverage(
                    contig_id,
                    start=0,
                    end=contig["contig_len"],
                    quality_threshold=args.aln_baseq,
                    read_callback=keep_read)
                for ref_pos in range(0, contig["contig_len"]):
                    ref_allele = contig["contig_seq"][ref_pos]
                    depth = sum([counts[nt][ref_pos] for nt in range(4)])
                    count_a = counts[0][ref_pos]
                    count_c = counts[1][ref_pos]
                    count_g = counts[2][ref_pos]
                    count_t = counts[3][ref_pos]
                    values = [contig_id, ref_pos + 1, ref_allele, depth, count_a, count_c, count_g, count_t]
                    if depth > 0 or zero_rows_allowed:
                        file.write('\t'.join(str(val) for val in values) + '\n')
                    aln_stats['genome_length'] += 1
                    aln_stats['total_depth'] += depth
                    if depth > 0:
                        aln_stats['covered_bases'] += 1

    tsprint(json.dumps({species_id: aln_stats}, indent=4))
    return (species_id, {k: str(v) for k, v in aln_stats.items()})

def main(args):
    tsprint(f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}.")
    aws_batch_submit(args)

def main(args):
    tsprint(f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}.")
    build_pangenome(args)

def main(args):
    tsprint(f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}.")
    init(args)

def main(args):
    tsprint(f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}.")
    annotate_genes(args)

def main(args):
    tsprint(f"Doing important work in subcommand {args.subcommand} with args {vars(args)}")

def main(args):
    tsprint(f"Doing important work in subcommand {args.subcommand} with args\n{json.dumps(vars(args), indent=4)}")
    midas_run_species(args)

def main(args):
    tsprint(f"Executing iggtools subcommand {args.subcommand} with args {vars(args)}.")
    build_marker_genes(args)