def import_uhgg_slave(args):
    """
    Import one genome; this is the per-genome subcommand of import_uhgg.

    https://github.com/czbiohub/iggtools/wiki

    Reads the table of contents at args.zzz_slave_toc to find the species of
    genome args.genomes and that species' representative genome, removes
    everything already stored under the genome's import prefix in s3, then
    cleans the genome and uploads the result as {genome_id}.fna.lz4.

    Raises AssertionError if invoked without --zzz_slave_mode or if the table
    of contents file is missing.
    """
    # The message names this subcommand so a misuse report points at the right place.
    violation = "Please do not call import_uhgg_slave directly. Violation"
    assert args.zzz_slave_mode, f"{violation}: Missing --zzz_slave_mode arg."
    assert os.path.isfile(args.zzz_slave_toc), f"{violation}: File does not exist: {args.zzz_slave_toc}"

    db = UHGG(args.zzz_slave_toc)
    representatives = db.representatives
    species_for_genome = db.genomes

    genome_id = args.genomes
    species_id = species_for_genome[genome_id]
    representative_id = representatives[species_id]

    dest = imported_genome_file(genome_id, species_id, f"{genome_id}.fna.lz4")

    # Clear any previous import of this genome before producing a fresh one.
    command(f"aws s3 rm --recursive {imported_genome_file(genome_id, species_id, '')}")
    cleaned = clean_genome(genome_id, representative_id)
    upload(cleaned, dest)
def identify_marker_genes(genome_id, species_id, marker_genes_hmm):
    """
    Run hmmsearch for one genome and write its marker gene sequences and map.

    Removes the genome's existing marker-gene outputs from s3, runs hmmsearch,
    then writes two local files from the hits: {genome_id}.markers.fa (one
    upper-cased sequence per hit) and {genome_id}.markers.map (tab-separated
    species_id, genome_id, query, sequence length, target).

    Returns the list [hmmsearch file, sequence file, map file].
    """
    stale_prefix = output_marker_genes_file(genome_id, species_id, '')
    command(f"aws s3 rm --recursive {stale_prefix}")

    hits_path = hmmsearch(genome_id, species_id, marker_genes_hmm, num_threads=1)

    ffn_s3_path = input_annotations_file(genome_id, species_id, f"{genome_id}.ffn.lz4")
    gene_seqs = fetch_genes(ffn_s3_path)

    # Parse the local hmmsearch file.
    seq_path = f"{genome_id}.markers.fa"
    map_path = f"{genome_id}.markers.map"
    with open(seq_path, "w") as seq_out, open(map_path, "w") as map_out:
        for hit in find_hits(hits_path):
            query = hit["query"]
            sequence = gene_seqs[query].upper()
            fields = (species_id, genome_id, query, len(sequence), hit["target"])
            map_out.write('\t'.join(map(str, fields)) + '\n')
            seq_out.write('>%s\n%s\n' % (query, sequence))

    output_files = [hits_path, seq_path, map_path]
    # The map file must come last: its presence signals that all other files succeeded.
    assert output_files[-1] == lastoutput(genome_id)
    return output_files
def midas_run_species(args):
    """
    Estimate species abundance for one sample from marker-gene alignments.

    Recreates {args.outdir}/species/temp/, downloads the phyeco marker
    database files, aligns the reads against it, classifies the best hits
    into per-species alignments, normalizes them by total marker gene length,
    and writes the abundance table under args.outdir.
    """
    tempdir = f"{args.outdir}/species/temp/"
    command(f"rm -rf {tempdir}")
    command(f"mkdir -p {tempdir}")

    # The index files come first and the map file last; both positions are used below.
    index_exts = ["", ".bwt", ".header", ".sa", ".sequence"]
    remote_db_files = [
        f"s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.fa{ext}.lz4"
        for ext in index_exts
    ]
    remote_db_files.append("s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4")
    markers_db_files = multithreading_map(download_reference, remote_db_files)
    markers_fasta = markers_db_files[0]
    markers_map = markers_db_files[-1]

    db = UHGG()
    species_info = db.species
    marker_info = read_marker_info_repgenomes(markers_map)

    with TimedSection("aligning reads to marker-genes database"):
        m8_file = map_reads_hsblast(
            tempdir, args.r1, args.r2, args.word_size, markers_fasta, args.max_reads)

    with InputStream(params.inputs.marker_genes_hmm_cutoffs) as cutoff_params:
        cutoff_columns = {"marker_id": str, "marker_cutoff": float}
        marker_cutoffs = dict(select_from_tsv(cutoff_params, selected_columns=cutoff_columns))

    with TimedSection("classifying reads"):
        best_hits = find_best_hits(args, marker_info, m8_file, marker_cutoffs)
        unique_alns = assign_unique(best_hits, species_info, marker_info)
        species_alns = assign_non_unique(best_hits, unique_alns, marker_info)

    with TimedSection("estimating species abundance"):
        total_gene_length = sum_marker_gene_lengths(marker_info)
        species_abundance = normalize_counts(species_alns, total_gene_length)

    write_abundance(args.outdir, species_abundance)
def init_nvme(args):
    """
    Raid, format and mount the instance NVME drives unless already mounted.

    The storage counts as initialized when nvme_size_str() reports the
    expected size. If it does not, the drives are raided via s3mi; when that
    fails, a plain mount of /dev/md0 is attempted, and if the mount also fails
    the original raid error is raised. Raises AssertionError if the expected
    size is still not reported afterwards.
    """
    # TODO: Generalize the magic numbers 838 and 1715518 (those are for AWS instance type r5.12xlarge). # pylint: disable=fixme
    # https://github.com/czbiohub/iggtools/issues/17
    expected_size = '1715518'

    if nvme_size_str() == expected_size:
        tsprint("Instance NVME storage previously initialized.")
        if args.force:
            tsprint(
                "Ignoring --force argument. It is usually unnecessary to reinitialize AWS instance storage."
            )
        return

    # Raid, format, and mount the NVME drives attached to this instance.
    tsprint("Initializing instance NVME storage.")
    try:
        command(
            """set -o pipefail; lsblk | grep 838 | awk '{print "/dev/"$1}' | xargs -n 10 s3mi raid nvme"""
        )
    except Exception as raid_error:
        try:
            # Sometimes we've formatted it in a prior incarnation but the mountpoint
            # can't exist in the container to tell us. In those cases we can just
            # try to mount it.
            command("""mount /dev/md0 /mnt/nvme""")
        except:
            # The raid failure is the more informative of the two errors.
            raise raid_error

    assert nvme_size_str() == expected_size, "Failed to initialize and mount instance NVME storage."
def annotate_genes_slave(args):
    """
    Annotate one genome; this is the per-genome subcommand of annotate_genes.

    https://github.com/czbiohub/iggtools/wiki

    Reads the table of contents at args.zzz_slave_toc to find the species of
    genome args.genomes, removes the genome's existing annotation outputs
    from s3, runs annotate_genome, and uploads every output file. The
    {genome_id}.fna.lz4 file is uploaded last so its presence in s3 marks a
    complete run.

    Raises AssertionError if invoked without --zzz_slave_mode or if the table
    of contents file is missing.
    """
    # The message names this subcommand so a misuse report points at the right place.
    violation = "Please do not call annotate_genes_slave directly. Violation"
    assert args.zzz_slave_mode, f"{violation}: Missing --zzz_slave_mode arg."
    assert os.path.isfile(args.zzz_slave_toc), f"{violation}: File does not exist: {args.zzz_slave_toc}"

    db = UHGG(args.zzz_slave_toc)
    species_for_genome = db.genomes

    genome_id = args.genomes
    species_id = species_for_genome[genome_id]

    last_output = f"{genome_id}.fna.lz4"
    dest_file = annotations_file(genome_id, species_id, last_output)

    # Remove the whole destination directory (everything up to the final "/").
    command(f"aws s3 rm --recursive {dest_file.rsplit('/', 1)[0]}")

    output_files = annotate_genome(genome_id, species_id)

    # Everything except the completion marker is uploaded in parallel.
    upload_tasks = [
        (o, annotations_file(genome_id, species_id, o + ".lz4"))
        for o in output_files
        if o + ".lz4" != last_output
    ]
    multithreading_map(upload_star, upload_tasks)

    # Upload this last because it indicates all other work has succeeded.
    upload(drop_lz4(last_output), annotations_file(genome_id, species_id, last_output))
def build_pangenome_master(args):
    """
    Build pangenomes for the requested species, one subcommand per species.

    Copies the table of contents from s3 to a local file, then for every
    species selected by decode_species_arg runs `iggtools build_pangenome`
    in a per-species subdirectory, skipping species whose gene_info.txt
    already exists in s3 unless args.force is set.
    """
    # Fetch table of contents from s3.
    # Each species build subcommand reads it separately, so we make a local copy.
    local_toc = os.path.basename(outputs.genomes)
    command(f"rm -f {local_toc}")
    command(f"aws s3 cp --only-show-errors {outputs.genomes} {local_toc}")

    species = UHGG(local_toc).species

    def species_work(species_id):
        assert species_id in species, f"Species {species_id} is not in the database."
        species_genomes = species[species_id]

        def destpath(src):
            return pangenome_file(species_id, src + ".lz4")

        # The species build uploads this file last, after everything else has been
        # uploaded successfully, so finding it in s3 means the build need not be redone.
        dest_file = destpath("gene_info.txt")
        msg = f"Building pangenome for species {species_id} with {len(species_genomes)} total genomes."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for species {species_id} pangenome already exists. Specify --force to overwrite."
                )
                return
            msg = msg.replace("Building", "Rebuilding")

        with CONCURRENT_SPECIES_BUILDS:
            tsprint(msg)
            log_name = "pangenome_build.log"
            workdir = str(species_id)
            log_path = f"{workdir}/{log_name}"
            debug_flag = '--debug' if args.debug else ''

            if not args.debug:
                command(f"rm -rf {workdir}")
            if not os.path.isdir(workdir):
                command(f"mkdir {workdir}")

            # Recursive call via subcommand. Use subdir, redirect logs.
            slave_cmd = f"cd {workdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_pangenome -s {species_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {debug_flag} &>> {log_name}"
            with open(log_path, "w") as slog:
                slog.write(msg + "\n")
                slog.write(slave_cmd + "\n")

            try:
                command(slave_cmd)
            finally:
                # Cleanup should not raise exceptions of its own, so as not to interfere
                # with any prior exceptions that may be more informative. Hence check=False.
                upload(log_path, destpath(log_name), check=False)
                if not args.debug:
                    command(f"rm -rf {workdir}", check=False)

    # Check for destination presence in s3 with up to 10-way concurrency.
    # If destination is absent, commence build with up to 3-way concurrency as
    # constrained by CONCURRENT_SPECIES_BUILDS.
    species_id_list = decode_species_arg(args, species)
    multithreading_map(species_work, species_id_list, num_threads=10)
def midas_run_snps(args):
    """
    Call SNPs for one sample against representative genomes.

    Selects the species above args.species_cov from the sample's species
    profile, downloads each one's representative genome, builds a Bowtie2
    database from them, aligns and indexes the reads, runs the pileup and
    writes summary.txt into the output directory.

    On any error the temp and output directories are deleted (unless
    args.debug is set) and the error is re-raised, so a failed run is never
    reported as a success.
    """
    tempdir = f"{args.outdir}/snps/temp_sc{args.species_cov}"
    if args.debug and os.path.exists(tempdir):
        tsprint(f"INFO: Reusing existing temp data in {tempdir} according to --debug flag.")
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    outputdir = f"{args.outdir}/snps/output_sc{args.species_cov}"
    if not os.path.exists(outputdir):
        command(f"mkdir -p {outputdir}")

    try:
        # The full species profile must exist -- it is output by run_midas_species.
        # Restrict to species above requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile, args.species_cov)

        local_toc = download_reference(outputs.genomes)
        db = UHGG(local_toc)
        representatives = db.representatives

        def download_contigs(species_id):
            return download_reference(
                imported_genome_file(representatives[species_id], species_id, "fna.lz4"),
                f"{tempdir}/{species_id}")

        # Download repgenome_id.fna for every species in the restricted species profile.
        contigs_files = multithreading_hashmap(download_contigs, species_profile.keys(), num_threads=20)

        # Use Bowtie2 to map reads to the representative genomes.
        bt2_db_name = "repgenomes"
        build_bowtie2_db(tempdir, bt2_db_name, contigs_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=True)

        # Use mpileup to identify SNPs.
        samtools_index(args, tempdir, bt2_db_name)
        species_pileup_stats = pysam_pileup(
            args, list(species_profile.keys()), tempdir, outputdir, contigs_files)

        write_snps_summary(species_pileup_stats, f"{outputdir}/summary.txt")
    except:
        # Bare except on purpose: cleanup must also happen on KeyboardInterrupt.
        if not args.debug:
            tsprint("Deleting untrustworthy outputs due to error. Specify --debug flag to keep.")
            command(f"rm -rf {tempdir}", check=False)
            command(f"rm -rf {outputdir}", check=False)
        # Propagate the failure; without this the caller would see a normal return.
        raise
def import_uhgg_master(args):
    """
    Import the requested genomes, one subcommand per genome.

    Copies the table of contents from s3 to a local file, then for every
    genome selected by decode_genomes_arg runs `iggtools import_uhgg` in a
    per-genome subdirectory, skipping genomes whose {genome_id}.fna.lz4
    already exists in s3 unless args.force is set.
    """
    # Fetch table of contents from s3.
    # Each subcommand reads it separately, so we make a local copy.
    local_toc = os.path.basename(outputs.genomes)
    command(f"rm -f {local_toc}")
    command(f"aws s3 cp --only-show-errors {outputs.genomes} {local_toc}")

    species_for_genome = UHGG(local_toc).genomes

    def genome_work(genome_id):
        assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
        species_id = species_for_genome[genome_id]

        dest_file = imported_genome_file(genome_id, species_id, f"{genome_id}.fna.lz4")
        msg = f"Importing genome {genome_id} from species {species_id}."
        if find_files_with_retry(dest_file):
            if not args.force:
                tsprint(
                    f"Destination {dest_file} for genome {genome_id} already exists. Specify --force to overwrite."
                )
                return
            msg = msg.replace("Importing", "Reimporting")

        tsprint(msg)
        log_name = "import_uhgg.log"
        workdir = f"{species_id}__{genome_id}"
        log_path = f"{workdir}/{log_name}"
        debug_flag = '--debug' if args.debug else ''

        if not args.debug:
            command(f"rm -rf {workdir}")
        if not os.path.isdir(workdir):
            command(f"mkdir {workdir}")

        # Recursive call via subcommand. Use subdir, redirect logs.
        slave_cmd = f"cd {workdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools import_uhgg --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {debug_flag} &>> {log_name}"
        with open(log_path, "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")

        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere
            # with any prior exceptions that may be more informative. Hence check=False.
            upload(log_path, imported_genome_file(genome_id, species_id, log_name + ".lz4"), check=False)
            if not args.debug:
                command(f"rm -rf {workdir}", check=False)

    genome_id_list = decode_genomes_arg(args, species_for_genome)
    multithreading_map(genome_work, genome_id_list, num_threads=CONCURRENT_GENOME_IMPORTS)
def genome_work(genome_id):
    """
    Run the build_marker_genes subcommand for one genome in its own subdirectory.

    Relies on names from the enclosing scope: args, species_for_genome,
    destpath, lastoutput, local_toc and marker_genes_hmm. Returns without
    doing anything when the genome's last output already exists in s3 and
    args.force is not set.
    """
    assert genome_id in species_for_genome, f"Genome {genome_id} is not in the database."
    species_id = species_for_genome[genome_id]

    dest_file = destpath(genome_id, species_id, lastoutput(genome_id))
    msg = f"Running HMMsearch for genome {genome_id} from species {species_id}."
    if find_files_with_retry(dest_file):
        if not args.force:
            tsprint(f"Destination {dest_file} for genome {genome_id} already exists. Specify --force to overwrite.")
            return
        msg = msg.replace("Running", "Rerunning")

    tsprint(msg)
    log_name = "build_marker_genes.log"
    workdir = f"{species_id}__{genome_id}"
    log_path = f"{workdir}/{log_name}"
    debug_flag = '--debug' if args.debug else ''

    if not args.debug:
        command(f"rm -rf {workdir}")
    if not os.path.isdir(workdir):
        command(f"mkdir {workdir}")

    # Recursive call via subcommand. Use subdir, redirect logs.
    slave_cmd = f"cd {workdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_marker_genes --genome {genome_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} --zzz_slave_marker_genes_hmm {os.path.abspath(marker_genes_hmm)} {debug_flag} &>> {log_name}"
    with open(log_path, "w") as slog:
        slog.write(msg + "\n")
        slog.write(slave_cmd + "\n")

    try:
        command(slave_cmd)
    finally:
        # Cleanup should not raise exceptions of its own, so as not to interfere
        # with any prior exceptions that may be more informative. Hence check=False.
        upload(log_path, destpath(genome_id, species_id, log_name), check=False)
        if not args.debug:
            command(f"rm -rf {workdir}", check=False)
def samtools_index(args, bt2_db_dir, bt2_db_name):
    """
    Index {bt2_db_dir}/{bt2_db_name}.bam with samtools.

    In debug mode an existing .bam.bai file is reused. If indexing fails,
    the index file is removed and the error is re-raised.
    """
    bam_path = f"{bt2_db_dir}/{bt2_db_name}.bam"
    bai_path = f"{bam_path}.bai"

    if args.debug and os.path.exists(bai_path):
        tsprint(f"Skipping samtools index in debug mode as temporary data exists: {bam_path}")
        return

    try:
        command(f"samtools index -@ {num_physical_cores} {bam_path}")
    except:
        # Do not leave an index behind that a later debug run would reuse.
        command(f"rm -f {bai_path}")
        raise
def hmmsearch(genome_id, species_id, marker_genes_hmm, num_threads=1):
    """
    Run hmmsearch on a genome's annotated proteins and return the results path.

    Downloads {genome_id}.faa.lz4 and writes the domain table to
    {genome_id}.hmmsearch in the current directory. An existing results file
    is reused. On failure the results file is renamed with a .bogus suffix
    and the error is re-raised.
    """
    # Input
    faa_s3_path = input_annotations_file(genome_id, species_id, f"{genome_id}.faa.lz4")
    local_faa = download_reference(faa_s3_path)

    # Output
    hmmsearch_file = f"{genome_id}.hmmsearch"

    if find_files(hmmsearch_file):
        # This only happens in debug mode, where we can use pre-existing file.
        tsprint(f"Found hmmsearch results for genome {genome_id} from prior run.")
        return hmmsearch_file

    try:
        command(f"hmmsearch --noali --cpu {num_threads} --domtblout {hmmsearch_file} {marker_genes_hmm} {local_faa}")
    except:
        # Do not keep bogus zero-length files; those are harmful if we rerun in place.
        command(f"mv {hmmsearch_file} {hmmsearch_file}.bogus", check=False)
        raise
    return hmmsearch_file
def write_results(outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage):
    """
    Write per-species gene tables and the overall genes summary.

    For each species, {outdir}/genes/output/{species_id}.genes.lz4 gets one
    row per gene with non-zero depth. {outdir}/genes/summary.txt gets one row
    per species. `species` maps species_id to a mapping of gene_id to a gene
    record; the other three arguments are indexed by species_id.
    """
    genes_outdir = f"{outdir}/genes/output"
    if not os.path.exists(genes_outdir):
        command(f"mkdir -p {genes_outdir}")

    # One output file per species_id.
    gene_header = ['gene_id', 'count_reads', 'coverage', 'copy_number']
    for species_id, species_genes in species.items():
        with OutputStream(f"{outdir}/genes/output/{species_id}.genes.lz4") as sp_out:
            sp_out.write('\t'.join(gene_header) + '\n')
            for gene_id, gene in species_genes.items():
                if gene["depth"] == 0:
                    # Sparse by default here. You can get the pangenome_size from
                    # the summary file, emitted below.
                    continue
                row = [
                    gene_id,
                    str(gene["mapped_reads"]),
                    format(gene["depth"], DECIMALS),
                    format(gene["copies"], DECIMALS),
                ]
                sp_out.write('\t'.join(row) + '\n')

    # Summary stats
    summary_header = [
        'species_id', 'pangenome_size', 'covered_genes', 'fraction_covered',
        'mean_coverage', 'marker_coverage', 'aligned_reads', 'mapped_reads'
    ]
    with OutputStream(f"{outdir}/genes/summary.txt") as summary_out:
        summary_out.write('\t'.join(summary_header) + '\n')
        for species_id, species_genes in species.items():
            # No sparsity here -- should be extremely rare for a species row to be all 0.
            gene_records = species_genes.values()
            aligned_reads = sum(g["aligned_reads"] for g in gene_records)
            mapped_reads = sum(g["mapped_reads"] for g in gene_records)
            pangenome_size = len(species_genes)
            covered = num_covered_genes[species_id]
            row = [
                species_id,
                str(pangenome_size),
                str(covered),
                format(covered / pangenome_size, DECIMALS),
                format(species_mean_coverage[species_id], DECIMALS),
                format(species_markers_coverage[species_id], DECIMALS),
                str(aligned_reads),
                str(mapped_reads),
            ]
            summary_out.write('\t'.join(row) + '\n')
def species_work(species_id):
    """
    Run the build_pangenome subcommand for one species in its own subdirectory.

    Relies on names from the enclosing scope: args, species and local_toc.
    Returns without doing anything when the species' gene_info.txt already
    exists in s3 and args.force is not set.
    """
    assert species_id in species, f"Species {species_id} is not in the database."
    species_genomes = species[species_id]

    def destpath(src):
        return pangenome_file(species_id, src + ".lz4")

    # The species build uploads this file last, after everything else has been
    # uploaded successfully, so finding it in s3 means the build need not be redone.
    dest_file = destpath("gene_info.txt")
    msg = f"Building pangenome for species {species_id} with {len(species_genomes)} total genomes."
    if find_files_with_retry(dest_file):
        if not args.force:
            tsprint(
                f"Destination {dest_file} for species {species_id} pangenome already exists. Specify --force to overwrite."
            )
            return
        msg = msg.replace("Building", "Rebuilding")

    with CONCURRENT_SPECIES_BUILDS:
        tsprint(msg)
        log_name = "pangenome_build.log"
        workdir = str(species_id)
        log_path = f"{workdir}/{log_name}"
        debug_flag = '--debug' if args.debug else ''

        if not args.debug:
            command(f"rm -rf {workdir}")
        if not os.path.isdir(workdir):
            command(f"mkdir {workdir}")

        # Recursive call via subcommand. Use subdir, redirect logs.
        slave_cmd = f"cd {workdir}; PYTHONPATH={pythonpath()} {sys.executable} -m iggtools build_pangenome -s {species_id} --zzz_slave_mode --zzz_slave_toc {os.path.abspath(local_toc)} {debug_flag} &>> {log_name}"
        with open(log_path, "w") as slog:
            slog.write(msg + "\n")
            slog.write(slave_cmd + "\n")

        try:
            command(slave_cmd)
        finally:
            # Cleanup should not raise exceptions of its own, so as not to interfere
            # with any prior exceptions that may be more informative. Hence check=False.
            upload(log_path, destpath(log_name), check=False)
            if not args.debug:
                command(f"rm -rf {workdir}", check=False)
def build_bowtie2_db(bt2_db_dir, bt2_db_name, downloaded_files):
    """
    Build Bowtie2 database of representative genomes or centroid genes
    for the species present in the sample, e.g. repgenomes OR pangenomes.

    Concatenates the values of `downloaded_files` into
    {bt2_db_dir}/{bt2_db_name}.fa and runs bowtie2-build on it. Does nothing
    when all six .bt2 index files already exist.
    """
    db_prefix = f"{bt2_db_dir}/{bt2_db_name}"
    bt2_db_suffixes = ["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"]

    index_incomplete = any(
        not os.path.exists(f"{db_prefix}.{ext}") for ext in bt2_db_suffixes)
    if not index_incomplete:
        tsprint("Skipping bowtie2-build as database files appear to exist.")
        return

    # Start from an empty fasta, since the batches below are appended to it.
    fasta_path = f"{db_prefix}.fa"
    command(f"rm -f {fasta_path}")
    command(f"touch {fasta_path}")
    for batch in split(downloaded_files.values(), 20):  # keep "cat" commands short
        command("cat " + " ".join(batch) + f" >> {fasta_path}")

    command(
        f"bowtie2-build --threads {num_physical_cores} {fasta_path} {db_prefix} > {bt2_db_dir}/bowtie2-build.log"
    )
def vsearch(percent_id, genes, num_threads=num_vcpu):
    """
    Cluster `genes` with vsearch at the given percent identity.

    Returns the pair (centroids file, uclust file), both named after
    percent_id. Existing results are reused when both files are present. On
    failure both files are renamed with a .bogus suffix and the error is
    re-raised.
    """
    centroids = f"centroids.{percent_id}.ffn"
    uclust = f"uclust.{percent_id}.txt"
    outputs_pair = (centroids, uclust)

    if find_files(centroids) and find_files(uclust):
        tsprint(f"Found vsearch results at percent identity {percent_id} from prior run.")
        return outputs_pair

    try:
        command(
            f"vsearch --quiet --cluster_fast {genes} --id {percent_id/100.0} --threads {num_threads} --centroids {centroids} --uc {uclust}"
        )
    except:
        # Do not keep bogus zero-length files; those are harmful if we rerun in place.
        for leftover in outputs_pair:
            command(f"mv {leftover} {leftover}.bogus", check=False)
        raise
    return outputs_pair
def bowtie2_align(args, bt2_db_dir, bt2_db_name, sort_aln=False):
    """
    Use Bowtie2 to map reads to specified representative genomes or
    collections of centroids genes for the pangenome flow.

    Writes {bt2_db_dir}/{bt2_db_name}.bam, coordinate-sorted when sort_aln is
    true. In debug mode an existing BAM file is reused. If the pipeline
    fails, the BAM file is removed and the error is re-raised.

    Reads from args: debug, max_reads, aln_mode, aln_speed, r1, r2 and
    aln_interleaved.
    """
    bam_path = f"{bt2_db_dir}/{bt2_db_name}.bam"
    if args.debug and os.path.exists(bam_path):
        tsprint(f"Skipping Bowtie2 alignment in debug mode as temporary data exists: {bam_path}")
        return

    # Construct bowtie2 align input arguments.
    max_reads = f"-u {args.max_reads}" if args.max_reads else ""
    aln_mode = "local" if args.aln_mode == "local" else "end-to-end"
    # Bowtie2 preset names carry a "-local" suffix in local mode only. The
    # comparison must use the hyphenated spelling assigned to aln_mode above.
    aln_speed = args.aln_speed if aln_mode == "end-to-end" else args.aln_speed + "-local"

    r2 = ""
    if args.r2:
        r1 = f"-1 {args.r1}"
        r2 = f"-2 {args.r2}"
    elif args.aln_interleaved:
        r1 = f"--interleaved {args.r1}"
    else:
        r1 = f"-U {args.r1}"

    bt2_command = f"bowtie2 --no-unal -x {bt2_db_dir}/{bt2_db_name} {max_reads} --{aln_mode} --{aln_speed} --threads {num_physical_cores} -q {r1} {r2}"
    to_bam = f"samtools view --threads {num_physical_cores} -b -"
    if sort_aln:
        pipeline = f"{bt2_command} | {to_bam} | samtools sort --threads {num_physical_cores} -o {bam_path}"
    else:
        pipeline = f"{bt2_command} | {to_bam} > {bam_path}"

    try:
        # pipefail makes a bowtie2 failure fail the whole pipeline.
        command(f"set -o pipefail; {pipeline}")
    except:
        # Bare except on purpose: a partial BAM must not survive an interrupt either.
        tsprint(f"Bowtie2 align to {bam_path} run into error")
        command(f"rm -f {bam_path}")
        raise
def midas_run_genes(args):
    """
    Compute pangenome gene coverage for one sample.

    Selects the species above args.species_cov from the sample's species
    profile, downloads each one's centroids.ffn, builds a Bowtie2 database
    from them, aligns the reads, counts mapped bases per gene, normalizes by
    marker gene coverage and writes the results under args.outdir.

    On any error the temp directory is deleted (unless args.debug is set) and
    the error is re-raised, so a failed run is never reported as a success.
    """
    tempdir = f"{args.outdir}/genes/temp_sc{args.species_cov}"
    if args.debug and os.path.exists(tempdir):
        tsprint(f"INFO: Reusing existing temp data in {tempdir} according to --debug flag.")
    else:
        command(f"rm -rf {tempdir}")
        command(f"mkdir -p {tempdir}")

    try:
        # The full species profile must exist -- it is output by run_midas_species.
        # Restrict to species above requested coverage.
        full_species_profile = parse_species_profile(args.outdir)
        species_profile = select_species(full_species_profile, args.species_cov)

        def download_centroid(species_id):
            return download_reference(
                pangenome_file(species_id, "centroids.ffn.lz4"), f"{tempdir}/{species_id}")

        # TODO colocate samples to overlap reference downloads
        # Download centroids.ffn for every species in the restricted species profile.
        centroids_files = multithreading_hashmap(download_centroid, species_profile.keys(), num_threads=20)

        # Perhaps avoid this giant conglomerated file, fetching instead submaps for each species.
        # Also colocate/cache/download in master for multiple slave subcommand invocations.
        bt2_db_name = "pangenomes"
        build_bowtie2_db(tempdir, bt2_db_name, centroids_files)
        bowtie2_align(args, tempdir, bt2_db_name, sort_aln=False)

        # Compute coverage of pangenome for each present species and write results to disk.
        marker_genes_map = "s3://microbiome-igg/2.0/marker_genes/phyeco/phyeco.map.lz4"
        species, genes = scan_centroids(centroids_files)
        num_covered_genes, species_mean_coverage, covered_genes = count_mapped_bp(args, tempdir, genes)
        markers = scan_markers(genes, marker_genes_map)
        species_markers_coverage = normalize(genes, covered_genes, markers)

        write_results(args.outdir, species, num_covered_genes, species_markers_coverage, species_mean_coverage)
    except:
        # Bare except on purpose: cleanup must also happen on KeyboardInterrupt.
        if not args.debug:
            tsprint("Deleting untrustworthy outputs due to error. Specify --debug flag to keep.")
            command(f"rm -rf {tempdir}", check=False)
        # Propagate the failure; without this the caller would see a normal return.
        raise
def annotate_genome(genome_id, species_id):
    """
    Annotate one genome with Prokka and return the names of its output files.

    Downloads the cleaned genome to {genome_id}.fasta, runs Prokka into a
    fresh "prokka_dir" subdirectory, and moves the .faa, .ffn, .fna, .gff and
    .tsv outputs into the current directory.
    """
    # Prokka will crash if installed <6 months ago. It's a feature. See tbl2asn.
    cleaned_genome = imported_genome_file(genome_id, species_id, f"{genome_id}.fna.lz4")
    locus_tag = unified_genome_id(genome_id)

    download_genome(genome_id, cleaned_genome)

    prokka_dir = "prokka_dir"
    command(f"rm -rf {prokka_dir}")

    output_files = [f"{genome_id}.{ext}" for ext in ("faa", "ffn", "fna", "gff", "tsv")]
    command(
        f"prokka --kingdom Bacteria --outdir {prokka_dir} --cpus 8 --prefix {genome_id} --locustag {locus_tag} --compliant {genome_id}.fasta"
    )
    for produced in output_files:
        command(f"mv {prokka_dir}/{produced} .")
    return output_files
def download_genome(genome_id, cleaned_genome):
    """
    Download an lz4-compressed genome from s3 to {genome_id}.fasta.

    Any existing {genome_id}.fasta is removed first. The download is streamed
    through `lz4 -dc`; pipefail is set so that a failed `aws s3 cp` fails the
    command instead of being hidden by the exit status of lz4.
    """
    command(f"rm -f {genome_id}.fasta")
    command(
        f"set -o pipefail; aws s3 cp --only-show-errors {cleaned_genome} - | lz4 -dc > {genome_id}.fasta"
    )
def collate_repgenome_markers(args):
    """
    Collate the marker genes of every representative genome and index them.

    Downloads the .markers.fa and .markers.map files of each species'
    representative genome, concatenates them into one fasta and one map file,
    indexes the fasta with hs-blastn, and uploads the results. The log file
    is uploaded last. Returns early when the log already exists at the
    destination and args.force is not set.
    """
    db = UHGG()
    species = db.species
    representatives = db.representatives

    collate_log = "collate_repgenome_markers.log"
    collate_subdir = "collate_repgenome_markers"

    dest_file = destpath(collate_log)
    msg = "Collating marker genes sequences."
    if find_files_with_retry(dest_file):
        if not args.force:
            tsprint(f"Destination {dest_file} already exists. Specify --force to overwrite.")
            return
        first_word = msg.split(" ")[0]
        msg = msg.replace(first_word, "Re-" + first_word)

    tsprint(msg)
    if not args.debug:
        command(f"rm -rf {collate_subdir}")
    if not os.path.isdir(collate_subdir):
        command(f"mkdir {collate_subdir}")
    with open(f"{collate_subdir}/{collate_log}", "w") as slog:
        slog.write(msg + "\n")

    # Download
    download_seq_tasks = []
    download_map_tasks = []
    for species_id in species.keys():
        rep_id = representatives[species_id]
        remote_path_seq = input_marker_genes_file(rep_id, species_id, f"{rep_id}.markers.fa.lz4")
        remote_path_map = input_marker_genes_file(rep_id, species_id, f"{rep_id}.markers.map.lz4")
        download_seq_tasks.append((remote_path_seq, collate_subdir))
        download_map_tasks.append((remote_path_map, collate_subdir))
    # NOTE(review): each task is a (remote_path, local_dir) tuple passed to
    # download_reference as one argument -- confirm it accepts that form.
    downloaded_marker_seqs = multithreading_map(
        download_reference, download_seq_tasks, num_threads=CONCURRENT_MARKER_GENES_DOWNLOAD)
    downloaded_marker_maps = multithreading_map(
        download_reference, download_map_tasks, num_threads=CONCURRENT_MARKER_GENES_DOWNLOAD)

    ## Collate
    collated_rep_marker_seqs = output_all_rep_marker_genes("fa")
    collated_genes = os.path.basename(collated_rep_marker_seqs)
    collated_rep_marker_maps = output_all_rep_marker_genes("map")
    collated_maps = os.path.basename(collated_rep_marker_maps)

    # The subdirectory survives between runs in debug mode, and the batches
    # below are appended; clear earlier results so they are not duplicated.
    command(f"rm -f {collate_subdir}/{collated_genes} {collate_subdir}/{collated_maps}")

    for marker_fa_files in split(downloaded_marker_seqs, 20):  # keep "cat" commands short
        command("cat " + " ".join(marker_fa_files) + f" >> {collate_subdir}/{collated_genes}")
    for marker_map_files in split(downloaded_marker_maps, 20):
        command("cat " + " ".join(marker_map_files) + f" >> {collate_subdir}/{collated_maps}")

    ## Index
    cmd_index = f"cd {collate_subdir}; hs-blastn index {collated_genes} &>> {collate_log}"
    with open(f"{collate_subdir}/{collate_log}", "a") as slog:
        slog.write(cmd_index + "\n")
    command(cmd_index)

    index_suffix = ["fa", "map", "fa.bwt", "fa.header", "fa.sa", "fa.sequence"]
    output_files = [
        f"{collate_subdir}/{inputs.marker_set}.{isuffix}" for isuffix in index_suffix
    ]

    ## Upload
    upload_tasks = [(o, destpath(os.path.basename(o))) for o in output_files]
    multithreading_map(upload_star, upload_tasks)

    # Upload the log file last; its presence is what the early-exit check above looks for.
    upload(f"{collate_subdir}/{collate_log}", destpath(collate_log), check=False)

    ## Clean up
    if not args.debug:
        command(f"rm -rf {collate_subdir}", check=False)
def build_pangenome_slave(args):
    """
    Build the pangenome of one species; the per-species subcommand of build_pangenome.

    Input spec: https://github.com/czbiohub/iggtools/wiki#gene-annotations
    Output spec: https://github.com/czbiohub/iggtools/wiki#pan-genomes

    Must run with --zzz_slave_mode, an existing --zzz_slave_toc file, and a
    current directory named after the species. Cleans the genes of every
    genome of the species, clusters them with vsearch at each percent in
    CLUSTERING_PERCENTS, cross-references the clusters into gene_info.txt and
    uploads everything, gene_info.txt last.
    """
    violation = "Please do not call build_pangenome_slave directly. Violation"
    assert args.zzz_slave_mode, f"{violation}: Missing --zzz_slave_mode arg."
    assert os.path.isfile(args.zzz_slave_toc), f"{violation}: File does not exist: {args.zzz_slave_toc}"
    assert os.path.basename(os.getcwd()) == args.species, f"{violation}: {os.path.basename(os.getcwd())} != {args.species}"

    species = UHGG(args.zzz_slave_toc).species
    species_id = args.species

    assert species_id in species, f"{violation}: Species {species_id} is not in the database."

    species_genomes_ids = species[species_id].keys()

    def destpath(src):
        return pangenome_file(species_id, src + ".lz4")

    # Clear any previous build of this species before producing a fresh one.
    command(f"aws s3 rm --recursive {pangenome_file(species_id, '')}")

    cleaned = multiprocessing_map(
        clean_genes, ((species_id, genome_id) for genome_id in species_genomes_ids))

    command("rm -f genes.ffn genes.len")
    for batch in split(cleaned, 20):  # keep "cat" commands short
        ffn_files, len_files = transpose(batch)
        command("cat " + " ".join(ffn_files) + " >> genes.ffn")
        command("cat " + " ".join(len_files) + " >> genes.len")

    # The initial clustering to max_percent takes longest.
    max_percent = CLUSTERING_PERCENTS[0]
    lower_percents = CLUSTERING_PERCENTS[1:]
    cluster_files = {max_percent: vsearch(max_percent, "genes.ffn")}

    # Reclustering of the max_percent centroids is usually quick, and can proceed in parallel.
    def recluster(percent_id):
        return vsearch(percent_id, cluster_files[max_percent][0])

    cluster_files.update(multithreading_hashmap(recluster, lower_percents))

    xref(cluster_files, "gene_info.txt")

    # Create list of (source, dest) pairs for uploading.
    # Note that centroids.{max_percent}.ffn is uploaded to 2 different destinations.
    upload_tasks = [
        ("genes.ffn", destpath("genes.ffn")),
        ("genes.len", destpath("genes.len")),
        (f"centroids.{max_percent}.ffn", destpath("centroids.ffn")),  # no percent in dest, per spec
    ]
    upload_tasks.extend(
        (src, destpath("temp/" + src)) for src in flatten(cluster_files.values()))

    # Upload in parallel.
    multithreading_map(upload_star, upload_tasks)

    # Leave this upload for last, so the presence of this file in s3 would
    # indicate the entire species build has succeeded.
    upload("gene_info.txt", destpath("gene_info.txt"))