def run(self):
    """Align reads remotely, then call hits and generate taxon counts.

    Steps, in order:
      1. Run the configured aligner ("gsnap" or "rapsearch2") remotely,
         writing the raw m8 output.
      2. Fetch the lineage, accession-to-taxid and taxon blacklist references.
      3. Call hits from the m8 file (deduped m8 + hit summary outputs).
      4. Generate the taxon counts JSON, passing a deuterostome reference
         when one is configured in the additional files.
    """
    input_fas = self.get_input_fas()
    raw_m8, deduped_m8, hit_summary, counts_json = self.output_files_local()

    service = self.additional_attributes["service"]
    assert service in ("gsnap", "rapsearch2")

    # TODO: run the alignment remotely and make lazy_chunk=True, revisit this later
    self.run_remotely(input_fas, raw_m8, service)

    # Reference data needed for hit calling.
    lineage_db = fetch_from_s3(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    accession2taxid_db = fetch_from_s3(
        self.additional_files["accession2taxid_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    blacklist_s3_file = self.additional_attributes.get(
        'taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = fetch_from_s3(blacklist_s3_file, self.ref_dir_local)

    m8.call_hits_m8(raw_m8, lineage_db, accession2taxid_db, deduped_m8,
                    hit_summary, taxon_blacklist)

    # gsnap results are reported as NT, everything else as NR.
    if service == 'gsnap':
        db_type = 'NT'
    else:
        db_type = 'NR'
    # rapsearch2 e-values are treated as log10, everything else as raw.
    if service == 'rapsearch2':
        evalue_type = 'log10'
    else:
        evalue_type = 'raw'

    # The deuterostome reference is optional.
    deuterostome_db = None
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = fetch_from_s3(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=True)

    m8.generate_taxon_count_json_from_m8(
        deduped_m8, hit_summary, evalue_type, db_type,
        lineage_db, deuterostome_db, counts_json)
def run(self):
    """Produce the taxid -> wiki URL and taxid -> wiki content outputs.

    1. Fetch the taxid -> wikipedia link mapping (a previously generated
       mapping is reused when it is already present on S3).
    2. Fetch wikipedia content.
    3. Store everything.
    """
    taxid_list = self.input_files_local[0][0]
    taxid2wiki, taxid2desc = self.output_files_local()

    attrs = self.additional_attributes
    Entrez.email = attrs.get("entrez_email", "*****@*****.**")
    num_threads = attrs.get("threads", 16)
    batch_size = attrs.get("batch_size", 100)

    # Optional taxid -> name table, one "taxid,name" pair per line.
    id2namedict = {}
    namecsv = self.additional_files.get("taxon2name")
    if namecsv:
        # This is fetching a reference without fetch_reference; but ok
        # because does not run from the actual pipeline
        namecsvf = s3.fetch_from_s3(namecsv, "/mnt/idseq/ref")
        with open(namecsvf, 'r') as namef:
            for row in namef:
                columns = row.rstrip().split(",")
                id2namedict[columns[0]] = columns[1]

    taxid2wikidict = {}
    # This is fetching a reference without fetch_reference and doing a
    # presence check; but ok because does not run from the actual pipeline
    if s3.check_s3_presence(self.s3_path(taxid2wiki)):
        # The mapping was generated before: load it instead of querying NCBI.
        taxid2wiki = s3.fetch_from_s3(self.s3_path(taxid2wiki), taxid2wiki)
        with open(taxid2wiki, "r") as taf:
            for row in taf:
                taxid, url = row.rstrip("\n").split("\t")
                taxid2wikidict[taxid] = url
    else:
        self.fetch_ncbi_wiki_map(num_threads, batch_size, taxid_list,
                                 taxid2wikidict)

    # Fetch the wiki content and store it as JSON.
    taxid2wikicontent = {}
    self.fetch_wiki_content(num_threads * 4, taxid2wikidict,
                            taxid2wikicontent, id2namedict)
    with open(taxid2desc, 'w') as desc_outputf:
        json.dump(taxid2wikicontent, desc_outputf)

    # Store the taxid -> wiki URL table. Entries without a URL fall back to
    # a page-id based URL when the fetched content carries a page id.
    with open(taxid2wiki, 'w') as taxidoutf:
        for taxid, wikiurl in taxid2wikidict.items():
            if wikiurl == "":
                pageid = taxid2wikicontent.get(taxid, {}).get('pageid', None)
                if pageid:
                    wikiurl = f"http://en.wikipedia.org/wiki/index.html?curid={pageid}"
            taxidoutf.write(f"{taxid}\t{wikiurl}\n")
def run(self):
    """Align reads with GSNAP and keep only the reads that did not map.

    Takes the first one or two local input files (single or paired reads),
    writes the SAM output under the output directory (registered as an
    additional file to upload), and extracts the unmapped reads into this
    step's output files.
    """
    reads = self.input_files_local[0][0:2]
    unmapped_outputs = self.output_files_local()

    sam_path = os.path.join(self.output_dir_local,
                            self.additional_attributes["output_sam_file"])
    self.additional_files_to_upload.append(sam_path)

    genome_dir = fetch_from_s3(self.additional_files["gsnap_genome"],
                               self.ref_dir_local,
                               allow_s3mi=True,
                               auto_untar=True)
    # GSNAP takes the index as a (parent directory, index name) pair.
    index_parent, index_name = os.path.split(genome_dir)

    # Run Gsnap
    gsnap_options = [
        'gsnapl',
        '-A sam',
        '--batch=0',
        '--use-shared-memory=0',
        '--gmap-mode=all',
        '--npaths=1',
        '--ordered',
        '-t 32',
        '--max-mismatches=40',
        '-D', index_parent,
        '-d', index_name,
        '-o', sam_path,
    ]
    command.execute(" ".join(gsnap_options + reads))
    log.write("Finished GSNAP alignment.")

    # Extract out unmapped files from sam
    paired = len(reads) == 2
    if paired:
        convert.generate_unmapped_pairs_from_sam(sam_path, unmapped_outputs)
    else:
        convert.generate_unmapped_singles_from_sam(sam_path,
                                                   unmapped_outputs[0])
def get_genbank_genomes(self, reference_taxids, destination_dir, superkingdom_name, n=10):
    '''
    Retrieve up to n GenBank reference genomes under the reference_taxids.

    Assumes reference_taxids are species-level or below.
    Also assumes they are all in the same superkingdom, which is the only
    thing we need in our application.
    Saves the references under file names compatible with MakeKSNP3infile.

    The GenBank categories for the superkingdom are searched in order, and
    the genomes of the first category with at least one match are
    downloaded and returned.

    Args:
        reference_taxids: taxids to look up in the assembly summaries.
        destination_dir: directory receiving the assembly summary and the
            downloaded fasta files.
        superkingdom_name: "Viruses", "Bacteria", "Eukaryota" or None
            (None searches bacteria, viral, fungi and protozoa).
        n: maximum number of genomes to retrieve.

    Returns:
        dict mapping assembly accession -> local fasta path; empty when n is
        0, there are no reference taxids, or nothing matched.

    Raises:
        KeyError: if superkingdom_name is not one of the supported values.

    TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).
    '''
    if n == 0 or not reference_taxids:
        return {}
    n_per_taxid = max(n // len(reference_taxids), 1)
    genbank_categories_by_superkingdom = {
        "Viruses": ["viral"],
        "Bacteria": ["bacteria"],
        "Eukaryota": ["fungi", "protozoa"],
        None: ["bacteria", "viral", "fungi", "protozoa"]
    }
    # additional options in genbank that we probably don't need right now:
    # ["archaea", "plant",
    # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
    # "other", "metagenomes"]
    categories = genbank_categories_by_superkingdom[superkingdom_name]
    for cat in categories:
        genome_list_path_s3 = f"s3://idseq-public-references/genbank/{cat}/assembly_summary.txt"
        # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
        genome_list_local = s3.fetch_from_s3(genome_list_path_s3, destination_dir)
        genomes = []
        for taxid in reference_taxids:
            taxid_genomes = PipelineStepGeneratePhyloTree.get_taxid_genomes(
                genome_list_local, taxid, n_per_taxid)
            genomes += [
                entry for entry in taxid_genomes if entry not in genomes
            ]
        genomes = genomes[:n]
        command.remove_file(genome_list_local)
        if not genomes:
            # Nothing in this category; try the next one.
            continue

        genbank_fastas = {}
        for line in genomes:
            # Only the accession and the FTP path are needed here.
            assembly_accession, _taxid, _species_taxid, _organism_name, ftp_path = line.split("\t")
            ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
            tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
            local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
            if os.path.isfile(local_fasta):
                # Avoid clobbering an existing file. Strip only the trailing
                # ".fasta": splitting the whole path on the first "." would
                # break for any destination_dir that contains a dot.
                local_fasta = f"{os.path.splitext(local_fasta)[0]}__I.fasta"
            command.execute(
                command_patterns.SingleCommand(
                    cmd='wget',
                    args=["-O", f"{local_fasta}.gz", ftp_fasta_gz]))
            command.execute(
                command_patterns.SingleCommand(
                    cmd='gunzip',
                    args=[f"{local_fasta}.gz"]))
            genbank_fastas[assembly_accession] = local_fasta
        return genbank_fastas
    return {}
def run(self):
    """Rewrite the annotated fasta with taxid-lineage-prefixed read names.

    For every (name line, sequence line) pair of the input fasta, the read
    name is rewritten as
    ">family_nr:..:family_nt:..:genus_nr:..:genus_nt:..:species_nr:..:species_nt:..:<original name>"
    using the NT/NR hit summaries and the lineage db. Sequence lines are
    copied through unchanged. Reading stops at the first missing name or
    sequence line.
    """
    # Setup
    if len(self.input_files_local) > 1:
        input_fa_name = self.input_files_local[0][0]
        hit_summary_files = {
            'NT': self.input_files_local[1][2],
            'NR': self.input_files_local[2][2]
        }
    else:
        # TODO(yf): Old implementation. TO BE DEPRECATED once 3.1 is fully deployed
        input_files = self.input_files_local[0]
        input_fa_name = input_files[0]
        hit_summary_files = {'NT': input_files[1], 'NR': input_files[2]}

    # Open lineage db
    lineage_db = s3.fetch_from_s3(self.additional_files["lineage_db"],
                                  self.ref_dir_local,
                                  allow_s3mi=True)

    # Context managers guarantee the shelf and both files are closed even if
    # parsing or lineage lookup raises part-way through.
    with shelve.open(lineage_db.replace(".db", "")) as lineage_map:
        # Get primary hit mappings
        valid_hits = PipelineStepGenerateTaxidFasta.parse_hits(
            hit_summary_files)

        with open(input_fa_name, 'rb') as input_fa, \
                open(self.output_files_local()[0], 'wb') as output_fa:
            while True:
                seq_name = input_fa.readline()
                seq_data = input_fa.readline()
                if not seq_name or not seq_data:
                    break

                # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
                # :12720:8743/2"
                # Translate the read information into our custom format with fake
                # taxids at non-specific hit levels.
                annotated_read_id = seq_name.decode("utf-8").rstrip().lstrip('>')
                read_id = annotated_read_id.split(":", 4)[-1]

                nr_taxid_species, nr_taxid_genus, nr_taxid_family = \
                    PipelineStepGenerateTaxidFasta.get_valid_lineage(
                        valid_hits, lineage_map, read_id, 'NR')
                nt_taxid_species, nt_taxid_genus, nt_taxid_family = \
                    PipelineStepGenerateTaxidFasta.get_valid_lineage(
                        valid_hits, lineage_map, read_id, 'NT')

                fields = [
                    "family_nr", nr_taxid_family,
                    "family_nt", nt_taxid_family,
                    "genus_nr", nr_taxid_genus,
                    "genus_nt", nt_taxid_genus,
                    "species_nr", nr_taxid_species,
                    "species_nt", nt_taxid_species,
                    annotated_read_id,
                ]
                new_read_name = ('>' + ':'.join(fields) + '\n').encode()

                output_fa.write(new_read_name)
                output_fa.write(seq_data)
def download_to_compare(to_compare):
    """Replace every S3 path in to_compare with a freshly fetched local copy.

    The list is modified in place. Entries that do not start with "s3://"
    are left untouched. Each S3 entry at index i is fetched to
    "tmp-<i>-<basename>" in the current directory, after removing any
    existing file of that name.

    Args:
        to_compare: mutable list of local paths and/or S3 URIs.

    Raises:
        RuntimeError: if a fetch returns None; the message names the S3
            path that failed.
    """
    for i, path in enumerate(to_compare):
        if not path.startswith("s3://"):
            continue
        local_name = f"tmp-{i}-" + os.path.basename(path)
        if os.path.isfile(local_name):
            os.remove(local_name)
        # Keep the S3 path in `path` so the error message can report it;
        # reusing the variable for the result would always print "None".
        local_path = s3.fetch_from_s3(path, local_name, allow_s3mi=True)
        if local_path is None:
            raise RuntimeError(f"Fetch from S3 failed for {path}")
        to_compare[i] = local_path
def get_common_params(self):
    """Build the srst2 arguments shared by paired and single read runs.

    Fetches the resistance gene db into the output directory and returns
    the argument list: coverage, threads, output prefix, logging and gene db.
    """
    gene_db_path = fetch_from_s3(self.additional_files["resist_gene_db"],
                                 self.output_dir_local)
    attrs = self.additional_attributes
    # srst2 expects these as strings; in the dag they may be given as numbers.
    min_coverage = str(attrs['min_cov'])
    thread_count = str(attrs['n_threads'])
    output_prefix = os.path.join(self.output_dir_local, 'output')

    params = ['--min_coverage', min_coverage]
    params += ['--threads', thread_count]
    params += ['--output', output_prefix]
    params += ['--log']
    params += ['--gene_db', gene_db_path]
    return params
def run(self):
    """Write the reference sequences of all hit accessions to one fasta.

    When the hit summary references fewer than
    MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD accessions, the sequences are pulled
    individually from S3; otherwise the whole alignment db is downloaded
    and the sequences are read from the local copy.
    """
    _m8, _deduped, hit_summary, _counts = self.input_files_local[0]
    output_reference_fasta = self.output_files_local()[0]

    ref_dir = self.ref_dir_local
    loc_db = s3.fetch_from_s3(self.additional_files["loc_db"],
                              ref_dir,
                              allow_s3mi=True)
    db_s3_path = self.additional_attributes["db"]
    db_type = self.additional_attributes["db_type"]
    # Fetched like in the original flow even though it is not read below.
    lineage_db = s3.fetch_from_s3(self.additional_files["lineage_db"],
                                  ref_dir,
                                  allow_s3mi=True)

    summary = m8.summarize_hits(hit_summary)
    (read_dict, accession_dict, _selected_genera) = summary

    few_accessions = len(accession_dict) < MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD
    if few_accessions:
        self.download_ref_sequences_from_s3(accession_dict,
                                            output_reference_fasta,
                                            db_type, loc_db, db_s3_path)
        return

    # download the whole alignment db
    db_path = s3.fetch_from_s3(db_s3_path, ref_dir, allow_s3mi=True)
    self.download_ref_sequences_from_file(accession_dict, loc_db, db_path,
                                          output_reference_fasta)
def run(self):
    """Run STAR to filter out host reads.

    The STAR genome is fetched (and untarred) from S3; its "parts.txt"
    file gives the number of partitioned indexes. STAR is run once per
    partition, each run consuming the unmapped reads of the previous one.
    The final unmapped files are moved to this step's outputs and the
    scratch directory is emptied.

    If "output_gene_file" is configured and partition 0 produced
    "ReadsPerGene.out.tab", that file is moved into the output directory
    and registered as an additional file to upload.
    """
    # Setup
    input_files = self.input_files_local[0][0:2]
    num_inputs = len(input_files)
    scratch_dir = os.path.join(self.output_dir_local, "scratch_star")
    output_files_local = self.output_files_local()
    output_gene_file = self.additional_attributes.get("output_gene_file")

    genome_dir = s3.fetch_from_s3(self.additional_files["star_genome"],
                                  self.ref_dir_local,
                                  allow_s3mi=True,
                                  auto_untar=True)

    # Check parts file for the number of partitioned indexes
    parts_file = os.path.join(genome_dir, "parts.txt")
    assert os.path.isfile(parts_file)
    with open(parts_file, 'rb') as parts_f:
        num_parts = int(parts_f.read())

    # Run STAR on each partition and save the unmapped read info
    unmapped = input_files
    for part_idx in range(num_parts):
        tmp = f"{scratch_dir}/star-part-{part_idx}"
        genome_part = f"{genome_dir}/part-{part_idx}"
        count_genes = part_idx == 0
        self.run_star_part(tmp, genome_part, unmapped, count_genes)

        unmapped = PipelineStepRunStar.sync_pairs(
            PipelineStepRunStar.unmapped_files_in(tmp, num_inputs))

        # Run part 0 in gene-counting mode:
        # (a) ERCCs are doped into part 0 and we want their counts.
        # (b) If there is only 1 part (e.g. human), the host gene counts also
        # make sense.
        if part_idx == 0:
            gene_count_file = os.path.join(tmp, "ReadsPerGene.out.tab")
            if os.path.isfile(gene_count_file) and output_gene_file:
                moved = os.path.join(self.output_dir_local,
                                     output_gene_file)
                command.execute(f"mv {gene_count_file} {moved}")
                self.additional_files_to_upload.append(moved)

    # Cleanup
    for src, dst in zip(unmapped, output_files_local):
        command.execute(f"mv {src} {dst}")  # Move out of scratch dir

    # Only wipe the scratch directory if it exists, and chain with "&&" so
    # that "rm -rf *" can never run in the current working directory when
    # the "cd" fails.
    if os.path.isdir(scratch_dir):
        command.execute("cd %s && rm -rf *" % scratch_dir)
def get_genbank_genomes(self, reference_taxids, destination_dir, superkingdom_name, n=10):
    '''
    Retrieve up to n GenBank reference genomes under the reference_taxids.

    Assumes reference_taxids are species-level or below.
    Also assumes they are all in the same superkingdom, which is the only
    thing we need in our application.
    Saves the references under file names compatible with MakeKSNP3infile.

    The GenBank categories for the superkingdom are searched in order, and
    the genomes of the first category with at least one match are
    downloaded and returned.

    Args:
        reference_taxids: taxids to look up in the assembly summaries.
        destination_dir: directory receiving the assembly summary and the
            downloaded fasta files.
        superkingdom_name: "Viruses", "Bacteria", "Eukaryota" or None
            (None searches bacteria, viral, fungi and protozoa).
        n: maximum number of genomes to retrieve.

    Returns:
        dict mapping assembly accession -> local fasta path; empty when n is
        0, there are no reference taxids, or nothing matched.

    Raises:
        KeyError: if superkingdom_name is not one of the supported values.

    TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).
    '''
    # Local import: quoting keeps paths with spaces or shell metacharacters
    # from being reinterpreted by the shell commands built below.
    import shlex

    if n == 0 or not reference_taxids:
        return {}
    n_per_taxid = max(n // len(reference_taxids), 1)
    genbank_categories_by_superkingdom = {
        "Viruses": ["viral"],
        "Bacteria": ["bacteria"],
        "Eukaryota": ["fungi", "protozoa"],
        None: ["bacteria", "viral", "fungi", "protozoa"]
    }
    # additional options in genbank that we probably don't need right now:
    # ["archaea", "plant",
    # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
    # "other", "metagenomes"]
    categories = genbank_categories_by_superkingdom[superkingdom_name]
    for cat in categories:
        genome_list_path_s3 = f"s3://idseq-database/genbank/{cat}/assembly_summary.txt"
        # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
        genome_list_local = s3.fetch_from_s3(genome_list_path_s3, destination_dir)
        genomes = []
        for taxid in reference_taxids:
            # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid,
            # 8 = organism_name, 20 = ftp_path
            cmd = f"cut -f1,6,7,8,20 {shlex.quote(genome_list_local)}"
            # try to find taxid in the taxid column (2nd column of the piped input)
            cmd += f" | awk -F '\t' '$2 == {taxid}'"
            # take only top n_per_taxid results
            cmd += f" | head -n {n_per_taxid}"
            taxid_genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
            genomes += [entry for entry in taxid_genomes if entry not in genomes]
        genomes = genomes[:n]
        command.execute_with_output(f"rm {shlex.quote(genome_list_local)}")
        if not genomes:
            # Nothing in this category; try the next one.
            continue

        genbank_fastas = {}
        for line in genomes:
            # Only the accession and the FTP path are needed here.
            assembly_accession, _taxid, _species_taxid, _organism_name, ftp_path = line.split("\t")
            ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
            tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
            local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
            if os.path.isfile(local_fasta):
                # Avoid clobbering an existing file. Strip only the trailing
                # ".fasta": splitting the whole path on the first "." would
                # break for any destination_dir that contains a dot.
                local_fasta = f"{os.path.splitext(local_fasta)[0]}__I.fasta"
            local_fasta_gz = f"{local_fasta}.gz"
            command.execute(
                f"wget -O {shlex.quote(local_fasta_gz)} {shlex.quote(ftp_fasta_gz)}")
            command.execute(f"gunzip {shlex.quote(local_fasta_gz)}")
            genbank_fastas[assembly_accession] = local_fasta
        return genbank_fastas
    return {}
def run(self):
    """
    Trim any residual Illumina adapters.
    Discard any reads that become too short.

    Inputs that are not fastq are copied to the outputs unchanged.

    See: http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf
    """
    reads = self.input_files_local[0][0:2]
    outputs = self.output_files_local()
    adapter_fasta = s3.fetch_from_s3(self.additional_files["adapter_fasta"],
                                     self.ref_dir_local)

    # Not fastq: nothing to trim, pass the files through.
    if fasta.input_file_type(reads[0]) != 'fastq':
        for in_file, out_file in zip(reads, outputs):
            command.execute(f"cp {in_file} {out_file}")
        return

    if len(reads) == 2:
        paired_arg = "PE"
        output_args = [
            outputs[0],                 # R1, paired, to be kept
            f"{outputs[0]}__unpaired",  # R1, no longer paired, to be discarded
            outputs[1],                 # R2, paired, to be kept
            f"{outputs[1]}__unpaired",  # R2, no longer paired, to be discarded
        ]
    else:
        paired_arg = "SE"
        output_args = outputs

    pieces = ["java -jar /usr/local/bin/trimmomatic-0.38.jar", paired_arg, "-phred33"]
    pieces.extend(reads)
    pieces.extend(output_args)
    # Remove Illumina adapters provided in the fasta file. Initially, look
    # for seed matches allowing maximally *2* mismatches. These seeds will be
    # extended and clipped if in the case of paired end reads a score of *30*
    # is reached, or in the case of single ended reads a score of *10*.
    pieces.append(f"ILLUMINACLIP:{adapter_fasta}:2:30:10")
    # Discard reads which are less than *75* bases long after these steps.
    pieces.append("MINLEN:75")

    command.execute(" ".join(pieces))
def run(self):
    """Align reads with bowtie2 and keep only the reads that did not map.

    Takes the first one or two local input files (single or paired reads),
    writes the SAM output under the output directory (registered as an
    additional file to upload), and extracts the unmapped reads into this
    step's output files.
    """
    reads = self.input_files_local[0][0:2]
    unmapped_outputs = self.output_files_local()
    genome_dir = fetch_from_s3(self.additional_files["bowtie2_genome"],
                               self.ref_dir_local,
                               allow_s3mi=True,
                               auto_untar=True)
    sam_path = os.path.join(self.output_dir_local,
                            self.additional_attributes["output_sam_file"])
    self.additional_files_to_upload.append(sam_path)

    # The file structure looks like
    # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
    # The code below will handle up to "bowtie2_genome/GRCh38.primary_assembly.
    # genome.99.bt2" but not 100.
    listing = command.execute_with_output(f"ls {genome_dir}/*.bt2*")
    first_index_file = listing.split("\n")[0]
    genome_basename = first_index_file[:-6]
    if genome_basename[-1] == '.':
        genome_basename = genome_basename[:-1]

    bowtie2_params = ['bowtie2', '-q', '-x', genome_basename, '-f']
    bowtie2_params += ['--very-sensitive-local', '-S', sam_path]

    seed = self.additional_attributes.get("random_seed")
    if seed:
        bowtie2_params += ['--seed', str(seed)]
    else:
        # Seed option won't work with -p threading option.
        bowtie2_params += ['-p', str(multiprocessing.cpu_count())]

    paired = len(reads) == 2
    if paired:
        bowtie2_params += ['-1', reads[0], '-2', reads[1]]
    else:
        bowtie2_params += ['-U', reads[0]]

    command.execute(" ".join(bowtie2_params))
    log.write("Finished Bowtie alignment.")

    if len(reads) == 2:
        convert.generate_unmapped_pairs_from_sam(sam_path, unmapped_outputs)
    else:
        convert.generate_unmapped_singles_from_sam(sam_path,
                                                   unmapped_outputs[0])
def run(self):
    """
    Build custom blast index from an S3 location or a url.

    The source named by the "data_source" attribute is fetched into the
    output directory (and registered as a hidden output), decompressed
    when it ends in ".bz2" or ".zip", indexed with makeblastdb using the
    "db_type" attribute, and the resulting index files are bundled into
    the step's output tar file.

    Raises:
        RuntimeError: if makeblastdb produced no index files to archive.
    """
    # Local import: only needed to collect the index files for tar.
    import glob

    _input_files = self.input_files_local[0]  # dummy in this case # noqa
    output_tar_file = self.output_files_local()[0]
    db_type = self.additional_attributes['db_type']
    file_source = self.additional_attributes['data_source']
    output_db_name = output_tar_file.replace(".tar", "")

    if file_source.startswith("s3://"):
        db_file = s3.fetch_from_s3(file_source, self.output_dir_local)
    else:
        # Plain URL: download it into the output directory with urllib.
        db_file = os.path.join(self.output_dir_local,
                               os.path.basename(file_source))
        urllib.request.urlretrieve(file_source, db_file)
    self.additional_output_files_hidden.append(db_file)

    # Decompress if needed; the decompressed name is the source name
    # without its 4-character extension.
    if db_file.endswith(".bz2"):
        command.execute(
            command_patterns.SingleCommand(cmd='bzip2',
                                           args=["-dk", db_file]))
        db_file = db_file[:-4]
    elif db_file.endswith(".zip"):
        command.execute(
            command_patterns.SingleCommand(cmd='unzip', args=[db_file]))
        db_file = db_file[:-4]

    # Build blast index
    command.execute(
        command_patterns.SingleCommand(cmd='makeblastdb',
                                       args=[
                                           "-in", db_file, "-dbtype",
                                           db_type, "-out", output_db_name
                                       ]))

    # Expand "<output_db_name>.*" here instead of handing the raw pattern to
    # tar: this does not depend on the command runner performing shell glob
    # expansion on its arguments. A stale tar file from an earlier run is
    # excluded so the archive never tries to contain itself.
    index_files = sorted(
        path for path in glob.glob(glob.escape(output_db_name) + ".*")
        if path != output_tar_file)
    if not index_files:
        raise RuntimeError(
            f"makeblastdb produced no index files matching {output_db_name}.*")
    command.execute(
        command_patterns.SingleCommand(
            cmd='tar', args=["cvf", output_tar_file, *index_files]))
def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
              remote_work_dir, remote_username, input_files, key_path,
              service, lazy_run):
    """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
    group machines and handle their execution.

    Args:
        part_suffix: separator in the input file names; the chunk id is the
            integer that follows it in input_files[0].
        remote_home_dir: home directory on the worker; gsnapl is run from
            its "bin" subdirectory.
        remote_index_dir: directory on the worker holding the alignment index.
        remote_work_dir: working directory on the worker; it is removed and
            recreated before each run, and deleted afterwards.
        remote_username: user for the remote command execution.
        input_files: chunk input file names, relative to the S3 chunk
            result directory.
        key_path: path of the key used to reach the worker.
        service: "gsnap" or "rapsearch2".
        lazy_run: when truthy, a result already present on S3 is fetched and
            reused instead of running the alignment.

    Returns:
        Local path of the chunk's multihit m8 result.

    Raises:
        Re-raises the last failure when the status is not retryable (only
        CRASH and CORRUPT_OUTPUT are retried) or CHUNK_MAX_TRIES is reached.
    """
    assert service in ("gsnap", "rapsearch2")

    chunk_id = int(input_files[0].split(part_suffix)[-1])
    multihit_basename = f"multihit-{service}-out{part_suffix}{chunk_id}.m8"
    multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                          multihit_basename)
    multihit_remote_outfile = os.path.join(remote_work_dir,
                                           multihit_basename)
    multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                       multihit_basename)

    # Builds the shell command that copies one chunk input from S3 into the
    # remote work directory.
    def aws_cp_operation(input_fa):
        return "aws s3 cp --only-show-errors {src} {dest}".format(
            src=shlex.quote(
                os.path.join(self.chunks_result_dir_s3, input_fa)),
            dest=shlex.quote(os.path.join(remote_work_dir, input_fa)))

    download_input_from_s3 = " ; ".join(map(aws_cp_operation, input_files))

    # Clean up remote work directory before running
    # This ensures that files from a failed previous run that may still be on the instance
    # are removed so they don't corrupt the current run
    base_str = "rm -rf {remote_work_dir} ; mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
    environment = self.additional_attributes["environment"]

    # See step class docstrings for more parameter details.
    if service == "gsnap":
        commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 48 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
    else:
        commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

    # Every interpolated path is shell-quoted; download_input_from_s3 is
    # already a sequence of quoted commands.
    commands = commands.format(
        remote_work_dir=shlex.quote(remote_work_dir),
        download_input_from_s3=download_input_from_s3,
        remote_home_dir=shlex.quote(remote_home_dir),
        remote_index_dir=shlex.quote(remote_index_dir),
        remote_input_files=" ".join(
            shlex.quote(remote_work_dir + "/" + input_fa)
            for input_fa in input_files),
        multihit_remote_outfile=shlex.quote(multihit_remote_outfile)
        if service == "gsnap" else shlex.quote(multihit_remote_outfile[:-3])
        # Strip the .m8 for RAPSearch as it adds that
    )

    if lazy_run and fetch_from_s3(multihit_s3_outfile,
                                  multihit_local_outfile,
                                  okay_if_missing=True,
                                  allow_s3mi=False):
        log.write(
            f"finished alignment for chunk {chunk_id} with {service} by lazily fetching last result"
        )
    else:
        chunk_timeout = int(
            self.additional_attributes.get(
                f"{service.lower()}_chunk_timeout", DEFAULT_CHUNK_TIMEOUT))
        for try_number in range(1, CHUNK_MAX_TRIES + 1):
            log.write(
                f"waiting for {service} server for chunk {chunk_id}. Try #{try_number}"
            )
            with ASGInstance(service, key_path, remote_username,
                             environment, chunk_id, try_number,
                             self.additional_attributes) as instance_ip:
                # Try/Except block needs to be inside the ASGInstance context.
                # A failure to acquire an ASGInstance is and should be unrecoverable.
                chunk_status = None
                elapsed = 0.0
                try:
                    t_start = time.time()
                    try:
                        command.execute(command.remote(
                            commands, key_path, remote_username,
                            instance_ip),
                            timeout=chunk_timeout)
                    except:
                        chunk_status = ChunkStatus.CRASH
                        raise
                    finally:
                        # A crash that used up the whole timeout budget is
                        # reclassified as a timeout.
                        elapsed = time.time() - t_start
                        if chunk_status == ChunkStatus.CRASH and elapsed >= chunk_timeout:
                            chunk_status = ChunkStatus.TIMEOUT

                    output_corrupt = self.__check_if_output_is_corrupt(
                        service, key_path, remote_username, instance_ip,
                        multihit_remote_outfile, chunk_id, try_number)

                    if output_corrupt:
                        # Record the status first; the assert then raises
                        # into the handler below.
                        chunk_status = ChunkStatus.CORRUPT_OUTPUT
                        assert not output_corrupt, output_corrupt

                    # Yay, chunk succeeded. Copy from server and break out of retry loop.
                    try:
                        self.__copy_multihit_remote_outfile(
                            key_path, remote_username, instance_ip,
                            multihit_remote_outfile, multihit_local_outfile)
                        chunk_status = ChunkStatus.SUCCESS
                        break
                    except:
                        # If we failed to copy from the server, it's as bad as a crash in alignment.
                        chunk_status = ChunkStatus.CRASH
                        raise

                except Exception as e:
                    # 1. No backoff needed here before retrying. We rate limit chunk dispatch (the ASGInstance
                    # acquisition above is blocking). ASGInstance acquisition also tries to ensure that every
                    # chunk flight gets its first try before any retry is dispatched.
                    # 2. If the reason we failed is timeout on the server, we don't retry. The operator must decide
                    # whether to QC the data more, or use smaller chunk size. In fact, we only retry for CRASH and
                    # CORRUPT_OUTPUT.
                    # 3. If this is the last attempt, we gotta re-raise the exception.
                    # 4. Elapsed time is only the time spent in alignment. It excludes the time spent waiting to
                    # acquire ASGinstance.
                    log.log_event('alignment_remote_error',
                                  values={
                                      "chunk": chunk_id,
                                      "try_number": try_number,
                                      "CHUNK_MAX_TRIES": CHUNK_MAX_TRIES,
                                      "chunk_status": chunk_status,
                                      "elapsed": elapsed,
                                      "chunk_timeout": chunk_timeout,
                                      "exception": log.parse_exception(e)
                                  })
                    retrying_might_help = chunk_status in (
                        ChunkStatus.CORRUPT_OUTPUT, ChunkStatus.CRASH)
                    if try_number < CHUNK_MAX_TRIES and retrying_might_help:
                        # Retry!
                        continue
                    else:
                        # End of the road.
                        raise
                finally:
                    # None chunk_status indicates code bug above. An exception has been raised already
                    # for it, and it says nothing about whether the alignment succeeded or not.
                    if chunk_status != None:
                        chunk_status_tracker(service).note_outcome(
                            instance_ip, chunk_id, elapsed, chunk_status,
                            try_number)
                    # Runs on every exit path (success, retry, failure).
                    self.__delete_remote_dir(remote_work_dir, key_path,
                                             remote_username, instance_ip)

        # Upload to s3
        with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
            command.execute(
                command_patterns.SingleCommand(
                    cmd="aws",
                    args=[
                        "s3", "cp", "--only-show-errors",
                        multihit_local_outfile,
                        os.path.join(self.chunks_result_dir_s3, "")
                    ]))
        log.write(
            f"finished alignment for chunk {chunk_id} on {service} server {instance_ip}"
        )

    # Whether lazy or not lazy, we've now got the chunk result locally here.
    return multihit_local_outfile
def fetch_key(self, key_path_s3):
    """Fetch the key file from S3 into the output directory.

    The local copy's mode is set to 0o400 through command.chmod.

    Returns:
        Local path of the fetched key.
    """
    local_key = fetch_from_s3(key_path_s3, self.output_dir_local)
    command.chmod(local_key, 0o400)
    return local_key
def run(self):
    """Generate the alignment visualization JSON files and a summary file.

    Reads the annotated m8 and annotated fasta inputs, groups reads by
    accession, fetches the reference sequences (from S3 when "nt_db" is an
    S3 URI, otherwise from a local file), computes per-accession coverage
    summaries, and dumps the result under "<output dir>/align_viz". A
    one-line summary is written to "<output dir>/align_viz.summary".

    Raises:
        RuntimeError: if "nt_db" is an S3 URI that is not present.
    """
    # Setup
    nt_db = self.additional_attributes["nt_db"]
    if nt_db.startswith("s3://") and not s3.check_s3_presence(nt_db):
        raise RuntimeError(f"nt_db at {nt_db} not found.")
    nt_loc_db = s3.fetch_from_s3(self.additional_files["nt_loc_db"],
                                 self.ref_dir_local,
                                 allow_s3mi=True)
    db_type = "nt"  # Only NT supported for now
    # TODO: Design a way to map in/out files more robustly, e.g. by name/type
    annotated_m8 = self.input_files_local[0][0]
    annotated_fasta = self.input_files_local[1][0]
    output_json_dir = os.path.join(self.output_dir_local, "align_viz")

    # Go through annotated_fasta with a db_type (NT/NR match). Infer the
    # family/genus/species info
    read2seq = PipelineStepGenerateAlignmentViz.parse_reads(
        annotated_fasta, db_type)
    log.write(f"Read to Seq dictionary size: {len(read2seq)}")

    db_path = nt_loc_db.replace(".db", "")
    # The location shelf is only passed to the sequence-fetching call, so it
    # is closed as soon as that call returns (or raises).
    with shelve.open(db_path) as nt_loc_dict:
        groups, line_count = self.process_reads_from_m8_file(
            annotated_m8, read2seq)

        if nt_db.startswith("s3://"):
            log.write("Getting sequences by accession list from S3...")
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                groups, nt_loc_dict, nt_db)
        else:
            log.write("Getting sequences by accession list from file...")
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_file(
                groups, nt_loc_dict, nt_db)

    for ad in groups.values():
        ad['coverage_summary'] = PipelineStepGenerateAlignmentViz.calculate_alignment_coverage(
            ad)

    result_dict, to_be_deleted = self.populate_reference_sequences(groups)

    # Delete temp files in the background while the JSON is being dumped.
    def safe_multi_delete(files):
        for f in files:
            try:
                os.remove(f)
            except OSError:
                # Best effort: a temp file that cannot be removed is ignored.
                pass

    deleter_thread = threading.Thread(target=safe_multi_delete,
                                      args=[to_be_deleted])
    deleter_thread.start()

    self.dump_align_viz_json(output_json_dir, db_type, result_dict)

    deleter_thread.join()

    # Write summary file
    summary_msg = f"Read2Seq Size: {len(read2seq)}, M8 lines {line_count}, " \
        f"{len(groups)} unique accession ids "
    summary_file_name = f"{output_json_dir}.summary"
    with open(summary_file_name, 'w') as summary_f:
        summary_f.write(summary_msg)
def run(self):
    """
    Generate host genome indexes for STAR and bowtie2.

    The ERCC fasta is prepended to the (decompressed) host fasta, and the
    ERCC gtf is prepended to the host gtf when one is configured (otherwise
    the ERCC gtf is used alone). The combined fasta and gtf are copied to
    the first two outputs and then used to build the STAR and bowtie2
    indexes.
    """
    # Set up
    host_fasta = self.input_files_local[0][0]
    ercc_fasta_path = s3.fetch_from_s3(self.additional_files["ercc_fasta"],
                                       self.output_dir_local,
                                       allow_s3mi=True,
                                       auto_unzip=True)

    if host_fasta.endswith('.gz'):
        # unzip the file next to the original, dropping the ".gz" suffix
        unzipped_fasta = host_fasta[:-3]
        gunzip_cmd = command_patterns.ShellScriptCommand(
            script=r'''gzip -dc "${input_fasta_path}" > "${dest_path}";''',
            named_args={
                'input_fasta_path': host_fasta,
                'dest_path': unzipped_fasta
            })
        command.execute(gunzip_cmd)
        host_fasta = unzipped_fasta

    # The host gtf is optional.
    host_gtf = None
    if self.additional_files.get("input_gtf"):
        host_gtf = s3.fetch_from_s3(self.additional_files["input_gtf"],
                                    self.output_dir_local,
                                    allow_s3mi=True)

    ercc_gtf_path = s3.fetch_from_s3(self.additional_files["ercc_gtf"],
                                     self.output_dir_local,
                                     allow_s3mi=True,
                                     auto_unzip=True)

    host_name = self.additional_attributes["host_name"]
    max_star_part_size = self.additional_attributes.get("max_star_part_size")

    # ERCC sequences first, then the host genome.
    fasta_with_ercc = f"{host_fasta}.with_ercc"
    cat_fasta_cmd = command_patterns.ShellScriptCommand(
        script=r'''cat "${ercc_fasta_path}" "${input_fasta_path}" > "${input_fasta_with_ercc}";''',
        named_args={
            'ercc_fasta_path': ercc_fasta_path,
            'input_fasta_path': host_fasta,
            'input_fasta_with_ercc': fasta_with_ercc
        })
    command.execute(cat_fasta_cmd)

    if host_gtf:
        gtf_with_ercc = f"{host_gtf}.with_ercc"
        cat_gtf_cmd = command_patterns.ShellScriptCommand(
            script=r'''cat "${ercc_gtf_path}" "${input_gtf_path}" > "${input_gtf_with_ercc}";''',
            named_args={
                'ercc_gtf_path': ercc_gtf_path,
                'input_gtf_path': host_gtf,
                'input_gtf_with_ercc': gtf_with_ercc
            })
        command.execute(cat_gtf_cmd)
    else:
        # No host gtf: the ERCC gtf is used on its own.
        gtf_with_ercc = ercc_gtf_path

    (output_fasta_file, output_gtf_file,
     output_star_index, output_bowtie2_index) = self.output_files_local()

    command.copy_file(fasta_with_ercc, output_fasta_file)
    command.copy_file(gtf_with_ercc, output_gtf_file)

    # make STAR index
    self.make_star_index(fasta_with_ercc, gtf_with_ercc,
                         output_star_index, max_star_part_size)

    # make bowtie2 index
    self.make_bowtie2_index(host_name, fasta_with_ercc,
                            output_bowtie2_index)
def get_accession_sequences(self, dest_dir, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each
    alignment viz file, up to a maximum of n references.
    Write each reference to a separate fasta file.

    Args:
        dest_dir: directory receiving one
            "NCBI_NT_accession_<clean accession>.fasta" file per accession.
        n: maximum number of accessions to keep.

    Returns:
        dict mapping accession -> path of its fasta file; empty when n is 0.
    '''
    # Local import: used to stream the sequence into the fasta file.
    import shutil

    if n == 0:
        return {}

    # Retrieve files
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_from_s3(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    s3_align_viz_files = self.additional_attributes["align_viz_files"].values()
    local_align_viz_files = []
    for s3_file in s3_align_viz_files:
        local_basename = s3_file.replace("/", "-").replace(":", "-")  # needs to be unique locally
        local_file = s3.fetch_from_s3(
            s3_file,
            os.path.join(self.ref_dir_local, local_basename))
        if local_file is not None:
            local_align_viz_files.append(local_file)

    # Choose accessions to process.
    # align_viz files are a bit brittle, so we just log exceptions rather than failing the job.
    accessions = set()
    for local_file in local_align_viz_files:
        try:
            with open(local_file, 'rb') as f:
                align_viz_dict = json.load(f)
            most_matched_accession = None
            max_num_reads = 0
            flat_align_viz_dict = {}
            self.parse_tree(align_viz_dict, flat_align_viz_dict)
            for acc, info in flat_align_viz_dict.items():
                num_reads = info["coverage_summary"]["num_reads"]
                if num_reads > max_num_reads:
                    max_num_reads = num_reads
                    most_matched_accession = acc
            # A file without any reads yields no accession; adding None
            # would later be treated as a real accession.
            if most_matched_accession is not None:
                accessions.add(most_matched_accession)
            if len(accessions) >= n:
                break
        except Exception:
            log.write(f"Warning: couldn't get accession from {local_file}!")
            traceback.print_exc()
    if len(accessions) > n:
        accessions = set(list(accessions)[0:n])

    # Make map of accession to sequence file
    accession2info = {acc: {} for acc in accessions}
    # The shelf is only needed for the lookup below; close it right after.
    with shelve.open(nt_loc_db.replace(".db", "")) as nt_loc_dict:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)

    # Put 1 fasta file per accession into the destination directory
    accession_fastas = {}
    for acc, info in accession2info.items():
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        # Write ">acc" followed by the sequence directly, instead of going
        # through a shell pipeline and a shared "temp_file" in the current
        # directory, which breaks on unusual paths and concurrent runs.
        with open(local_fasta, 'wb') as fasta_out, \
                open(info['seq_file'], 'rb') as seq_in:
            fasta_out.write(f">{acc}\n".encode())
            shutil.copyfileobj(seq_in, fasta_out)
        accession_fastas[acc] = local_fasta

    # Return kept accessions and paths of their fasta files
    return accession_fastas
def run(self):
    '''
    Refine the alignment results using assembled contigs:
    1. summarize the existing hits
    2. blast the assembled contigs against the reference fasta
    3. rewrite the m8 / hit summary with the contig-based assignments
    4. regenerate taxon counts and write the contig summaries
    '''
    _align_m8, deduped_m8, hit_summary, orig_counts = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = \
        self.input_files_local[1]
    reference_fasta = self.input_files_local[2][0]
    (blast_m8, refined_m8, refined_hit_summary,
     refined_counts, contig_summary_json) = self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    contigs_too_small = os.path.getsize(assembled_contig) < MIN_ASEEMBLED_CONTIG_SIZE
    if contigs_too_small or os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
        # No assembled results or refseq fasta available: pass the inputs
        # through unchanged and emit placeholder outputs.
        command.execute(f"echo ' ' > {blast_m8}")
        passthrough = (
            (deduped_m8, refined_m8),
            (hit_summary, refined_hit_summary),
            (orig_counts, refined_counts),
        )
        for src, dst in passthrough:
            command.execute(f"cp {src} {dst}")
        command.execute("echo '[]' > " + contig_summary_json)
        return

    read_dict, accession_dict, _selected_genera = m8.summarize_hits(hit_summary)

    top_entry_m8 = blast_m8.replace(".m8", ".top.m8")
    PipelineStepBlastContigs.run_blast(
        assembled_contig, reference_fasta, db_type, blast_m8, top_entry_m8)

    # Both containers are filled in place by generate_info_from_sam.
    read2contig = {}
    contig_stats = defaultdict(int)
    PipelineStepRunAssembly.generate_info_from_sam(
        bowtie_sam, read2contig, contig_stats)

    updated_read_dict, read2blastm8, contig2lineage, added_reads = \
        self.update_read_dict(read2contig, top_entry_m8, read_dict, accession_dict)
    self.generate_m8_and_hit_summary(
        updated_read_dict, added_reads, read2blastm8,
        hit_summary, deduped_m8,
        refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_from_s3(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    deuterostome_db = None
    deuterostome_s3_path = self.additional_files.get("deuterostome_db")
    if deuterostome_s3_path:
        deuterostome_db = s3.fetch_from_s3(
            deuterostome_s3_path,
            self.ref_dir_local,
            allow_s3mi=True)
    evalue_type = 'raw'
    m8.generate_taxon_count_json_from_m8(
        refined_m8, refined_hit_summary, evalue_type, db_type.upper(),
        lineage_db, deuterostome_db, refined_counts)

    # generate contig stats at genus/species level
    contig_taxon_summary = self.generate_taxon_summary(
        read2contig, contig2lineage, updated_read_dict, added_reads, db_type)
    with open(contig_summary_json, 'w') as summary_out:
        json.dump(contig_taxon_summary, summary_out)

    # Upload additional file
    output_dir = os.path.dirname(contig_summary_json)
    contig2lineage_json = os.path.join(output_dir, f"contig2lineage.{db_type}.json")
    with open(contig2lineage_json, 'w') as lineage_out:
        json.dump(contig2lineage, lineage_out)

    self.additional_files_to_upload.append(top_entry_m8)
    self.additional_files_to_upload.append(contig2lineage_json)
def fetch_key(self, key_path_s3):
    '''
    Fetch the key at key_path_s3 into the local output directory, restrict
    its permissions with chmod 400, and return the local path.
    '''
    local_key = fetch_from_s3(key_path_s3, self.output_dir_local)
    command.execute(f"chmod 400 {local_key}")
    return local_key
def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
              remote_work_dir, remote_username, input_files, key_path,
              service, lazy_run):
    """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
    group machines and handle their execution.

    The chunk id is whatever follows part_suffix in the first input file
    name. When lazy_run is set and the chunk's result can already be fetched
    from S3, no remote work is done. Otherwise the alignment command is run
    on a remote server (at most twice), the remote output is checked to have
    12 columns on every line, and the result is copied locally and to S3.

    Returns the local path of the multihit m8 output for this chunk.
    """
    assert service in ("gsnap", "rapsearch2")

    chunk_id = input_files[0].split(part_suffix)[-1]
    out_name = f"multihit-{service}-out{part_suffix}{chunk_id}.m8"
    multihit_local_outfile = os.path.join(self.chunks_result_dir_local, out_name)
    multihit_remote_outfile = os.path.join(remote_work_dir, out_name)
    multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3, out_name)

    # One "aws s3 cp" per input file, chained with ';' on the remote shell.
    download_input_from_s3 = " ; ".join(
        f"aws s3 cp --only-show-errors {self.chunks_result_dir_s3}/{input_fa} {remote_work_dir}/{input_fa} "
        for input_fa in input_files)

    setup_template = "mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
    if service == "gsnap":
        aligner_template = "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 36 --maxsearch=1000 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        aligner_outfile = multihit_remote_outfile
    else:
        aligner_template = "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"
        # Strip the .m8 for RAPSearch as it adds that
        aligner_outfile = multihit_remote_outfile[:-3]
    commands = (setup_template + aligner_template).format(
        remote_work_dir=remote_work_dir,
        download_input_from_s3=download_input_from_s3,
        remote_home_dir=remote_home_dir,
        remote_index_dir=remote_index_dir,
        remote_input_files=" ".join(
            f"{remote_work_dir}/{input_fa}" for input_fa in input_files),
        multihit_remote_outfile=aligner_outfile)

    # Lazy mode: reuse a result that can already be fetched from S3.
    if lazy_run and fetch_from_s3(multihit_s3_outfile, multihit_local_outfile):
        return multihit_local_outfile

    expected_columns = 12
    max_tries = 2
    min_columns = 0
    instance_ip = ""

    # Check if every row has correct number of columns (12) in the output
    # file on the remote machine; retry the whole chunk when it does not.
    for attempt in range(1, max_tries + 1):
        log.write(f"waiting for {service} server for chunk {chunk_id}")
        max_concurrent = self.additional_attributes["max_concurrent"]
        environment = self.additional_attributes["environment"]
        instance_ip = server.wait_for_server_ip(
            service, key_path, remote_username, environment,
            max_concurrent, chunk_id)
        log.write(f"starting alignment for chunk {chunk_id} on {service} server {instance_ip}")
        command.execute(
            command.remote(commands, key_path, remote_username, instance_ip))

        if service == "gsnap":
            reader = "cat"
        else:
            # For rapsearch, first remove header lines starting with '#'
            reader = "grep -v '^#'"
        verification_command = (
            reader + " " + multihit_remote_outfile
            + " | awk '{print NF}' | sort -nu | head -n 1")
        observed = command.execute_with_output(
            command.remote(verification_command, key_path,
                           remote_username, instance_ip))

        if observed:
            min_columns = float(observed)
            log.write(
                "Try no. %d: Smallest number of columns observed in any line was %d"
                % (attempt, min_columns))
        else:
            # Empty verification output is treated as a valid result with
            # no hits.
            log.write(f"Try no. {attempt}: No hits")
            min_columns = expected_columns

        if min_columns == expected_columns:
            break

    # Move output from remote machine to local machine
    corrupt_msg = (
        f"Chunk {chunk_id} output corrupt; not copying to S3. "
        "Re-start pipeline to try again.")
    assert min_columns == expected_columns, corrupt_msg

    with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
        command.execute(
            command.scp(key_path, remote_username, instance_ip,
                        multihit_remote_outfile, multihit_local_outfile))
        command.execute(
            f"aws s3 cp --only-show-errors {multihit_local_outfile} {self.chunks_result_dir_s3}/")
    log.write(f"finished alignment for chunk {chunk_id} on {service} server {instance_ip}")
    return multihit_local_outfile
def get_accession_sequences(self, dest_dir, taxid, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each
    hitsummary2 file, up to a maximum of n references. Write each reference
    to a separate fasta file.

    Only hits whose species, genus or family taxid equals taxid are counted.
    Returns a dict mapping each kept accession to the path of its fasta
    file; accessions with no retrieved sequence are logged and skipped.
    '''
    if n == 0:
        return {}

    # Retrieve files
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_reference(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    # Choose accessions to process.
    # TODO: Address issue where accessions in nr can be chosen in the following code.
    # These accessions will not be found in nt_loc and will be subsequently omitted.
    best_hits = defaultdict(int)
    for file_list in self.additional_attributes["hitsummary2_files"].values():
        tally = defaultdict(int)
        for s3_file in file_list:
            local_basename = s3_file.replace("/", "-").replace(":", "-")
            local_path = os.path.join(self.output_dir_local, local_basename)
            local_file = s3.fetch_from_s3(s3_file, local_path)
            if local_file is None:
                continue
            with open(local_file, 'r') as hits:
                for line in hits:
                    fields = line.rstrip().split("\t")
                    acc, species_taxid, genus_taxid, family_taxid = fields[3:7]
                    for hit_taxid in (species_taxid, genus_taxid, family_taxid):
                        if int(hit_taxid) == taxid:
                            tally[acc] += 1
                            break
        if tally:
            # Keep only the most frequently hit accession of this file group.
            best_acc = max(tally, key=tally.get)
            best_hits[best_acc] += tally[best_acc]

    if len(best_hits) > n:
        ranked = sorted(best_hits.items(), key=lambda item: item[1], reverse=True)
        best_hits = dict(ranked[:n])
    accessions = set(best_hits.keys())

    # Make map of accession to sequence file
    accession2info = {acc: {} for acc in accessions}
    with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)

    # Put 1 fasta file per accession into the destination directory
    accession_fastas = {}
    for acc, info in accession2info.items():
        if info.get('seq_file') is None:
            log.write(f"WARNING: No sequence retrieved for {acc}")
            continue
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        link_cmd = command_patterns.SingleCommand(
            cmd="ln",
            args=["-s", info['seq_file'], local_fasta])
        command.execute(link_cmd)
        # Write a '>accession' header followed by the sequence into
        # temp_file, then move it over the fasta path.
        header_cmd = command_patterns.ShellScriptCommand(
            script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
            named_args={'acc': acc, 'local_fasta': local_fasta})
        command.execute_with_output(header_cmd)
        command.move_file('temp_file', local_fasta)
        accession_fastas[acc] = local_fasta

    # Return kept accessions and paths of their fasta files
    return accession_fastas