import os
import subprocess
import time
import traceback

# The s3, command, log, and count helpers used below are idseq-dag utility modules,
# assumed to be imported elsewhere in the file (e.g. import idseq_dag.util.s3 as s3).
# MAX_ACCESSION_SEQUENCE_LEN is likewise assumed to be a module-level constant.


def fetch_sequence_for_thread(error_flags, accession, accession_out_file,
                              loc_dict, bucket, key, semaphore, mutex):
    '''Fetch the sequence for the given accession from S3.'''
    try:
        entry = loc_dict.get(accession)
        if entry:
            range_start, name_length, seq_len = entry
            range_end = range_start + name_length + seq_len - 1
            if seq_len <= MAX_ACCESSION_SEQUENCE_LEN:
                num_retries = 3
                for attempt in range(num_retries):
                    try:
                        s3.fetch_byterange(range_start, range_end, bucket, key,
                                           accession_out_file)
                        break
                    except Exception as e:
                        if attempt + 1 < num_retries:
                            # Exponential backoff
                            time.sleep(1.0 * (4**attempt))
                        else:
                            msg = f"All retries failed for getting sequence by accession ID {accession}: {e}"
                            raise RuntimeError(msg)
    except Exception:
        with mutex:
            # Print the traceback only for the first failure; later failures
            # just set the shared error flag.
            if not error_flags:
                traceback.print_exc()
            error_flags["error"] = 1
    finally:
        semaphore.release()
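# A minimal driver sketch (not from the original source) showing how
# fetch_sequence_for_thread is meant to be dispatched: a bounded semaphore caps
# the number of in-flight S3 fetches, and a shared mutex guards error_flags.
# The function name, the output paths, and the concurrency bound of 8 are all
# illustrative assumptions.
import threading


def fetch_sequences_concurrently(accessions, loc_dict, bucket, key, output_dir):
    semaphore = threading.Semaphore(8)  # assumed cap on concurrent fetches
    mutex = threading.RLock()
    error_flags = {}
    threads = []
    for accession in accessions:
        out_file = os.path.join(output_dir, f"accession-{accession}")
        semaphore.acquire()  # the worker releases this in its finally block
        t = threading.Thread(
            target=fetch_sequence_for_thread,
            args=(error_flags, accession, out_file, loc_dict, bucket, key,
                  semaphore, mutex))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    if error_flags:
        raise RuntimeError("One or more accession fetches from S3 failed")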
def run(self):
    output_files = self.output_files_local()
    taxid = self.additional_attributes["taxid"]

    # Retrieve IDseq taxon fasta files
    local_taxon_fasta_files = []
    for _pipeline_run_id, byterange in self.additional_attributes["taxon_byteranges"].items():
        first_byte = byterange[0]
        last_byte = byterange[1]
        s3_file = byterange[2]
        local_basename = byterange[3]
        bucket, key = s3.split_identifiers(s3_file)
        local_file = os.path.join(self.output_dir_local, local_basename)
        s3.fetch_byterange(first_byte, last_byte, bucket, key, local_file)
        local_taxon_fasta_files.append(local_file)

    # Trim Illumina adapters
    # TODO: consider moving this to the beginning of the main pipeline
    PipelineStepGeneratePhyloTree.trim_adapters_in_place(local_taxon_fasta_files)

    # kSNP3 has a command (MakeKSNP3infile) for making a ksnp3-compatible input file
    # from a directory of fasta files. Before we can use the command, we symlink all
    # fasta files into a dedicated directory. The command makes certain unreasonable
    # assumptions we'll need to enforce (see the sanitizer sketch after this function):
    # - the current directory is the parent directory of the fasta file directory
    # - file names do not have dots except before the extension (and no spaces)
    # - file names cannot be too long (for kSNP3 tree building).
    genome_name_map = PipelineStepGeneratePhyloTree.clean_filename_collection(local_taxon_fasta_files)
    input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
    command.execute(f"mkdir {input_dir_for_ksnp3}")
    for local_file, genome_name in genome_name_map.items():
        command.execute(f"ln -s {local_file} {input_dir_for_ksnp3}/{genome_name}")

    # Retrieve Genbank references (full assembled genomes).
    # For now, we skip this using the option n=0 because:
    # (a) sequences for the accession IDs actually matched by the sample are likely
    #     to be more relevant initially,
    # (b) the downloads are slow, and
    # (c) the function only supports species-level taxids. If the phylo_tree's taxid
    #     in idseq-web is genus-level or higher, then we will need to decide on a list
    #     of species/strains to be included in the tree and pass those to the function.
    self.get_genbank_genomes(taxid, input_dir_for_ksnp3, 0)

    # Retrieve NCBI NT references for the accessions in the alignment viz files.
    # These are the accessions (not necessarily full genomes) that were actually
    # matched by the sample's reads during GSNAP alignment.
    self.get_accession_sequences(input_dir_for_ksnp3, 10)

    # Run MakeKSNP3infile.
    command.execute(
        f"cd {input_dir_for_ksnp3}/..; "
        f"MakeKSNP3infile {os.path.basename(input_dir_for_ksnp3)} {self.output_dir_local}/inputs.txt A")

    # Now run kSNP3. We can choose among 4 different output files; see
    # http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0081760#s2:
    # (1) tree.parsimony.tre: basic, includes no node labels
    # (2) tree_AlleleCounts.parsimony.tre: labels the internal nodes with the number
    #     of SNPs that are shared exclusively by the descendants of that node
    # (3) tree_tipAlleleCounts.parsimony.tre: same as (2), but also labels the strain
    #     names at the tips with the number of SNPs that are exclusive to that strain
    # (4) tree_AlleleCounts.parsimony.NodeLabel.tre: labels the internal nodes with the
    #     node number, separated by an underscore from the number of SNPs that are
    #     shared exclusively by the descendants of that node.
    command.execute(
        f"cd {self.output_dir_local}; mkdir ksnp3_outputs; "
        f"kSNP3 -in inputs.txt -outdir ksnp3_outputs -k 13")
    command.execute(
        f"mv {self.output_dir_local}/ksnp3_outputs/tree_tipAlleleCounts.parsimony.tre {output_files[0]}")
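# Hypothetical illustration (not the original clean_filename_collection) of how the
# kSNP3 file-name constraints listed above could be enforced: replace interior dots
# and spaces, cap the name length, and disambiguate collisions. The 50-character cap
# is an assumed value, not taken from the source.
def sanitize_names_for_ksnp3(fasta_paths, max_len=50):
    name_map = {}
    seen = set()
    for path in fasta_paths:
        stem, ext = os.path.splitext(os.path.basename(path))
        clean = stem.replace(".", "_").replace(" ", "_")[:max_len]
        candidate = f"{clean}{ext}"
        suffix = 1
        while candidate in seen:  # keep symlink names unique within the directory
            candidate = f"{clean}_{suffix}{ext}"
            suffix += 1
        seen.add(candidate)
        name_map[path] = candidate
    return name_map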
def get_sequence_by_accession_id_s3(accession_id, entry, nt_bucket, nt_key):
    seq_len = 0
    seq_name = ''
    if not entry:
        return seq_len, seq_name, None

    range_start, name_length, seq_len = [int(e) for e in entry]
    accession_file = f'accession-{accession_id}'
    num_retries = 3
    for attempt in range(num_retries):
        try:
            range_file = f'range-{attempt}-accession-{accession_id}'
            range_end = range_start + name_length + seq_len - 1
            s3.fetch_byterange(range_start, range_end, nt_bucket, nt_key, range_file)

            # (1) Take everything after the first (header) line, remove the
            #     newline chars, and put the sequence into accession_file.
            # (2) Send the first line to stdout.
            cmd = """cat {range_file} |tail -n+2 |tr -d '\\n' > {accession_file}; cat {range_file} |head -1""".format(
                range_file=range_file,
                accession_file=accession_file)
            seq_name = subprocess.check_output(
                cmd, executable='/bin/bash',
                shell=True).decode("utf-8").split(" ", 1)[1]
            seq_name = seq_name.replace("\n", "")

            # Get the sequence length based on the file size
            seq_len = os.stat(accession_file).st_size
            break
        except IndexError as e:
            # This may occur if the byterange fetched above is empty or does not
            # properly align to an accession. This has occurred in the past when a
            # reference cache issue caused the nt_loc_db and nt indices to be out of
            # sync. Such issues should be investigated. However, the pipeline step
            # can still complete with the missing data.
            log.write(
                "ntDbIndexError: Failed to get nt sequence by accession ID "
                f"{accession_id} {range_start} {range_end} {nt_bucket} {nt_key}: {e}")
            raise
        except Exception:
            if attempt + 1 < num_retries:
                # Exponential backoff
                time.sleep(1.0 * (4**attempt))
            else:
                log.write(
                    f"All retries failed for getting sequence by accession ID {accession_id}.")
                raise
        finally:
            try:
                os.remove(range_file)
            except OSError:
                pass

    accession_file_full_path = f"{os.getcwd()}/{accession_file}"
    return seq_len, seq_name, accession_file_full_path
def get_sequence_by_accession_id_s3(accession_id, nt_loc_dict, nt_bucket, nt_key):
    seq_len = 0
    seq_name = ''
    entry = nt_loc_dict.get(accession_id)
    if not entry:
        # Return a 3-tuple here as well, so callers can always unpack
        # (seq_len, seq_name, path).
        return seq_len, seq_name, None

    range_start, name_length, seq_len = entry
    accession_file = f'accession-{accession_id}'
    num_retries = 3
    for attempt in range(num_retries):
        try:
            range_file = f'range-{attempt}-accession-{accession_id}'
            range_end = range_start + name_length + seq_len - 1
            s3.fetch_byterange(range_start, range_end, nt_bucket, nt_key, range_file)

            # (1) Take everything after the first (header) line, remove the
            #     newline chars, and put the sequence into accession_file.
            # (2) Send the first line to stdout.
            cmd = """cat {range_file} |tail -n+2 |tr -d '\\n' > {accession_file}; cat {range_file} |head -1""".format(
                range_file=range_file,
                accession_file=accession_file)
            seq_name = subprocess.check_output(
                cmd, executable='/bin/bash',
                shell=True).decode("utf-8").split(" ", 1)[1]
            seq_name = seq_name.replace("\n", "")

            # Get the sequence length based on the file size
            seq_len = os.stat(accession_file).st_size
            break
        except Exception as e:
            if attempt + 1 < num_retries:
                # Exponential backoff
                time.sleep(1.0 * (4**attempt))
            else:
                msg = f"All retries failed for getting sequence by accession ID {accession_id}: {e}"
                raise RuntimeError(msg)
        finally:
            try:
                os.remove(range_file)
            except OSError:
                pass

    accession_file_full_path = f"{os.getcwd()}/{accession_file}"
    return seq_len, seq_name, accession_file_full_path
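# Hypothetical usage sketch: the bucket, key, and nt_loc_dict entry below are
# illustrative placeholders, not values from the source. Each entry maps an
# accession ID to (byte offset of the accession in nt, header length in bytes,
# sequence length in bytes).
nt_loc_dict = {"NC_002549.1": (1048576, 80, 18959)}  # placeholder offsets
seq_len, seq_name, local_path = get_sequence_by_accession_id_s3(
    "NC_002549.1", nt_loc_dict, "example-reference-bucket", "nt")
if local_path:
    print(f"Wrote {seq_len} bases of '{seq_name}' to {local_path}")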
def run(self):
    output_file = self.output_files_local()[0]
    byterange_dict = self.additional_attributes["taxon_byteranges"]

    # Retrieve IDseq taxon fasta files
    partial_fasta_files = []
    for hit_type, byterange in byterange_dict.items():
        first_byte = byterange[0]
        last_byte = byterange[1]
        s3_file = byterange[2]
        local_basename = f"{hit_type}_{os.path.basename(output_file)}.fasta"
        bucket, key = s3.split_identifiers(s3_file)
        local_file = os.path.join(self.output_dir_local, local_basename)
        s3.fetch_byterange(first_byte, last_byte, bucket, key, local_file)
        partial_fasta_files.append(local_file)
    self.fasta_union(partial_fasta_files, output_file)
    for fasta in partial_fasta_files + [output_file]:
        print(f"{count.reads(fasta)} reads in {fasta}")

    # Trim Illumina adapters
    # TODO: consider moving this to the beginning of the main pipeline
    self.trim_adapters_in_place(output_file)