    def fetch_sequence_for_thread(error_flags, accession, accession_out_file,
                                  loc_dict, bucket, key,
                                  semaphore, mutex):
        '''Fetch the sequence from S3 for the given accession.'''
        try:
            entry = loc_dict.get(accession)
            if entry:
                range_start, name_length, seq_len = entry
                range_end = range_start + name_length + seq_len - 1
                if seq_len <= MAX_ACCESSION_SEQUENCE_LEN:
                    num_retries = 3
                    for attempt in range(num_retries):
                        try:
                            s3.fetch_byterange(range_start, range_end, bucket, key, accession_out_file)
                            break
                        except Exception as e:
                            if attempt + 1 < num_retries:  # Exponential backoff
                                time.sleep(1.0 * (4**attempt))
                            else:
                                msg = f"All retries failed for getting sequence by accession ID {accession}: {e}"
                                raise RuntimeError(msg)

        except Exception:
            with mutex:
                if not error_flags:
                    traceback.print_exc()
                error_flags["error"] = 1
        finally:
            semaphore.release()
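
The semaphore/mutex pair implies a driver that acquires the semaphore before spawning each worker and joins the threads afterwards; the release in the worker's finally block is what caps concurrency. A minimal sketch of such a driver follows, where MAX_CONCURRENT_FETCHES, accessions, loc_dict, bucket and key are illustrative names, not from the source:

import threading

# Hypothetical driver for fetch_sequence_for_thread (a sketch):
# MAX_CONCURRENT_FETCHES, accessions, loc_dict, bucket and key are assumed
# to be defined by the surrounding pipeline step.
semaphore = threading.Semaphore(MAX_CONCURRENT_FETCHES)  # caps in-flight fetches
mutex = threading.Lock()  # guards error_flags and the one-time traceback print
error_flags = {}
threads = []
for accession in accessions:
    semaphore.acquire()  # the worker releases it in its finally block
    thread = threading.Thread(
        target=fetch_sequence_for_thread,
        args=(error_flags, accession, f"accession-{accession}",
              loc_dict, bucket, key, semaphore, mutex))
    thread.start()
    threads.append(thread)
for thread in threads:
    thread.join()
if error_flags:
    raise RuntimeError("One or more accession fetches failed")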
Example #2
    def run(self):
        output_files = self.output_files_local()
        taxid = self.additional_attributes["taxid"]

        # Retrieve IDseq taxon fasta files
        local_taxon_fasta_files = []
        for _pipeline_run_id, byterange in self.additional_attributes["taxon_byteranges"].items():
            first_byte, last_byte, s3_file, local_basename = byterange
            bucket, key = s3.split_identifiers(s3_file)
            local_file = os.path.join(self.output_dir_local, local_basename)
            s3.fetch_byterange(first_byte, last_byte, bucket, key, local_file)
            local_taxon_fasta_files.append(local_file)

        # Trim Illumina adapters
        # TODO: consider moving this to the beginning of the main pipeline
        PipelineStepGeneratePhyloTree.trim_adapters_in_place(local_taxon_fasta_files)

        # kSNP3 has a command (MakeKSNP3infile) for making a ksnp3-compatible input file from a directory of fasta files.
        # Before we can use the command, we symlink all fasta files to a dedicated directory.
        # The command makes certain unreasonable assumptions we'll need to enforce:
        # - current directory is parent directory of the fasta file directory
        # - file names do not have dots except before extension (also no spaces)
        # - file names cannot be too long (for kSNP3 tree building).
        # A sketch of a name-cleaning helper along these lines follows this example.
        genome_name_map = PipelineStepGeneratePhyloTree.clean_filename_collection(local_taxon_fasta_files)
        input_dir_for_ksnp3 = f"{self.output_dir_local}/inputs_for_ksnp3"
        command.execute(f"mkdir {input_dir_for_ksnp3}")
        for local_file, genome_name in genome_name_map.items():
            command.execute(f"ln -s {local_file} {input_dir_for_ksnp3}/{genome_name}")

        # Retrieve Genbank references (full assembled genomes).
        # For now, we skip this using the option n=0 because
        # (a) sequences for the accession IDs actually matched by the sample are likely to be more relevant initially
        # (b) the downloads are slow
        # (c) the function only supports species-level taxids. If the phylo_tree's taxid in idseq-web is genus-level or higher,
        #     then we will need to decide on a list of species/strains to be included in the tree and pass those to the function.
        self.get_genbank_genomes(taxid, input_dir_for_ksnp3, 0)

        # Retrieve NCBI NT references for the accessions in the alignment viz files.
        # These are the accessions (not necessarily full genomes) that were actually matched
        # by the sample's reads during GSNAP alignment.
        self.get_accession_sequences(input_dir_for_ksnp3, 10)

        # Run MakeKSNP3infile.
        command.execute(f"cd {input_dir_for_ksnp3}/..; MakeKSNP3infile {os.path.basename(input_dir_for_ksnp3)} {self.output_dir_local}/inputs.txt A")

        # Now run ksnp3.
        # We can choose among 4 different output files, see http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0081760#s2:
        # (1) tree.parsimony.tre: basic, includes no node labels
        # (2) tree_AlleleCounts.parsimony.tre: labels the internal nodes with the number of SNPs that are shared exclusively by the descendants of that node
        # (3) tree_tipAlleleCounts.parsimony.tre: same as (2), but also labels the strain names at the tips with the number of SNPs that are exclusive to that strain.
        # (4) tree_AlleleCounts.parsimony.NodeLabel.tre: labels the internal nodes with the node number separated by an underscore from the number of SNPs that are
        #     shared exclusively by the descendants of that node.
        command.execute(f"cd {self.output_dir_local}; mkdir ksnp3_outputs; kSNP3 -in inputs.txt -outdir ksnp3_outputs -k 13")
        command.execute(f"mv {self.output_dir_local}/ksnp3_outputs/tree_tipAlleleCounts.parsimony.tre {output_files[0]}")
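
clean_filename_collection is not shown in these examples. Given the constraints listed in the comments above, a plausible sketch of such a helper is below; the 50-character cap and the collision-numbering scheme are assumptions, not taken from the source.

import os
import re

def clean_filename_collection(local_files, max_len=50):
    """Map each local path to a kSNP3-safe basename (assumed behavior)."""
    genome_name_map = {}
    used = set()
    for path in local_files:
        base, ext = os.path.splitext(os.path.basename(path))
        base = re.sub(r"[.\s]+", "_", base)[:max_len]  # no dots/spaces; keep it short
        name, suffix = base + ext, 1
        while name in used:  # disambiguate names that collide after truncation
            name = f"{base}_{suffix}{ext}"
            suffix += 1
        used.add(name)
        genome_name_map[path] = name
    return genome_name_map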
Example #3
    def get_sequence_by_accession_id_s3(accession_id, entry, nt_bucket,
                                        nt_key):
        seq_len = 0
        seq_name = ''
        if not entry:
            return seq_len, seq_name, None

        range_start, name_length, seq_len = [int(e) for e in entry]

        accession_file = f'accession-{accession_id}'
        num_retries = 3
        for attempt in range(num_retries):
            try:
                range_file = f'range-{attempt}-accession-{accession_id}'
                range_end = range_start + name_length + seq_len - 1
                s3.fetch_byterange(range_start, range_end, nt_bucket, nt_key,
                                   range_file)

                # (1) Take everything after the first (header) line, remove the
                # newline chars, and write the sequence into accession_file
                # (2) Send the first (header) line to stdout
                cmd = """cat {range_file} |tail -n+2 |tr -d '\\n' > {accession_file}; cat {range_file} |head -1""".format(
                    range_file=range_file, accession_file=accession_file)
                seq_name = subprocess.check_output(
                    cmd, executable='/bin/bash',
                    shell=True).decode("utf-8").split(" ", 1)[1]
                seq_name = seq_name.replace("\n", "")

                # Get the sequence length based on the file size
                seq_len = os.stat(accession_file).st_size
                break
            except IndexError as e:
                # This may occur if the byterange fetched above is empty or does not properly align to an accession.
                # This has occurred in the past when a reference cache issue caused the nt_loc_db and nt indices to be out of sync.
                # Such issues should be investigated. However, the pipeline step can still complete with the missing data.
                log.write(
                    "ntDbIndexError: Failed to get nt sequence by accession ID "
                    f"{accession_id} {range_start} {range_end} {nt_bucket} {nt_key}: {e}"
                )
                raise
            except Exception:
                if attempt + 1 < num_retries:  # Exponential backoff
                    time.sleep(1.0 * (4**attempt))
                else:
                    log.write(
                        f"All retries failed for getting sequence by accession ID {accession_id}."
                    )
                    raise
            finally:
                # Best-effort cleanup of the temporary byterange file
                try:
                    os.remove(range_file)
                except OSError:
                    pass
        accession_file_full_path = f"{os.getcwd()}/{accession_file}"
        return seq_len, seq_name, accession_file_full_path
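
For reference, the shell pipeline above (tail -n+2 | tr -d '\n' plus head -1) can be expressed in pure Python; a sketch, not part of the original step:

def split_fasta_byterange(range_file, accession_file):
    """Write the de-newlined sequence to accession_file; return the header.

    Pure-Python equivalent of the shell pipeline above (a sketch).
    """
    with open(range_file) as f, open(accession_file, "w") as out:
        header = f.readline().rstrip("\n")
        for line in f:
            out.write(line.rstrip("\n"))
    # Mirrors the .split(" ", 1)[1] above: drop the leading ">ACCESSION" token
    return header.split(" ", 1)[1]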
Example #4
    def get_sequence_by_accession_id_s3(accession_id, nt_loc_dict, nt_bucket,
                                        nt_key):
        seq_len = 0
        seq_name = ''
        entry = nt_loc_dict.get(accession_id)
        if not entry:
            return seq_len, seq_name, None  # match the arity of the success-path return

        range_start, name_length, seq_len = entry

        accession_file = f'accession-{accession_id}'
        num_retries = 3
        for attempt in range(num_retries):
            try:
                range_file = f'range-{attempt}-accession-{accession_id}'
                range_end = range_start + name_length + seq_len - 1
                s3.fetch_byterange(range_start, range_end, nt_bucket, nt_key,
                                   range_file)

                # (1) Take everything after the first (header) line, remove the
                # newline chars, and write the sequence into accession_file
                # (2) Send the first (header) line to stdout
                cmd = """cat {range_file} |tail -n+2 |tr -d '\\n' > {accession_file}; cat {range_file} |head -1""".format(
                    range_file=range_file, accession_file=accession_file)
                seq_name = subprocess.check_output(
                    cmd, executable='/bin/bash',
                    shell=True).decode("utf-8").split(" ", 1)[1]
                seq_name = seq_name.replace("\n", "")

                # Get the sequence length based on the file size
                seq_len = os.stat(accession_file).st_size
                break
            except Exception as e:
                if attempt + 1 < num_retries:  # Exponential backoff
                    time.sleep(1.0 * (4**attempt))
                else:
                    msg = f"All retries failed for getting sequence by accession ID {accession_id}: {e}"
                    raise RuntimeError(msg)
            finally:
                # Best-effort cleanup of the temporary byterange file
                try:
                    os.remove(range_file)
                except OSError:
                    pass
        accession_file_full_path = f"{os.getcwd()}/{accession_file}"
        return seq_len, seq_name, accession_file_full_path
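
The (range_start, name_length, seq_len) entries make range_start + name_length + seq_len - 1 the last byte of a FASTA record, which is why range_end is computed that way. For illustration only (the real nt_loc_db construction is not part of these examples), such an index could be built like this:

def build_loc_index(fasta_path):
    """Map accession -> (range_start, name_length, seq_len) byte offsets.

    Assumed layout: name_length counts the full header line (including its
    newline); seq_len counts the raw sequence bytes that follow, newlines
    included, so range_start + name_length + seq_len - 1 is the last byte.
    """
    loc = {}
    with open(fasta_path, "rb") as f:
        accession = range_start = name_length = seq_len = None
        offset = 0
        for line in f:
            if line.startswith(b">"):
                if accession is not None:
                    loc[accession] = (range_start, name_length, seq_len)
                accession = line[1:].split()[0].decode()
                range_start, name_length, seq_len = offset, len(line), 0
            else:
                seq_len += len(line)
            offset += len(line)
        if accession is not None:
            loc[accession] = (range_start, name_length, seq_len)
    return loc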
Example #5
    def run(self):
        output_file = self.output_files_local()[0]
        byterange_dict = self.additional_attributes["taxon_byteranges"]

        # Retrieve IDseq taxon fasta files
        partial_fasta_files = []
        for hit_type, byterange in byterange_dict.items():
            first_byte, last_byte, s3_file = byterange
            local_basename = f"{hit_type}_{os.path.basename(output_file)}.fasta"
            bucket, key = s3.split_identifiers(s3_file)
            local_file = os.path.join(self.output_dir_local, local_basename)
            s3.fetch_byterange(first_byte, last_byte, bucket, key, local_file)
            partial_fasta_files.append(local_file)
        self.fasta_union(partial_fasta_files, output_file)
        for fasta in partial_fasta_files + [output_file]:
            print(f"{count.reads(fasta)} reads in {fasta}")

        # Trim Illumina adapters
        # TODO: consider moving this to the beginning of the main pipeline
        self.trim_adapters_in_place(output_file)
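
s3.fetch_byterange is a helper from the surrounding codebase; a ranged GET via boto3 would behave similarly. A minimal sketch, assuming inclusive byte offsets and no retry logic (the real helper may differ):

import boto3

def fetch_byterange(first_byte, last_byte, bucket, key, local_file):
    """Download bytes [first_byte, last_byte] of s3://bucket/key (a sketch)."""
    s3_client = boto3.client("s3")
    resp = s3_client.get_object(
        Bucket=bucket, Key=key,
        Range=f"bytes={first_byte}-{last_byte}")  # HTTP byte ranges are inclusive
    with open(local_file, "wb") as f:
        f.write(resp["Body"].read())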