Esempi in Python per fetch_from_s3, esempi in Python per idseq_dag.util.s3.fetch_from_s3

Esempio n. 1

0

Mostra file

File: run_alignment_remotely.py Progetto: rcs333/idseq-dag

    def run(self):
        """Run the alignment remotely, then call hits and tally taxon counts.

        The service is read from ``additional_attributes["service"]`` and must
        be ``"gsnap"`` or ``"rapsearch2"``. Reference databases are fetched
        from S3 into ``self.ref_dir_local`` once the alignment has finished.
        """
        reads = self.get_input_fas()
        raw_m8, dedup_m8, hit_summary, counts_json = self.output_files_local()
        service = self.additional_attributes["service"]
        assert service in ("gsnap", "rapsearch2")

        # TODO: run the alignment remotely and make lazy_chunk=True, revisit this later
        self.run_remotely(reads, raw_m8, service)

        # Reference data needed to turn the raw alignments into taxon hits.
        lineage_db = fetch_from_s3(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=True)
        accession2taxid_db = fetch_from_s3(
            self.additional_files["accession2taxid_db"],
            self.ref_dir_local,
            allow_s3mi=True)
        # Fall back to the default blacklist when the step does not name one.
        blacklist_s3_file = self.additional_attributes.get(
            'taxon_blacklist', DEFAULT_BLACKLIST_S3)
        taxon_blacklist = fetch_from_s3(blacklist_s3_file, self.ref_dir_local)
        m8.call_hits_m8(
            raw_m8, lineage_db, accession2taxid_db,
            dedup_m8, hit_summary, taxon_blacklist)

        # gsnap results are counted as NT, everything else as NR.
        if service == 'gsnap':
            db_type = 'NT'
        else:
            db_type = 'NR'
        # rapsearch2 e-values are flagged as log10, everything else as raw.
        if service == 'rapsearch2':
            evalue_type = 'log10'
        else:
            evalue_type = 'raw'

        # The deuterostome database is optional; pass None when not configured.
        deuterostome_db = None
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = fetch_from_s3(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=True)

        m8.generate_taxon_count_json_from_m8(
            dedup_m8, hit_summary, evalue_type, db_type,
            lineage_db, deuterostome_db, counts_json)

Esempio n. 2

0

Mostra file

File: fetch_tax_info.py Progetto: jonason91/idseq-workflows

    def run(self):
        '''
            1. fetch the taxid -> wikipedia link mapping
            2. fetch wikipedia content
            3. store everything

            The link mapping is reused from S3 when it has already been
            generated; otherwise it is built via fetch_ncbi_wiki_map.
        '''
        taxid_list = self.input_files_local[0][0]
        taxid2wiki, taxid2desc = self.output_files_local()

        taxid2wikidict = {}
        attrs = self.additional_attributes
        Entrez.email = attrs.get("entrez_email", "*****@*****.**")
        num_threads = attrs.get("threads", 16)
        batch_size = attrs.get("batch_size", 100)

        # Optional taxid -> name table, stored as "<taxid>,<name>" rows.
        namecsv = self.additional_files.get("taxon2name")
        id2namedict = {}
        if namecsv:
            # This is fetching a reference without fetch_reference; but ok
            # because it does not run from the actual pipeline
            namecsvf = s3.fetch_from_s3(namecsv, "/mnt/idseq/ref")
            with open(namecsvf, 'r') as namef:
                for row in namef:
                    columns = row.rstrip().split(",")
                    id2namedict[columns[0]] = columns[1]

        # This is fetching a reference without fetch_reference and doing a
        # presence check; but ok because it does not run from the actual pipeline
        already_generated = s3.check_s3_presence(self.s3_path(taxid2wiki))
        if not already_generated:
            self.fetch_ncbi_wiki_map(num_threads, batch_size, taxid_list,
                                     taxid2wikidict)
        else:
            # Reuse the previously generated "<taxid>\t<url>" mapping.
            taxid2wiki = s3.fetch_from_s3(self.s3_path(taxid2wiki), taxid2wiki)
            with open(taxid2wiki, "r") as taf:
                for row in taf:
                    key, val = row.rstrip("\n").split("\t")
                    taxid2wikidict[key] = val

        # output dummy for actual wiki content for now
        taxid2wikicontent = {}
        self.fetch_wiki_content(num_threads * 4, taxid2wikidict,
                                taxid2wikicontent, id2namedict)

        with open(taxid2desc, 'w') as desc_outputf:
            json.dump(taxid2wikicontent, desc_outputf)

        # output the taxid 2 wikiurl data
        with open(taxid2wiki, 'w') as taxidoutf:
            for taxid, wikiurl in taxid2wikidict.items():
                if wikiurl == "":
                    # No URL recorded: fall back to a curid link when the
                    # fetched content carries a page id.
                    content = taxid2wikicontent.get(taxid, {})
                    pageid = content.get('pageid', None)
                    if pageid:
                        wikiurl = f"http://en.wikipedia.org/wiki/index.html?curid={pageid}"
                taxidoutf.write(f"{taxid}\t{wikiurl}\n")

Esempio n. 3

0

Mostra file

File: run_gsnap_filter.py Progetto: rcs333/idseq-dag

    def run(self):
        """Align the input reads with gsnapl and keep the unmapped ones.

        The SAM output is registered for upload, and the unmapped reads
        (pairs or singles, depending on the number of inputs) are written to
        the step's output files.
        """
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()

        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_files_to_upload.append(output_sam_file)

        # The fetched genome path is split into gsnap's -D (directory) and
        # -d (index name) arguments.
        genome_dir = fetch_from_s3(
            self.additional_files["gsnap_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        gsnap_base_dir, gsnap_index_name = (os.path.dirname(genome_dir),
                                            os.path.basename(genome_dir))

        # Run Gsnap
        fixed_flags = [
            'gsnapl', '-A sam', '--batch=0', '--use-shared-memory=0',
            '--gmap-mode=all', '--npaths=1', '--ordered', '-t 32',
            '--max-mismatches=40',
        ]
        location_flags = [
            '-D', gsnap_base_dir,
            '-d', gsnap_index_name,
            '-o', output_sam_file,
        ]
        gsnap_params = fixed_flags + location_flags + input_fas
        command.execute(" ".join(gsnap_params))
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        paired = len(input_fas) == 2
        if not paired:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
        else:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)

Esempio n. 4

0

Mostra file

 def get_genbank_genomes(self,
                         reference_taxids,
                         destination_dir,
                         superkingdom_name,
                         n=10):
     '''
     Retrieve up to n GenBank reference genomes under the reference_taxids.
     Assumes reference_taxids are species-level or below.
     Also assumes they are all in the same superkingdom, which is the only thing we need in our application.
     Saves the references under file names compatible with MakeKSNP3infile.
     TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).

     Returns a dict mapping assembly accession -> local fasta path. The dict
     comes from the first GenBank category that yields any genome; an empty
     dict is returned when n is 0, reference_taxids is empty, or no category
     has a match.
     '''
     if n == 0 or not reference_taxids:
         return {}
     # Spread the budget across taxids, but always ask for at least one each.
     n_per_taxid = max(n // len(reference_taxids), 1)
     genbank_categories_by_superkingdom = {
         "Viruses": ["viral"],
         "Bacteria": ["bacteria"],
         "Eukaryota": ["fungi", "protozoa"],
         None: ["bacteria", "viral", "fungi", "protozoa"]
     }
     # additional options in genbank that we probably don't need right now:
     # ["archaea", "plant",
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     categories = genbank_categories_by_superkingdom[superkingdom_name]
     for cat in categories:
         genome_list_path_s3 = f"s3://idseq-public-references/genbank/{cat}/assembly_summary.txt"  # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
         genome_list_local = s3.fetch_from_s3(genome_list_path_s3,
                                              destination_dir)
         genomes = []
         for taxid in reference_taxids:
             taxid_genomes = PipelineStepGeneratePhyloTree.get_taxid_genomes(
                 genome_list_local, taxid, n_per_taxid)
             # Keep first-seen order while skipping entries already collected.
             genomes += [
                 entry for entry in taxid_genomes if entry not in genomes
             ]
         genomes = genomes[:n]
         command.remove_file(genome_list_local)
         if not genomes:
             # Nothing in this category; try the next one.
             continue
         genbank_fastas = {}
         for line in genomes:
             assembly_accession, _taxid, _species_taxid, _organism_name, ftp_path = line.split(
                 "\t")
             ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
             tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
             local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
             if os.path.isfile(local_fasta):
                 # Strip only the trailing extension. Splitting on the first
                 # '.' would truncate the path whenever destination_dir
                 # itself contains a dot.
                 local_fasta = f"{os.path.splitext(local_fasta)[0]}__I.fasta"
             command.execute(
                 command_patterns.SingleCommand(
                     cmd='wget',
                     args=["-O", f"{local_fasta}.gz", ftp_fasta_gz]))
             command.execute(
                 command_patterns.SingleCommand(
                     cmd='gunzip', args=[f"{local_fasta}.gz"]))
             genbank_fastas[assembly_accession] = local_fasta
         return genbank_fastas
     return {}

Esempio n. 5

0

Mostra file

    def run(self):
        """Rewrite the input FASTA with lineage taxids added to each read name.

        The input is consumed two lines at a time (header line, sequence
        line). Each header is rewritten as
        ``>family_nr:..:family_nt:..:genus_nr:..:genus_nt:..:species_nr:..:species_nt:..:<original id>``
        and the sequence line is copied through unchanged. Processing stops at
        the first empty header or sequence line (i.e. end of file).

        The lineage shelf and both FASTA handles are closed even when an
        exception is raised part-way through.
        """
        # Setup
        if len(self.input_files_local) > 1:
            input_fa_name = self.input_files_local[0][0]
            hit_summary_files = {
                'NT': self.input_files_local[1][2],
                'NR': self.input_files_local[2][2]
            }
        else:
            # TODO(yf): Old implementation. TO BE DEPRECATED once 3.1 is fully deployed
            input_files = self.input_files_local[0]
            input_fa_name = input_files[0]
            hit_summary_files = {'NT': input_files[1], 'NR': input_files[2]}

        # Open lineage db
        lineage_db = s3.fetch_from_s3(self.additional_files["lineage_db"],
                                      self.ref_dir_local,
                                      allow_s3mi=True)
        # shelve is given the path without the ".db" suffix.
        lineage_map = shelve.open(lineage_db.replace(".db", ""))
        try:
            # Get primary hit mappings
            valid_hits = PipelineStepGenerateTaxidFasta.parse_hits(
                hit_summary_files)

            with open(input_fa_name, 'rb') as input_fa, \
                    open(self.output_files_local()[0], 'wb') as output_fa:
                seq_name = input_fa.readline()
                seq_data = input_fa.readline()
                while len(seq_name) > 0 and len(seq_data) > 0:
                    # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
                    # :12720:8743/2"
                    # Translate the read information into our custom format with fake
                    # taxids at non-specific hit levels.
                    annotated_read_id = seq_name.decode("utf-8").rstrip().lstrip('>')
                    # Everything after the fourth ':' is the bare read id.
                    read_id = annotated_read_id.split(":", 4)[-1]

                    nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                        valid_hits, lineage_map, read_id, 'NR')
                    nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                        valid_hits, lineage_map, read_id, 'NT')

                    fields = [
                        "family_nr", nr_taxid_family, "family_nt", nt_taxid_family
                    ]
                    fields += ["genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus]
                    fields += [
                        "species_nr", nr_taxid_species, "species_nt", nt_taxid_species
                    ]
                    fields += [annotated_read_id]
                    new_read_name = ('>' + ':'.join(fields) + '\n').encode()

                    output_fa.write(new_read_name)
                    output_fa.write(seq_data)
                    seq_name = input_fa.readline()
                    seq_data = input_fa.readline()
        finally:
            # Release the shelf deterministically rather than relying on GC.
            lineage_map.close()

Esempio n. 6

0

Mostra file

def download_to_compare(to_compare):
    """Replace every ``s3://`` entry of *to_compare* with a local copy, in place.

    Each S3 path is downloaded to ``tmp-<index>-<basename>`` in the current
    working directory (an existing file of that name is removed first), and
    the list entry is overwritten with the path returned by the fetch.
    Non-S3 entries are left untouched.

    Raises:
        RuntimeError: if the fetch returns None for an S3 path.
    """
    for i, path in enumerate(to_compare):
        if not path.startswith("s3://"):
            continue
        local_name = f"tmp-{i}-" + os.path.basename(path)
        if os.path.isfile(local_name):
            os.remove(local_name)
        # Keep the S3 path in `path` so the error message below can name it;
        # rebinding it to the fetch result would always report "None".
        local_path = s3.fetch_from_s3(path, local_name, allow_s3mi=True)
        if local_path is None:
            raise RuntimeError(f"Fetch from S3 failed for {path}")
        to_compare[i] = local_path

Esempio n. 7

0

Mostra file

File: run_srst2.py Progetto: rcs333/idseq-dag

 def get_common_params(self):
     """Build the srst2 arguments shared by paired-end and single-end runs.

     Fetches the resistance gene database into the output directory and
     returns the argument list as strings.
     """
     gene_db = fetch_from_s3(self.additional_files["resist_gene_db"],
                             self.output_dir_local)
     # srst2 expects strings, but the dag may pass these in as numbers.
     min_cov = str(self.additional_attributes['min_cov'])
     n_threads = str(self.additional_attributes['n_threads'])
     output_prefix = os.path.join(self.output_dir_local, 'output')

     params = ['--min_coverage', min_cov]
     params += ['--threads', n_threads]
     params += ['--output', output_prefix]
     params += ['--log']
     params += ['--gene_db', gene_db]
     return params

Esempio n. 8

0

Mostra file

File: download_accessions.py Progetto: rcs333/idseq-dag

 def run(self):
     """Write the reference sequences for the accessions in the hit summary.

     With fewer accessions than MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD the
     sequences are pulled individually from S3; otherwise the whole
     alignment database is downloaded and read locally.
     """
     _align_m8, _deduped_m8, hit_summary, _orig_counts = self.input_files_local[0]
     output_reference_fasta = self.output_files_local()[0]

     loc_db = s3.fetch_from_s3(self.additional_files["loc_db"],
                               self.ref_dir_local,
                               allow_s3mi=True)
     db_s3_path = self.additional_attributes["db"]
     db_type = self.additional_attributes["db_type"]
     # Fetched for its download side effect; the local path is not used below.
     lineage_db = s3.fetch_from_s3(self.additional_files["lineage_db"],
                                   self.ref_dir_local,
                                   allow_s3mi=True)

     _read_dict, accession_dict, _selected_genera = m8.summarize_hits(hit_summary)

     whole_db = not (len(accession_dict) < MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD)
     if whole_db:
         # download the whole alignment db
         db_path = s3.fetch_from_s3(db_s3_path,
                                    self.ref_dir_local,
                                    allow_s3mi=True)
         self.download_ref_sequences_from_file(
             accession_dict, loc_db, db_path, output_reference_fasta)
     else:
         self.download_ref_sequences_from_s3(
             accession_dict, output_reference_fasta, db_type, loc_db,
             db_s3_path)

Esempio n. 9

0

Mostra file

    def run(self):
        """Run STAR to filter out host reads.

        The genome index is split into parts (the count is read from
        ``parts.txt``). STAR runs once per part, and each run takes the
        unmapped reads of the previous one as input. The final unmapped files
        are moved to the step outputs and the scratch directory is emptied.
        """
        # Setup
        input_files = self.input_files_local[0][0:2]
        num_inputs = len(input_files)
        scratch_dir = os.path.join(self.output_dir_local, "scratch_star")

        output_files_local = self.output_files_local()
        output_gene_file = self.additional_attributes.get("output_gene_file")

        genome_dir = s3.fetch_from_s3(
            self.additional_files["star_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)

        # Check parts file for the number of partitioned indexes
        parts_file = os.path.join(genome_dir, "parts.txt")
        assert os.path.isfile(parts_file)
        with open(parts_file, 'rb') as parts_f:
            num_parts = int(parts_f.read())

        # Run STAR on each partition and save the unmapped read info
        unmapped = input_files
        for part_idx in range(num_parts):
            is_first_part = part_idx == 0
            part_scratch = f"{scratch_dir}/star-part-{part_idx}"
            part_genome = f"{genome_dir}/part-{part_idx}"
            self.run_star_part(part_scratch, part_genome, unmapped,
                               is_first_part)

            leftover = PipelineStepRunStar.unmapped_files_in(
                part_scratch, num_inputs)
            unmapped = PipelineStepRunStar.sync_pairs(leftover)

            # Run part 0 in gene-counting mode:
            # (a) ERCCs are doped into part 0 and we want their counts.
            # (b) If there is only 1 part (e.g. human), the host gene counts also
            # make sense.
            if not is_first_part:
                continue
            gene_count_file = os.path.join(part_scratch,
                                           "ReadsPerGene.out.tab")
            if os.path.isfile(gene_count_file) and output_gene_file:
                moved = os.path.join(self.output_dir_local, output_gene_file)
                command.execute(f"mv {gene_count_file} {moved}")
                self.additional_files_to_upload.append(moved)

        # Cleanup
        for src, dst in zip(unmapped, output_files_local):
            # Move out of scratch dir
            command.execute(f"mv {src} {dst}")
        # NOTE(review): with ';' the rm still runs if the cd fails (e.g. the
        # scratch dir was never created) -- consider '&&'; confirm how
        # command.execute treats a non-zero exit first.
        command.execute("cd %s; rm -rf *" % scratch_dir)

Esempio n. 10

0

Mostra file

File: generate_phylo_tree.py Progetto: rcs333/idseq-dag

 def get_genbank_genomes(self, reference_taxids, destination_dir, superkingdom_name, n=10):
     '''
     Retrieve up to n GenBank reference genomes under the reference_taxids.
     Assumes reference_taxids are species-level or below.
     Also assumes they are all in the same superkingdom, which is the only thing we need in our application.
     Saves the references under file names compatible with MakeKSNP3infile.
     TODO: Retrieve the genomes from S3 rather than ftp.ncbi.nih.gov (JIRA/IDSEQ-334).

     Returns a dict mapping assembly accession -> local fasta path. The dict
     comes from the first GenBank category that yields any genome; an empty
     dict is returned when n is 0, reference_taxids is empty, or no category
     has a match.
     '''
     if n == 0 or not reference_taxids:
         return {}
     # Spread the budget across taxids, but always ask for at least one each.
     n_per_taxid = max(n // len(reference_taxids), 1)
     genbank_categories_by_superkingdom = {
         "Viruses": ["viral"],
         "Bacteria": ["bacteria"],
         "Eukaryota": ["fungi", "protozoa"],
         None: ["bacteria", "viral", "fungi", "protozoa"]
     }
     # additional options in genbank that we probably don't need right now:
     # ["archaea", "plant",
     # "vertebrate_mammalian", "vertebrate_other", "invertebrate",
     # "other", "metagenomes"]
     categories = genbank_categories_by_superkingdom[superkingdom_name]
     for cat in categories:
         genome_list_path_s3 = f"s3://idseq-database/genbank/{cat}/assembly_summary.txt" # source: ftp://ftp.ncbi.nih.gov/genomes/genbank/{cat}/assembly_summary.txt
         genome_list_local = s3.fetch_from_s3(genome_list_path_s3, destination_dir)
         genomes = []
         for taxid in reference_taxids:
             cmd = f"cut -f1,6,7,8,20 {genome_list_local}" # columns: 1 = assembly_accession; 6 = taxid; 7 = species_taxid, 8 = organism_name, 20 = ftp_path
             cmd += f" | awk -F '\t' '$2 == {taxid}'" # try to find taxid in the taxid column (2nd column of the piped input)
             cmd += f" | head -n {n_per_taxid}" # take only top n_per_taxid results
             taxid_genomes = list(filter(None, command.execute_with_output(cmd).split("\n")))
             # Keep first-seen order while skipping entries already collected.
             genomes += [entry for entry in taxid_genomes if entry not in genomes]
         genomes = genomes[:n]
         command.execute_with_output(f"rm {genome_list_local}")
         if not genomes:
             # Nothing in this category; try the next one.
             continue
         genbank_fastas = {}
         for line in genomes:
             assembly_accession, _taxid, _species_taxid, _organism_name, ftp_path = line.split("\t")
             ftp_fasta_gz = f"{ftp_path}/{os.path.basename(ftp_path)}_genomic.fna.gz"
             tree_node_name = f"genbank_{self.clean_name_for_ksnp3(assembly_accession)}"
             local_fasta = f"{destination_dir}/{tree_node_name}.fasta"
             if os.path.isfile(local_fasta):
                 # Strip only the trailing extension. Splitting on the first
                 # '.' would truncate the path whenever destination_dir
                 # itself contains a dot.
                 local_fasta = f"{os.path.splitext(local_fasta)[0]}__I.fasta"
             command.execute(f"wget -O {local_fasta}.gz {ftp_fasta_gz}")
             command.execute(f"gunzip {local_fasta}.gz")
             genbank_fastas[assembly_accession] = local_fasta
         return genbank_fastas
     return {}

Esempio n. 11

0

Mostra file

    def run(self):
        """
        Trim any residual Illumina adapters.
        Discard any reads that become too short.

        Inputs that are not fastq are copied to the outputs unchanged.

        See: http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf
        """
        input_files = self.input_files_local[0][0:2]
        output_files = self.output_files_local()
        is_paired = (len(input_files) == 2)
        adapter_fasta = s3.fetch_from_s3(
            self.additional_files["adapter_fasta"], self.ref_dir_local)

        is_fastq = fasta.input_file_type(input_files[0]) == 'fastq'
        if not is_fastq:
            # Nothing to trim: pass the files straight through.
            for in_file, out_file in zip(input_files, output_files):
                command.execute(f"cp {in_file} {out_file}")
            return

        if not is_paired:
            paired_arg = "SE"
            output_args = output_files
        else:
            paired_arg = "PE"
            r1_out, r2_out = output_files[0], output_files[1]
            output_args = [
                r1_out,  # R1, paired, to be kept
                f"{r1_out}__unpaired",  # R1, no longer paired, to be discarded
                r2_out,  # R2, paired, to be kept
                f"{r2_out}__unpaired",  # R2, no longer paired, to be discarded
            ]

        cmd_parts = [
            "java -jar /usr/local/bin/trimmomatic-0.38.jar",
            paired_arg,
            "-phred33",
        ]
        cmd_parts.extend(input_files)
        cmd_parts.extend(output_args)
        # Remove Illumina adapters provided in the fasta file. Initially, look
        # for seed matches allowing maximally *2* mismatches. These seeds will
        # be extended and clipped if in the case of paired end reads a score
        # of *30* is reached, or in the case of single ended reads a score of
        # *10*.
        cmd_parts.append(f"ILLUMINACLIP:{adapter_fasta}:2:30:10")
        # Discard reads which are less than *75* bases long after these steps.
        cmd_parts.append("MINLEN:75")
        command.execute(" ".join(cmd_parts))

Esempio n. 12

0

Mostra file

File: run_bowtie2.py Progetto: rcs333/idseq-dag

    def run(self):
        """Align the input reads with bowtie2 and keep the unmapped ones.

        The SAM output is registered for upload. A configured ``random_seed``
        is passed as ``--seed``; otherwise bowtie2 runs with one thread per
        CPU.
        """
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        genome_dir = fetch_from_s3(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_files_to_upload.append(output_sam_file)

        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        # The code below will handle up to "bowtie2_genome/GRCh38.primary_assembly.
        # genome.99.bt2" but not 100.
        listing = command.execute_with_output(
            "ls {}/*.bt2*".format(genome_dir))
        first_index_file = listing.split("\n")[0]
        # Drop the last six characters (e.g. ".3.bt2"), then any dot left
        # dangling at the end.
        genome_basename = first_index_file[:-6]
        if genome_basename[-1] == '.':
            genome_basename = genome_basename[:-1]

        bowtie2_params = ['bowtie2', '-q', '-x', genome_basename]
        bowtie2_params += ['-f', '--very-sensitive-local']
        bowtie2_params += ['-S', output_sam_file]

        seed = self.additional_attributes.get("random_seed")
        if not seed:
            # Seed option won't work with -p threading option.
            bowtie2_params += ['-p', str(multiprocessing.cpu_count())]
        else:
            bowtie2_params += ['--seed', str(seed)]

        paired = len(input_fas) == 2
        if paired:
            bowtie2_params += ['-1', input_fas[0], '-2', input_fas[1]]
        else:
            bowtie2_params += ['-U', input_fas[0]]
        command.execute(" ".join(bowtie2_params))
        log.write("Finished Bowtie alignment.")

        # Extract the unmapped reads from the SAM output.
        if paired:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])

Esempio n. 13

0

Mostra file

File: build_custom_blast_index.py Progetto: jonason91/idseq-workflows

    def run(self):
        """
          Build custom blast index from an S3 location or a url

          The source file is fetched into the output directory, decompressed
          when it ends in .bz2 or .zip, indexed with makeblastdb and finally
          archived with tar.
        """
        _input_files = self.input_files_local[0]  # dummy in this case  # noqa
        output_tar_file = self.output_files_local()[0]

        db_type = self.additional_attributes['db_type']
        file_source = self.additional_attributes['data_source']
        output_db_name = output_tar_file.replace(".tar", "")

        from_s3 = file_source.startswith("s3://")
        if not from_s3:
            # Download from the URL with urllib and keep the raw file hidden
            # from the regular outputs.
            db_file = os.path.join(self.output_dir_local,
                                   os.path.basename(file_source))
            urllib.request.urlretrieve(file_source, db_file)
            self.additional_output_files_hidden.append(db_file)
        else:
            db_file = s3.fetch_from_s3(file_source, self.output_dir_local)

        # Decompress when needed; both suffixes are four characters long, so
        # the decompressed name is the path minus its last four characters.
        if db_file.endswith(".bz2"):
            unpack = command_patterns.SingleCommand(cmd='bzip2',
                                                    args=["-dk", db_file])
        elif db_file.endswith(".zip"):
            unpack = command_patterns.SingleCommand(cmd='unzip',
                                                    args=[db_file])
        else:
            unpack = None
        if unpack is not None:
            command.execute(unpack)
            db_file = db_file[:-4]

        # Build blast index
        makeblastdb_args = ["-in", db_file]
        makeblastdb_args += ["-dbtype", db_type]
        makeblastdb_args += ["-out", output_db_name]
        command.execute(
            command_patterns.SingleCommand(cmd='makeblastdb',
                                           args=makeblastdb_args))

        # Archive every file produced under the database name.
        tar_args = ["cvf", output_tar_file, output_db_name + ".*"]
        command.execute(
            command_patterns.SingleCommand(cmd='tar', args=tar_args))

Esempio n. 14

0

Mostra file

    def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
                  remote_work_dir, remote_username, input_files, key_path,
                  service, lazy_run):
        """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
        group machines and handle their execution.
        """
        assert service in ("gsnap", "rapsearch2")

        chunk_id = int(input_files[0].split(part_suffix)[-1])
        multihit_basename = f"multihit-{service}-out{part_suffix}{chunk_id}.m8"
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_remote_outfile = os.path.join(remote_work_dir,
                                               multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        def aws_cp_operation(input_fa):
            return "aws s3 cp --only-show-errors {src} {dest}".format(
                src=shlex.quote(
                    os.path.join(self.chunks_result_dir_s3, input_fa)),
                dest=shlex.quote(os.path.join(remote_work_dir, input_fa)))

        download_input_from_s3 = " ; ".join(map(aws_cp_operation, input_files))

        # Clean up remote work directory before running
        #   This ensures that files from a failed previous run that may still be on the instance
        #   are removed so they don't corrupt the current run
        base_str = "rm -rf {remote_work_dir} ; mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
        environment = self.additional_attributes["environment"]

        # See step class docstrings for more parameter details.
        if service == "gsnap":
            commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 48 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        else:
            commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

        commands = commands.format(
            remote_work_dir=shlex.quote(remote_work_dir),
            download_input_from_s3=download_input_from_s3,
            remote_home_dir=shlex.quote(remote_home_dir),
            remote_index_dir=shlex.quote(remote_index_dir),
            remote_input_files=" ".join(
                shlex.quote(remote_work_dir + "/" + input_fa)
                for input_fa in input_files),
            multihit_remote_outfile=shlex.quote(multihit_remote_outfile) if
            service == "gsnap" else shlex.quote(multihit_remote_outfile[:-3])
            # Strip the .m8 for RAPSearch as it adds that
        )

        if lazy_run and fetch_from_s3(multihit_s3_outfile,
                                      multihit_local_outfile,
                                      okay_if_missing=True,
                                      allow_s3mi=False):
            log.write(
                f"finished alignment for chunk {chunk_id} with {service} by lazily fetching last result"
            )
        else:
            chunk_timeout = int(
                self.additional_attributes.get(
                    f"{service.lower()}_chunk_timeout", DEFAULT_CHUNK_TIMEOUT))
            for try_number in range(1, CHUNK_MAX_TRIES + 1):
                log.write(
                    f"waiting for {service} server for chunk {chunk_id}. Try #{try_number}"
                )
                with ASGInstance(service, key_path, remote_username,
                                 environment, chunk_id, try_number,
                                 self.additional_attributes) as instance_ip:
                    # Try/Except block needs to be inside the ASGInstance context.
                    # A failure to acquire an ASGInstnace is and should be unrecoverable.
                    chunk_status = None
                    elapsed = 0.0
                    try:
                        t_start = time.time()
                        try:
                            command.execute(command.remote(
                                commands, key_path, remote_username,
                                instance_ip),
                                            timeout=chunk_timeout)
                        except:
                            chunk_status = ChunkStatus.CRASH
                            raise
                        finally:
                            elapsed = time.time() - t_start
                            if chunk_status == ChunkStatus.CRASH and elapsed >= chunk_timeout:
                                chunk_status = ChunkStatus.TIMEOUT

                        output_corrupt = self.__check_if_output_is_corrupt(
                            service, key_path, remote_username, instance_ip,
                            multihit_remote_outfile, chunk_id, try_number)

                        if output_corrupt:
                            chunk_status = ChunkStatus.CORRUPT_OUTPUT
                            assert not output_corrupt, output_corrupt

                        # Yay, chunk succeeded.  Copy from server and break out of retry loop.
                        try:
                            self.__copy_multihit_remote_outfile(
                                key_path, remote_username, instance_ip,
                                multihit_remote_outfile,
                                multihit_local_outfile)
                            chunk_status = ChunkStatus.SUCCESS
                            break
                        except:
                            # If we failed to copy from the server, it's as bad as a crash in alignment.
                            chunk_status = ChunkStatus.CRASH
                            raise

                    except Exception as e:

                        # 1. No backoff needed here before retrying.  We rate limit chunk dispatch (the ASGInstance
                        # acquisition above is blocking).  ASGInstance acquisition also tries to ensure that every
                        # chunk flight gets its first try before any retry is dispatched.

                        # 2. If the reason we failed is timeout on the server, we don't retry.  The operator must decide
                        # whether to QC the data more, or use smaller chunk size.  In fact, we only retry for CRASH and
                        # CORRUPT_OUTPUT.

                        # 3. If this is the last attempt, we gotta re-raise the exception.

                        # 4. Elapsed time is only the time spent in alignment.  It excludes the time spent waiting to
                        # acquire ASGinstance.

                        log.log_event('alignment_remote_error',
                                      values={
                                          "chunk": chunk_id,
                                          "try_number": try_number,
                                          "CHUNK_MAX_TRIES": CHUNK_MAX_TRIES,
                                          "chunk_status": chunk_status,
                                          "elapsed": elapsed,
                                          "chunk_timeout": chunk_timeout,
                                          "exception": log.parse_exception(e)
                                      })
                        retrying_might_help = chunk_status in (
                            ChunkStatus.CORRUPT_OUTPUT, ChunkStatus.CRASH)
                        if try_number < CHUNK_MAX_TRIES and retrying_might_help:
                            # Retry!
                            continue
                        else:
                            # End of the road.
                            raise
                    finally:
                        # None chunk_status indicates code bug above.  An exception has been raised already
                        # for it, and it says nothing about whether the alignment succeeded or not.
                        if chunk_status != None:
                            chunk_status_tracker(service).note_outcome(
                                instance_ip, chunk_id, elapsed, chunk_status,
                                try_number)
                        self.__delete_remote_dir(remote_work_dir, key_path,
                                                 remote_username, instance_ip)

            # Upload to s3
            with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
                command.execute(
                    command_patterns.SingleCommand(
                        cmd="aws",
                        args=[
                            "s3", "cp", "--only-show-errors",
                            multihit_local_outfile,
                            os.path.join(self.chunks_result_dir_s3, "")
                        ]))
            log.write(
                f"finished alignment for chunk {chunk_id} on {service} server {instance_ip}"
            )

        # Whether lazy or not lazy, we've now got the chunk result locally here.
        return multihit_local_outfile

Esempio n. 15

0

Mostra file

 def fetch_key(self, key_path_s3):
     """Fetch the key at `key_path_s3` into `self.output_dir_local`.

     The fetched file's mode is set to 0o400 and its local path is returned.
     """
     local_key = fetch_from_s3(key_path_s3, self.output_dir_local)
     command.chmod(local_key, 0o400)
     return local_key

Esempio n. 16

0

Mostra file

    def run(self):
        """Produce the alignment-visualization JSON output and a summary file.

        Steps, in order:
          1. Verify an ``s3://`` ``nt_db`` exists and fetch ``nt_loc_db``.
          2. Parse the annotated fasta and the annotated m8 into per-accession groups.
          3. Retrieve sequences for those accessions (from S3 or from a local file).
          4. Compute a coverage summary per accession, populate reference
             sequences and dump the result under ``<output_dir_local>/align_viz``.
          5. Write a one-line ``<output_dir_local>/align_viz.summary`` file.

        Raises:
            RuntimeError: if ``nt_db`` is an ``s3://`` path that is not present.
        """
        # Setup
        nt_db = self.additional_attributes["nt_db"]
        nt_db_on_s3 = nt_db.startswith("s3://")
        if nt_db_on_s3 and not s3.check_s3_presence(nt_db):
            raise RuntimeError(f"nt_db at {nt_db} not found.")
        nt_loc_db = s3.fetch_from_s3(self.additional_files["nt_loc_db"],
                                     self.ref_dir_local,
                                     allow_s3mi=True)
        db_type = "nt"  # Only NT supported for now
        # TODO: Design a way to map in/out files more robustly, e.g. by name/type
        annotated_m8 = self.input_files_local[0][0]
        annotated_fasta = self.input_files_local[1][0]
        output_json_dir = os.path.join(self.output_dir_local, "align_viz")

        # Go through annotated_fasta with a db_type (NT/NR match). Infer the
        # family/genus/species info
        read2seq = PipelineStepGenerateAlignmentViz.parse_reads(
            annotated_fasta, db_type)
        log.write(f"Read to Seq dictionary size: {len(read2seq)}")

        db_path = nt_loc_db.replace(".db", "")
        # Use the shelf as a context manager so the underlying dbm file is
        # closed even if one of the lookups below raises.
        with shelve.open(db_path) as nt_loc_dict:
            groups, line_count = self.process_reads_from_m8_file(
                annotated_m8, read2seq)

            if nt_db_on_s3:
                log.write("Getting sequences by accession list from S3...")
                PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                    groups, nt_loc_dict, nt_db)
            else:
                log.write("Getting sequences by accession list from file...")
                PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_file(
                    groups, nt_loc_dict, nt_db)

        for ad in groups.values():
            ad['coverage_summary'] = PipelineStepGenerateAlignmentViz.calculate_alignment_coverage(
                ad)

        result_dict, to_be_deleted = self.populate_reference_sequences(groups)

        # Delete temp files
        def safe_multi_delete(files):
            for f in files:
                try:
                    os.remove(f)
                except OSError:
                    # Best effort: a missing or undeletable temp file is not
                    # worth failing the step for.
                    pass

        # Deletion runs in the background while the JSON is being dumped.
        deleter_thread = threading.Thread(target=safe_multi_delete,
                                          args=[to_be_deleted])
        deleter_thread.start()
        try:
            self.dump_align_viz_json(output_json_dir, db_type, result_dict)
        finally:
            # Always wait for the deleter, even when the dump fails.
            deleter_thread.join()

        # Write summary file
        summary_msg = f"Read2Seq Size: {len(read2seq)}, M8 lines {line_count}, " \
                  f"{len(groups)} unique accession ids "
        summary_file_name = f"{output_json_dir}.summary"
        with open(summary_file_name, 'w') as summary_f:
            summary_f.write(summary_msg)

Esempio n. 17

0

Mostra file

    def run(self):
        """
        Build the STAR and bowtie2 host genome indexes.

        The ERCC fasta (and ERCC gtf) are concatenated in front of the host
        fasta (and the optional host gtf); the combined files are copied to the
        step outputs and then used to build both indexes.
        """
        # Set up
        host_fasta = self.input_files_local[0][0]
        ercc_fasta = s3.fetch_from_s3(self.additional_files["ercc_fasta"],
                                      self.output_dir_local,
                                      allow_s3mi=True,
                                      auto_unzip=True)
        if host_fasta.endswith('.gz'):
            # Decompress next to the original, dropping the ".gz" suffix.
            unzipped_fasta = host_fasta[:-3]
            gunzip_cmd = command_patterns.ShellScriptCommand(
                script=r'''gzip -dc "${input_fasta_path}" > "${dest_path}";''',
                named_args={
                    'input_fasta_path': host_fasta,
                    'dest_path': unzipped_fasta
                })
            command.execute(gunzip_cmd)
            host_fasta = unzipped_fasta

        # The host gtf is optional.
        host_gtf = None
        if self.additional_files.get("input_gtf"):
            host_gtf = s3.fetch_from_s3(self.additional_files["input_gtf"],
                                        self.output_dir_local,
                                        allow_s3mi=True)

        ercc_gtf = s3.fetch_from_s3(self.additional_files["ercc_gtf"],
                                    self.output_dir_local,
                                    allow_s3mi=True,
                                    auto_unzip=True)

        host_name = self.additional_attributes["host_name"]
        max_star_part_size = self.additional_attributes.get("max_star_part_size")

        # ERCC sequences go first, then the host genome.
        fasta_with_ercc = f"{host_fasta}.with_ercc"
        concat_fasta_cmd = command_patterns.ShellScriptCommand(
            script=r'''cat "${ercc_fasta_path}" "${input_fasta_path}" > "${input_fasta_with_ercc}";''',
            named_args={
                'ercc_fasta_path': ercc_fasta,
                'input_fasta_path': host_fasta,
                'input_fasta_with_ercc': fasta_with_ercc
            })
        command.execute(concat_fasta_cmd)

        if host_gtf:
            gtf_with_ercc = f"{host_gtf}.with_ercc"
            concat_gtf_cmd = command_patterns.ShellScriptCommand(
                script=r'''cat "${ercc_gtf_path}" "${input_gtf_path}" > "${input_gtf_with_ercc}";''',
                named_args={
                    'ercc_gtf_path': ercc_gtf,
                    'input_gtf_path': host_gtf,
                    'input_gtf_with_ercc': gtf_with_ercc
                })
            command.execute(concat_gtf_cmd)
        else:
            # Without a host gtf, the ERCC gtf is used on its own.
            gtf_with_ercc = ercc_gtf

        (fasta_out, gtf_out,
         star_index_out, bowtie2_index_out) = self.output_files_local()

        command.copy_file(fasta_with_ercc, fasta_out)
        command.copy_file(gtf_with_ercc, gtf_out)

        # make STAR index
        self.make_star_index(fasta_with_ercc, gtf_with_ercc,
                             star_index_out, max_star_part_size)

        # make bowtie2 index
        self.make_bowtie2_index(host_name, fasta_with_ercc, bowtie2_index_out)

Esempio n. 18

0

Mostra file

File: generate_phylo_tree.py Progetto: rcs333/idseq-dag

    def get_accession_sequences(self, dest_dir, n=10):
        '''
        Retrieve NCBI NT references for the most-matched accession in each alignment viz file, up to a maximum of n references.
        Write each reference to a separate fasta file.

        Args:
            dest_dir: directory in which the per-accession fasta files are created.
            n: maximum number of accessions to keep; 0 short-circuits to an empty result.

        Returns:
            dict mapping each kept accession to the path of its fasta file.
            Accessions for which no sequence file was retrieved are omitted.
        '''
        if n == 0:
            return {}

        # Retrieve files
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_from_s3(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)
        s3_align_viz_files = self.additional_attributes["align_viz_files"].values()
        local_align_viz_files = []
        for s3_file in s3_align_viz_files:
            local_basename = s3_file.replace("/", "-").replace(":", "-") # needs to be unique locally
            local_file = s3.fetch_from_s3(
                s3_file,
                os.path.join(self.ref_dir_local, local_basename))
            if local_file is not None:
                local_align_viz_files.append(local_file)

        # Choose accessions to process.
        # align_viz files are a bit brittle, so we just log exceptions rather than failing the job.
        accessions = set()
        for local_file in local_align_viz_files:
            try:
                with open(local_file, 'rb') as f:
                    align_viz_dict = json.load(f)
                most_matched_accession = None
                max_num_reads = 0
                flat_align_viz_dict = {}
                self.parse_tree(align_viz_dict, flat_align_viz_dict)
                for acc, info in flat_align_viz_dict.items():
                    num_reads = info["coverage_summary"]["num_reads"]
                    if num_reads > max_num_reads:
                        max_num_reads = num_reads
                        most_matched_accession = acc
                if most_matched_accession is None:
                    # No accession with reads in this file: there is nothing to
                    # add, and None must not be treated as an accession below.
                    continue
                accessions.add(most_matched_accession)
                if len(accessions) >= n:
                    break
            except Exception:
                # Catch Exception rather than everything, so that
                # KeyboardInterrupt / SystemExit still propagate.
                log.write(f"Warning: couldn't get accession from {local_file}!")
                traceback.print_exc()
        if len(accessions) > n:
            accessions = set(list(accessions)[0:n])

        # Make map of accession to sequence file
        accession2info = {acc: {} for acc in accessions}
        # Close the shelf as soon as the lookups are done.
        with shelve.open(nt_loc_db.replace(".db", "")) as nt_loc_dict:
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # Put 1 fasta file per accession into the destination directory
        accession_fastas = {}
        for acc, info in accession2info.items():
            if info.get('seq_file') is None:
                # Skip rather than fail with KeyError or link to a bogus path.
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue
            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
            # NOTE(review): these are shell strings built by interpolation; paths
            # containing spaces or shell metacharacters would break them.
            command.execute(f"ln -s {info['seq_file']} {local_fasta}")
            # Prepend a ">accession" header line to the sequence.
            command.execute(f"echo '>{acc}' | cat - {local_fasta} > temp_file && mv temp_file {local_fasta}")
            accession_fastas[acc] = local_fasta

        # Return kept accessions and paths of their fasta files
        return accession_fastas

Esempio n. 19

0

Mostra file

    def run(self):
        '''
            Refine alignment hits using assembled contigs:
            1. summarize the existing hits
            2. blast the assembled contigs against the reference fasta
            3. fold the contig-level blast results back into the read-level hits
            4. regenerate the m8, hit summary, taxon counts and contig summaries
        '''
        hit_inputs = self.input_files_local[0]
        _align_m8, deduped_m8, hit_summary, orig_counts = hit_inputs
        assembly_inputs = self.input_files_local[1]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = assembly_inputs
        reference_fasta = self.input_files_local[2][0]

        outputs = self.output_files_local()
        blast_m8, refined_m8, refined_hit_summary, refined_counts, contig_summary_json = outputs
        db_type = self.additional_attributes["db_type"]

        no_contigs = os.path.getsize(assembled_contig) < MIN_ASEEMBLED_CONTIG_SIZE
        if no_contigs or os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
            # No assembled results or refseq fasta available: pass the
            # original results through and emit placeholder outputs.
            command.execute(f"echo ' ' > {blast_m8}")
            for src, dst in ((deduped_m8, refined_m8),
                             (hit_summary, refined_hit_summary),
                             (orig_counts, refined_counts)):
                command.execute(f"cp {src} {dst}")
            command.execute(f"echo '[]' > {contig_summary_json}")
            return

        read_dict, accession_dict, _selected_genera = m8.summarize_hits(hit_summary)
        top_entry_m8 = blast_m8.replace(".m8", ".top.m8")
        PipelineStepBlastContigs.run_blast(
            assembled_contig, reference_fasta, db_type, blast_m8, top_entry_m8)

        # Both containers are filled in place by generate_info_from_sam.
        read2contig = {}
        contig_stats = defaultdict(int)
        PipelineStepRunAssembly.generate_info_from_sam(
            bowtie_sam, read2contig, contig_stats)

        updated_read_dict, read2blastm8, contig2lineage, added_reads = self.update_read_dict(
            read2contig, top_entry_m8, read_dict, accession_dict)
        self.generate_m8_and_hit_summary(
            updated_read_dict, added_reads, read2blastm8,
            hit_summary, deduped_m8, refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_from_s3(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=True)
        evalue_type = 'raw'
        # The deuterostome db is optional.
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_from_s3(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=True)
        else:
            deuterostome_db = None
        m8.generate_taxon_count_json_from_m8(
            refined_m8, refined_hit_summary, evalue_type, db_type.upper(),
            lineage_db, deuterostome_db, refined_counts)

        # generate contig stats at genus/species level
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig, contig2lineage, updated_read_dict, added_reads, db_type)
        with open(contig_summary_json, 'w') as summary_out:
            json.dump(contig_taxon_summary, summary_out)

        # Upload additional file
        output_dir = os.path.dirname(contig_summary_json)
        contig2lineage_json = os.path.join(output_dir, f"contig2lineage.{db_type}.json")
        with open(contig2lineage_json, 'w') as lineage_out:
            json.dump(contig2lineage, lineage_out)

        self.additional_files_to_upload.append(top_entry_m8)
        self.additional_files_to_upload.append(contig2lineage_json)

Esempio n. 20

0

Mostra file

 def fetch_key(self, key_path_s3):
     """Fetch the key at `key_path_s3` into `self.output_dir_local`.

     Runs `chmod 400` on the fetched file and returns its local path.
     """
     local_key = fetch_from_s3(key_path_s3, self.output_dir_local)
     chmod_cmd = "chmod 400 {}".format(local_key)
     command.execute(chmod_cmd)
     return local_key

Esempio n. 21

0

Mostra file

    def run_chunk(self, part_suffix, remote_home_dir, remote_index_dir,
                  remote_work_dir, remote_username, input_files, key_path,
                  service, lazy_run):
        """Dispatch a chunk to worker machines for distributed GSNAP or RAPSearch
        group machines and handle their execution.

        The chunk id is whatever follows `part_suffix` in the first input file
        name. When `lazy_run` is true and the chunk result can already be
        fetched from S3, no alignment is run. Otherwise the alignment command is
        executed on a remote server, its output is checked to have 12 columns
        on every line (retried up to 2 times in total), and the result is
        copied to the local machine and uploaded to S3.

        Returns:
            Local path of the multihit m8 file for this chunk.

        Raises:
            AssertionError: if `service` is not "gsnap" or "rapsearch2", or if
                the remote output is still corrupt after the last try.
        """
        # Explicit raise instead of `assert`, so the check survives `python -O`.
        # AssertionError is kept for backward compatibility with callers.
        if service not in ("gsnap", "rapsearch2"):
            raise AssertionError("Unsupported service: {}".format(service))

        chunk_id = input_files[0].split(part_suffix)[-1]
        # TODO: Switch to python 3.6 which supports interpolation in string
        # formatting, and we will half the number of lines below.
        multihit_basename = "multihit-{service}-out{part_suffix}{chunk_id}.m8".format(
            service=service,
            part_suffix=part_suffix,
            chunk_id=chunk_id,
        )
        multihit_local_outfile = os.path.join(self.chunks_result_dir_local,
                                              multihit_basename)
        multihit_remote_outfile = os.path.join(remote_work_dir,
                                               multihit_basename)
        multihit_s3_outfile = os.path.join(self.chunks_result_dir_s3,
                                           multihit_basename)

        # One "aws s3 cp" per input file, to be run on the remote machine.
        base_str = "aws s3 cp --only-show-errors {s3_path}/{input_fa} {remote_work_dir}/{input_fa} "
        download_input_from_s3 = " ; ".join(
            base_str.format(s3_path=self.chunks_result_dir_s3,
                            input_fa=input_fa,
                            remote_work_dir=remote_work_dir)
            for input_fa in input_files)

        base_str = "mkdir -p {remote_work_dir} ; {download_input_from_s3} ; "
        if service == "gsnap":
            commands = base_str + "{remote_home_dir}/bin/gsnapl -A m8 --batch=0 --use-shared-memory=0 --gmap-mode=none --npaths=100 --ordered -t 36 --maxsearch=1000 --max-mismatches=40 -D {remote_index_dir} -d nt_k16 {remote_input_files} > {multihit_remote_outfile}"
        else:
            commands = base_str + "/usr/local/bin/rapsearch -d {remote_index_dir}/nr_rapsearch -e -6 -l 10 -a T -b 0 -v 50 -z 24 -q {remote_input_files} -o {multihit_remote_outfile}"

        commands = commands.format(
            remote_work_dir=remote_work_dir,
            download_input_from_s3=download_input_from_s3,
            remote_home_dir=remote_home_dir,
            remote_index_dir=remote_index_dir,
            remote_input_files=" ".join(remote_work_dir + "/" + input_fa
                                        for input_fa in input_files),
            multihit_remote_outfile=multihit_remote_outfile
            if service == "gsnap" else multihit_remote_outfile[:-3]
            # Strip the .m8 for RAPSearch as it adds that
        )

        # With lazy_run, a result that can be fetched from S3 is reused as is.
        if not lazy_run or not fetch_from_s3(multihit_s3_outfile,
                                             multihit_local_outfile):
            correct_number_of_output_columns = 12
            min_column_number = 0
            max_tries = 2
            try_number = 1
            instance_ip = ""

            def interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number):
                # An empty string means the output had no lines at all ("no
                # hits"), which is treated as valid output.
                if min_column_number_string:
                    min_column_number = float(min_column_number_string)
                    log.write(
                        "Try no. %d: Smallest number of columns observed in any line was %d"
                        % (try_number, min_column_number))
                else:
                    log.write("Try no. %d: No hits" % try_number)
                    min_column_number = correct_number_of_output_columns
                return min_column_number

            # Check if every row has correct number of columns (12) in the output
            # file on the remote machine
            while min_column_number != correct_number_of_output_columns \
                    and try_number <= max_tries:
                log.write("waiting for {} server for chunk {}".format(
                    service, chunk_id))
                max_concurrent = self.additional_attributes["max_concurrent"]
                environment = self.additional_attributes["environment"]

                instance_ip = server.wait_for_server_ip(
                    service, key_path, remote_username, environment,
                    max_concurrent, chunk_id)
                log.write("starting alignment for chunk %s on %s server %s" %
                          (chunk_id, service, instance_ip))
                command.execute(
                    command.remote(commands, key_path, remote_username,
                                   instance_ip))

                if service == "gsnap":
                    verification_command = "cat %s" % multihit_remote_outfile
                else:
                    # For rapsearch, first remove header lines starting with '#'
                    verification_command = "grep -v '^#' %s" % multihit_remote_outfile
                # Smallest per-line field count across the whole output file.
                verification_command += " | awk '{print NF}' | sort -nu | head -n 1"
                min_column_number_string = command.execute_with_output(
                    command.remote(verification_command, key_path,
                                   remote_username, instance_ip))
                min_column_number = interpret_min_column_number_string(
                    min_column_number_string, correct_number_of_output_columns,
                    try_number)
                try_number += 1

            # Move output from remote machine to local machine
            msg = "Chunk %s output corrupt; not copying to S3. Re-start pipeline " \
                  "to try again." % chunk_id
            # Must not be a bare `assert`: under `python -O` a corrupt chunk
            # would otherwise be copied and uploaded to S3.
            if min_column_number != correct_number_of_output_columns:
                raise AssertionError(msg)

            with self.iostream_upload:  # Limit concurrent uploads so as not to stall the pipeline.
                command.execute(
                    command.scp(key_path, remote_username, instance_ip,
                                multihit_remote_outfile,
                                multihit_local_outfile))
                command.execute(
                    "aws s3 cp --only-show-errors %s %s/" %
                    (multihit_local_outfile, self.chunks_result_dir_s3))
            log.write("finished alignment for chunk %s on %s server %s" %
                      (chunk_id, service, instance_ip))
        return multihit_local_outfile

Esempio n. 22

0

Mostra file

File: generate_phylo_tree.py Progetto: jonason91/idseq-workflows

    def get_accession_sequences(self, dest_dir, taxid, n=10):
        '''
        Pick, for each group of hitsummary2 files, the accession most often hit at `taxid`
        (matched at species, genus or family level), keep at most n of them, and write one
        NCBI NT reference fasta per kept accession into dest_dir.

        Returns a dict mapping each kept accession to the path of its fasta file.
        '''
        if n == 0:
            return {}

        # Retrieve files
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)

        # Choose accessions to process.
        hitsummary_groups = self.additional_attributes["hitsummary2_files"].values()
        accession_scores = defaultdict(int)
        # TODO: Address issue where accessions in nr can be chosen in the following code.
        # These accessions will not be found in nt_loc and will be subsequently omitted.
        for file_list in hitsummary_groups:
            hits_per_accession = defaultdict(int)
            for s3_file in file_list:
                local_basename = s3_file.replace("/", "-").replace(":", "-")
                local_path = os.path.join(self.output_dir_local, local_basename)
                local_file = s3.fetch_from_s3(s3_file, local_path)
                if local_file is None:
                    continue
                with open(local_file, 'r') as hitsummary:
                    for row in hitsummary:
                        columns = row.rstrip().split("\t")
                        # Columns 3..6: accession, then species/genus/family taxids.
                        acc, species_taxid, genus_taxid, family_taxid = columns[3:7]
                        lineage = (species_taxid, genus_taxid, family_taxid)
                        if any(int(hit_taxid) == taxid for hit_taxid in lineage):
                            hits_per_accession[acc] += 1
            if not hits_per_accession:
                continue
            # Only the best accession of each group contributes to the ranking.
            best_acc, best_count = max(hits_per_accession.items(), key=lambda item: item[1])
            accession_scores[best_acc] += best_count

        if len(accession_scores) > n:
            ranked = sorted(accession_scores.items(), key=lambda item: item[1], reverse=True)
            accession_scores = dict(ranked[:n])
        accessions = set(accession_scores.keys())

        # Make map of accession to sequence file
        accession2info = {acc: {} for acc in accessions}
        with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # Put 1 fasta file per accession into the destination directory
        accession_fastas = {}
        for acc, info in accession2info.items():
            if info.get('seq_file') is None:
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue
            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"

            link_cmd = command_patterns.SingleCommand(
                cmd="ln",
                args=["-s", info['seq_file'], local_fasta])
            command.execute(link_cmd)

            # Write a ">accession" header followed by the sequence to temp_file,
            # then move temp_file over the symlink.
            add_header_cmd = command_patterns.ShellScriptCommand(
                script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                named_args={
                    'acc': acc,
                    'local_fasta': local_fasta
                })
            command.execute_with_output(add_header_cmd)
            command.move_file('temp_file', local_fasta)

            accession_fastas[acc] = local_fasta

        # Return kept accessions and paths of their fasta files
        return accession_fastas