Example #1
    def run(self):
        (_align_m8, _deduped_m8, hit_summary,
         _orig_counts) = self.input_files_local[0]
        output_reference_fasta = self.output_files_local()[0]
        loc_db = s3.fetch_reference(
            self.additional_files["loc_db"],
            self.ref_dir_local,
            auto_unzip=True,  # This is default for references, but let's be explicit.
            allow_s3mi=ALLOW_S3MI)
        db_s3_path = self.additional_attributes["db"]
        # db_type = self.additional_attributes["db_type"]
        (_read_dict, accession_dict,
         _selected_genera) = m8.summarize_hits(hit_summary)
        with open_file_db_by_extension(
                loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as loc_dict:
            db_path = s3.fetch_reference(
                db_s3_path,
                self.ref_dir_local,
                auto_unzip=True,  # This is default for references, but let's be explicit.
                allow_s3mi=ALLOW_S3MI)
            self.download_ref_sequences_from_file(accession_dict, loc_dict,
                                                  db_path,
                                                  output_reference_fasta)
Example #2
    def run(self):
        ''' Run alignment remotely '''

        alignment_algorithm_inputs = PipelineStepRunAlignment._alignment_algorithm_inputs(
            self.input_files_local[0])
        duplicate_cluster_sizes_path, = self.input_files_local[1]
        output_m8, deduped_output_m8, output_hitsummary, output_counts_with_dcr_json = \
            self.output_files_local()
        assert output_counts_with_dcr_json.endswith(
            "_with_dcr.json"), self.output_files_local()

        if self.is_local_run:
            self.run_locally(
                alignment_algorithm_inputs[self.alignment_algorithm],
                output_m8)
        else:
            self.run_remotely(
                alignment_algorithm_inputs[self.alignment_algorithm],
                output_m8)

        # get database
        lineage_db = fetch_reference(self.additional_files["lineage_db"],
                                     self.ref_dir_local)
        accession2taxid_db = fetch_reference(
            self.additional_files["accession2taxid_db"],
            self.ref_dir_local,
            allow_s3mi=True)

        min_alignment_length = NT_MIN_ALIGNMENT_LEN if self.alignment_algorithm == 'gsnap' else 0
        m8.call_hits_m8(output_m8, lineage_db, accession2taxid_db,
                        deduped_output_m8, output_hitsummary,
                        min_alignment_length)

        db_type = 'NT' if self.alignment_algorithm == 'gsnap' else 'NR'

        deuterostome_db = None
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = fetch_reference(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=True)

        blacklist_s3_file = self.additional_files.get('taxon_blacklist',
                                                      DEFAULT_BLACKLIST_S3)
        taxon_blacklist = fetch_reference(blacklist_s3_file,
                                          self.ref_dir_local)

        taxon_whitelist = None
        if self.additional_attributes.get("use_taxon_whitelist"):
            taxon_whitelist = fetch_reference(
                self.additional_files.get("taxon_whitelist",
                                          DEFAULT_WHITELIST_S3),
                self.ref_dir_local)

        m8.generate_taxon_count_json_from_m8(deduped_output_m8,
                                             output_hitsummary, db_type,
                                             lineage_db, deuterostome_db,
                                             taxon_whitelist, taxon_blacklist,
                                             duplicate_cluster_sizes_path,
                                             output_counts_with_dcr_json)
    def create_taxon_count_file(self):
        # TODO: Can this be consolidated throughout the pipeline?
        # This setup is mostly repeated in three steps; the list of taxa does not seem to change.
        # (A consolidation sketch follows this example.)
        count_type = 'merged_NT_NR'
        lineage_db = fetch_reference(self.additional_files["lineage_db"],
                                     self.ref_dir_local,
                                     allow_s3mi=False)
        deuterostome_db = None
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = fetch_reference(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=False)  # Too small for s3mi
        taxon_whitelist = None
        if self.additional_attributes.get("use_taxon_whitelist"):
            taxon_whitelist = fetch_reference(
                self.additional_files.get("taxon_whitelist",
                                          DEFAULT_WHITELIST_S3),
                self.ref_dir_local)
        blacklist_s3_file = self.additional_files.get('taxon_blacklist',
                                                      DEFAULT_BLACKLIST_S3)
        taxon_blacklist = fetch_reference(blacklist_s3_file,
                                          self.ref_dir_local)
        cdhit_cluster_sizes_path = self.inputs.cluster_sizes_filename

        generate_taxon_count_json_from_m8(
            self.outputs.merged_m8_filename, self.outputs.merged_hit_filename,
            count_type, lineage_db, deuterostome_db, taxon_whitelist,
            taxon_blacklist, cdhit_cluster_sizes_path,
            self.outputs.merged_taxon_count_filename)
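Following up on the TODO above: a minimal consolidation sketch of the repeated reference-download setup. It is meant to sit alongside the step code, so it reuses the fetch_reference call and the DEFAULT_BLACKLIST_S3 / DEFAULT_WHITELIST_S3 constants already used in these examples; the helper name fetch_taxon_filter_refs is hypothetical, not part of the pipeline.

def fetch_taxon_filter_refs(additional_files, additional_attributes, ref_dir_local):
    """Hypothetical helper consolidating the reference downloads repeated across steps."""
    lineage_db = fetch_reference(additional_files["lineage_db"], ref_dir_local,
                                 allow_s3mi=False)
    deuterostome_db = None
    if additional_files.get("deuterostome_db"):
        deuterostome_db = fetch_reference(additional_files["deuterostome_db"],
                                          ref_dir_local, allow_s3mi=False)  # too small for s3mi
    taxon_whitelist = None
    if additional_attributes.get("use_taxon_whitelist"):
        taxon_whitelist = fetch_reference(
            additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3), ref_dir_local)
    taxon_blacklist = fetch_reference(
        additional_files.get("taxon_blacklist", DEFAULT_BLACKLIST_S3), ref_dir_local)
    return lineage_db, deuterostome_db, taxon_whitelist, taxon_blacklist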
Example #4
    def generate_mapped_reads_tsv(self):
        """Use bedtools to generate a table of mapped reads for each genome in the ARG ANNOT database.
            If a new resistance gene db is used, the .bed file will need to be updated manually."""
        bed_file_path = fetch_reference(
            self.additional_files["resist_genome_bed"],
            self.ref_dir_local,
            allow_s3mi=False)
        sample_bam_file_path = self.output_files_local()[5]

        tmp_sort_dir = os.path.join(self.output_dir_local, "tmp_sort")
        command.make_dirs(tmp_sort_dir)

        # Convert the sorted.bam output from SRST2 to the bed format, then sort the bed file.
        # This allows us to use the "sorted" mode of bedtools coverage, which is memory-efficient.
        # Otherwise, large sorted.bam files will cause our machines to run out of RAM.
        #
        # Note that despite being called "sorted.bam", the bam is not sorted the way we need it to be.
        #
        # env LC_ALL=C ensures that the sort command uses the same sort order on all machines.
        #
        # The -T flag with tmp_sort_dir ensures that we make tmp files inside /mnt, which is where our huge AWS volumes are mounted.
        # By default, the sort command creates temp files in /tmp, which has very little disk space.
        command.execute(
            command_patterns.ShellScriptCommand(
                script='''
                    bedtools bamtobed -i "$1" |
                    env LC_ALL=C sort -T "$2" -k1,1 -k2,2n |
                    bedtools coverage -sorted -a "$3" -b stdin > "$4";''',
                args=[
                    sample_bam_file_path, tmp_sort_dir, bed_file_path,
                    os.path.join(self.output_dir_local, MATCHED_READS_FILE)
                ]))

        command.remove_rf(tmp_sort_dir)
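The comment block above describes the bamtobed | sort | coverage chain that the shell script runs. Below is a standalone sketch of the same chain using plain subprocess, with placeholder file paths; it illustrates the approach and is not the step's own command helpers.

import os
import subprocess

def coverage_table(sample_bam, genome_bed, out_tsv, tmp_sort_dir):
    """Sketch: sorted.bam -> BED -> locale-stable sort -> bedtools coverage -sorted."""
    os.makedirs(tmp_sort_dir, exist_ok=True)
    env = dict(os.environ, LC_ALL="C")  # same sort order on every machine
    with open(out_tsv, "w") as out:
        bamtobed = subprocess.Popen(["bedtools", "bamtobed", "-i", sample_bam],
                                    stdout=subprocess.PIPE)
        sort = subprocess.Popen(["sort", "-T", tmp_sort_dir, "-k1,1", "-k2,2n"],
                                stdin=bamtobed.stdout, stdout=subprocess.PIPE, env=env)
        coverage = subprocess.Popen(["bedtools", "coverage", "-sorted",
                                     "-a", genome_bed, "-b", "stdin"],
                                    stdin=sort.stdout, stdout=out)
        # Close the parent's copies so SIGPIPE propagates if a downstream stage exits early.
        bamtobed.stdout.close()
        sort.stdout.close()
        coverage.wait()
        sort.wait()
        bamtobed.wait()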
Example #5
    def run(self):
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)

        genome_dir = fetch_reference(self.additional_files["gsnap_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        gsnap_base_dir = os.path.dirname(genome_dir)
        gsnap_index_name = os.path.basename(genome_dir)
        # Run Gsnap
        gsnap_params = [
            '-A', 'sam', '--batch=0', '--use-shared-memory=0',
            '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32,
            '--max-mismatches=40', '-D', gsnap_base_dir, '-d',
            gsnap_index_name, '-o', output_sam_file
        ] + input_fas
        command.execute(
            command_patterns.SingleCommand(cmd='gsnapl', args=gsnap_params))
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #6
    def run(self):
        # Setup
        if len(self.input_files_local) > 1:
            input_fa_name = self.input_files_local[0][0]
            hit_summary_files = {
                'NT': self.input_files_local[1][2],
                'NR': self.input_files_local[2][2]
            }
        else:
            # TODO(yf): Old implementation. TO BE DEPRECATED once 3.1 is fully deployed
            input_files = self.input_files_local[0]
            input_fa_name = input_files[0]
            hit_summary_files = {'NT': input_files[1], 'NR': input_files[2]}

        # Open lineage db
        lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                        self.ref_dir_local,
                                        allow_s3mi=True)

        # Get primary hit mappings
        valid_hits = PipelineStepGenerateTaxidFasta.parse_hits(
            hit_summary_files)

        with open(input_fa_name, 'rb') as input_fa, \
             open(self.output_files_local()[0], 'wb') as output_fa, \
             open_file_db_by_extension(lineage_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
            seq_name = input_fa.readline()
            seq_data = input_fa.readline()
            while len(seq_name) > 0 and len(seq_data) > 0:
                # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
                # :12720:8743/2"
                # Translate the read information into our custom format with fake
                # taxids at non-specific hit levels.
                annotated_read_id = seq_name.decode("utf-8").rstrip().lstrip('>')
                read_id = annotated_read_id.split(":", 4)[-1]

                nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    valid_hits, lineage_map, read_id, 'NR')
                nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    valid_hits, lineage_map, read_id, 'NT')

                fields = [
                    "family_nr", nr_taxid_family, "family_nt", nt_taxid_family
                ]
                fields += [
                    "genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus
                ]
                fields += [
                    "species_nr", nr_taxid_species, "species_nt",
                    nt_taxid_species
                ]
                fields += [annotated_read_id]
                new_read_name = ('>' + ':'.join(fields) + '\n').encode()

                output_fa.write(new_read_name)
                output_fa.write(seq_data)
                seq_name = input_fa.readline()
                seq_data = input_fa.readline()
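The header rewrite above is easy to check in isolation. A small sketch using only string operations; the read id is the one from the comment in the code, and the taxids (Hominidae/Homo/Homo sapiens) are merely illustrative, since real values come from get_valid_lineage.

annotated_read_id = "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"
# split(":", 4)[-1] drops the "NR::NT:<accession>" prefix and recovers the raw read id.
read_id = annotated_read_id.split(":", 4)[-1]
assert read_id == "NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"

# Illustrative taxids only; the step fills these in from the lineage map.
fields = ["family_nr", "9604", "family_nt", "9604",
          "genus_nr", "9605", "genus_nt", "9605",
          "species_nr", "9606", "species_nt", "9606",
          annotated_read_id]
new_read_name = '>' + ':'.join(fields) + '\n'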
Example #7
    def run(self):
        input_fa_name = self.input_files_local[0][0]
        if len(self.input_files_local) > 1:
            nt_hit_summary_path, nr_hit_summary_path = \
                self.input_files_local[1][2], self.input_files_local[2][2]
        else:
            # This is used in `short-read-mngs/experimental.wdl`
            nt_hit_summary_path, nr_hit_summary_path = \
                self.input_files_local[0][1], self.input_files_local[0][2]

        # Open lineage db
        lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                        self.ref_dir_local,
                                        allow_s3mi=True)

        with open(nt_hit_summary_path) as nt_hit_summary_f, open(
                nr_hit_summary_path) as nr_hit_summary_f:
            nr_hits_by_read_id = {
                row["read_id"]: (row["taxid"], row["level"])
                for row in HitSummaryMergedReader(nr_hit_summary_f)
            }
            nt_hits_by_read_id = {
                row["read_id"]: (row["taxid"], row["level"])
                for row in HitSummaryMergedReader(nt_hit_summary_f)
            }

        with open(self.output_files_local()[0], "w") as output_fa, \
             open_file_db_by_extension(lineage_db) as lineage_map:  # noqa
            for read in fasta.iterator(input_fa_name):
                # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
                # :12720:8743/2"
                # Translate the read information into our custom format with fake
                # taxids at non-specific hit levels.
                # TODO: (tmorse) fasta parsing
                annotated_read_id = read.header.lstrip('>')
                read_id = annotated_read_id.split(":", 4)[-1]

                nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    nr_hits_by_read_id, lineage_map, read_id)
                nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    nt_hits_by_read_id, lineage_map, read_id)

                fields = [
                    "family_nr", nr_taxid_family, "family_nt", nt_taxid_family
                ]
                fields += [
                    "genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus
                ]
                fields += [
                    "species_nr", nr_taxid_species, "species_nt",
                    nt_taxid_species
                ]
                fields += [annotated_read_id]
                new_read_name = ('>' + ':'.join(fields) + '\n')

                output_fa.write(new_read_name)
                output_fa.write(read.sequence + "\n")
Example #8
    def get_common_params(self):
        """Helper that gets srst2 parameters common to both paired and single reads."""
        # TODO: Why is this not fetch_reference? So it can be cached.
        db_file_path = fetch_reference(self.additional_files["resist_gene_db"], self.ref_dir_local, allow_s3mi=False)  # too small for s3mi
        min_cov = str(self.additional_attributes['min_cov'])
        # srst2 expects this to be a string; in the dag it could be passed in as a number
        n_threads = str(self.additional_attributes['n_threads'])
        return ['--min_coverage', min_cov, '--threads', n_threads,
                '--output', os.path.join(self.output_dir_local, 'output'), '--log', '--gene_db', db_file_path]
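A sketch of how these common parameters might be combined with the read inputs. It assumes srst2's --input_pe / --input_se flags; the build_srst2_args helper is hypothetical and not part of this step.

def build_srst2_args(common_params, input_files):
    """Hypothetical: prepend the read inputs to the shared srst2 parameters."""
    if len(input_files) == 2:
        reads = ['--input_pe', input_files[0], input_files[1]]  # paired-end reads
    else:
        reads = ['--input_se', input_files[0]]  # single-end reads
    return reads + common_params

# e.g. command.execute(command_patterns.SingleCommand(
#          cmd='srst2', args=build_srst2_args(self.get_common_params(), input_files)))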
Example #9
    def run(self):
        input_fas = self.input_fas()
        output_fas = self.output_files_local()
        genome_dir = fetch_reference(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)
        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]

        bowtie2_params = [
            '-q', '-x', genome_basename, '-f',
            '--very-sensitive-local', '-S', output_sam_file
        ]

        # --seed cannot be used with -p multithreading
        # We have observed the lack of multithreading resulting in
        # severe performance degradation in some cases. So for the
        # time being multithreading is being chosen over determinism.
        # To seed bowtie2 do something similar to:
        # bowtie2_params.extend(['--seed', '4'])
        bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(
                cmd='bowtie2',
                args=bowtie2_params
            )
        )
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
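The two rounds of os.path.splitext above recover the index prefix that bowtie2's -x option expects from a single .bt2 shard. A standalone sketch with an invented path mirroring the comment in the code.

import os

def bowtie2_index_prefix(bt2_file):
    """Strip the two trailing extensions from one index shard, e.g. '.3.bt2'."""
    without_bt2 = os.path.splitext(bt2_file)[0]  # drops '.bt2' (or '.bt2l')
    return os.path.splitext(without_bt2)[0]      # drops the shard number, e.g. '.3'

# Hypothetical path, mirroring the comment above:
assert bowtie2_index_prefix(
    "/mnt/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC.3.bt2"
) == "/mnt/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC"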
Example #10
    def run(self):
        input_fas = self.input_files_local[0][0:2]
        output_fas = self.output_files_local()
        genome_dir = fetch_reference(
            self.additional_files["bowtie2_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)
        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        genome_basename = command.glob(f"{genome_dir}/*.bt2*", max_results=1)[0]
        # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        genome_basename = os.path.splitext(os.path.splitext(genome_basename)[0])[0]

        bowtie2_params = [
            '-q', '-x', genome_basename, '-f',
            '--very-sensitive-local', '-S', output_sam_file
        ]

        seed = self.additional_attributes.get("random_seed")
        if seed:
            bowtie2_params.extend(['--seed', str(seed)])
        else:
            # Seed option won't work with -p threading option.
            bowtie2_params.extend(['-p', str(multiprocessing.cpu_count())])

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(
                cmd='bowtie2',
                args=bowtie2_params
            )
        )
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #11
    def run(self):
        input_fas = self.input_fas()
        output_fas = self.output_files_local()
        genome_dir = fetch_reference(self.additional_files["bowtie2_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        output_sam_file = os.path.join(
            self.output_dir_local,
            self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)
        # The file structure looks like
        # "bowtie2_genome/GRCh38.primary_assembly.genome.3.bt2"
        genome_basename = command.glob(f"{genome_dir}/*.bt2*",
                                       max_results=1)[0]
        # remove two extensions: ex: hg38_phiX_rRNA_mito_ERCC.3.bt2 -> hg38_phiX_rRNA_mito_ERCC
        genome_basename = os.path.splitext(
            os.path.splitext(genome_basename)[0])[0]

        bowtie2_params = [
            '-q', '-x', genome_basename, '-f', '--very-sensitive-local', '-S',
            output_sam_file
        ]

        # FIXME: https://jira.czi.team/browse/IDSEQ-2738
        #  We want to move towards a general randomness solution in which
        #  all randomness is seeded based on the content of the original input.
        #  This is currently introducing non-determinism and hard coding
        #  an arbitrary seed here shouldn't impact correctness.
        bowtie2_params.extend(['--seed', '4'])  # chosen by fair dice roll, guaranteed to be random

        if len(input_fas) == 2:
            bowtie2_params.extend(['-1', input_fas[0], '-2', input_fas[1]])
        else:
            bowtie2_params.extend(['-U', input_fas[0]])

        # Example:
        # bowtie2 -q -x /mnt/idseq/ref/bowtie2_genome/hg38_phiX_rRNA_mito_ERCC -f \
        #         --very-sensitive-local -S /mnt/idseq/results/589/bowtie2_human.sam \
        #         -p 32 \
        #         -1 /mnt/idseq/results/589/unmapped_human_1.fa -2 /mnt/idseq/results/589/unmapped_human_2.fa
        command.execute(
            command_patterns.SingleCommand(cmd='bowtie2', args=bowtie2_params))
        log.write("Finished Bowtie alignment.")

        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(output_sam_file,
                                                     output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(output_sam_file,
                                                       output_fas[0])
Example #12
    def run(self):
        ''' Run alignment remotely '''
        input_fas = self.get_input_fas()
        [output_m8, deduped_output_m8, output_hitsummary,
         output_counts_json] = self.output_files_local()
        service = self.additional_attributes["service"]
        assert service in ("gsnap", "rapsearch2")
        min_alignment_length = 36 if service == 'gsnap' else 0  # alignments < 36-NT are false positives

        self.run_remotely(input_fas, output_m8, service)

        # get database
        lineage_db = fetch_reference(self.additional_files["lineage_db"],
                                     self.ref_dir_local)
        accession2taxid_db = fetch_reference(
            self.additional_files["accession2taxid_db"],
            self.ref_dir_local,
            allow_s3mi=True)
        blacklist_s3_file = self.additional_attributes.get(
            'taxon_blacklist', DEFAULT_BLACKLIST_S3)
        taxon_blacklist = fetch_reference(blacklist_s3_file,
                                          self.ref_dir_local)
        m8.call_hits_m8(output_m8, lineage_db, accession2taxid_db,
                        deduped_output_m8, output_hitsummary,
                        min_alignment_length, taxon_blacklist)

        # check deuterostome
        deuterostome_db = None
        db_type = 'NT' if service == 'gsnap' else 'NR'
        evalue_type = 'log10' if service == 'rapsearch2' else 'raw'
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = fetch_reference(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=True)
        m8.generate_taxon_count_json_from_m8(deduped_output_m8,
                                             output_hitsummary, evalue_type,
                                             db_type, lineage_db,
                                             deuterostome_db,
                                             output_counts_json)
Example #13
    def run(self):
        """
        Trim any residual Illumina adapters.
        Discard any reads that become too short.

        See: http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/TrimmomaticManual_V0.32.pdf
        """
        input_files = self.input_files_local[0][0:2]
        output_files = self.output_files_local()
        is_paired = (len(input_files) == 2)
        adapter_fasta = s3.fetch_reference(
            self.additional_files["adapter_fasta"], self.ref_dir_local)

        if fasta.input_file_type(input_files[0]) != 'fastq':
            # Not fastq
            for in_file, out_file in zip(input_files, output_files):
                command.copy_file(in_file, out_file)
            return

        if is_paired:
            paired_arg = "PE"
            output_args = [
                output_files[0],  # R1, paired, to be kept
                f"{output_files[0]}__unpaired",  # R1, no longer paired, to be discarded
                output_files[1],  # R2, paired, to be kept
                f"{output_files[1]}__unpaired",  # R2, no longer paired, to be discarded
            ]
        else:
            paired_arg = "SE"
            output_args = output_files

        params = [
            "-jar",
            "/usr/local/bin/trimmomatic-0.38.jar",
            paired_arg,
            "-phred33",
            *input_files,
            *output_args,
            f"ILLUMINACLIP:{adapter_fasta}:2:30:10:8:true",
            # Remove Illumina adapters provided in the fasta file. Initially, look for seed matches
            # allowing maximally *2* mismatches. These seeds will be extended and clipped if in the case of paired end
            # reads a score of *30* is reached, or in the case of single ended reads a
            # score of *10*.
            # additional parameters: minAdapterLength = 8, keepBothReads = true; these are set to require pairs to be
            #    kept even when an adapter read-through occurs and R2 is a direct reverse complement of R1.
            "MINLEN:35"
            # Discard reads which are less than *75* bases long after these steps.
        ]
        command.execute(command_patterns.SingleCommand(cmd="java",
                                                       args=params))
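The ILLUMINACLIP argument above packs five settings into one colon-separated string. A small sketch that builds it from named values so the mapping described in the comments is explicit; the helper name is illustrative, not part of the step.

def illuminaclip_arg(adapter_fasta,
                     seed_mismatches=2,
                     palindrome_clip_threshold=30,
                     simple_clip_threshold=10,
                     min_adapter_length=8,
                     keep_both_reads=True):
    """Mirror of the ILLUMINACLIP:<fasta>:2:30:10:8:true argument used above."""
    return (f"ILLUMINACLIP:{adapter_fasta}:{seed_mismatches}:"
            f"{palindrome_clip_threshold}:{simple_clip_threshold}:"
            f"{min_adapter_length}:{str(keep_both_reads).lower()}")

# illuminaclip_arg("adapters.fasta") == "ILLUMINACLIP:adapters.fasta:2:30:10:8:true"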
Example #14
    def run(self):
        """
        Extract data from input files.
        Generate coverage viz data.
        Output JSON output files.
        """
        max_num_bins_coverage = self.additional_attributes.get(
            "max_num_bins_coverage", MAX_NUM_BINS_COVERAGE)
        num_accessions_per_taxon = self.additional_attributes.get(
            "num_accessions_per_taxon", NUM_ACCESSIONS_PER_TAXON)
        min_contig_size = self.additional_attributes.get(
            "min_contig_size", MIN_CONTIG_SIZE)

        info_db = s3.fetch_reference(self.additional_files["info_db"],
                                     self.ref_dir_local,
                                     allow_s3mi=True)
        with open_file_db_by_extension(
                info_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as info_dict:
            # Extract data from input files.
            (taxon_data, accession_data, contig_data,
             read_data) = self.prepare_data(self.input_files_local, info_dict,
                                            min_contig_size,
                                            num_accessions_per_taxon)

        # Generate the coverage viz data for each accession.
        coverage_viz_data = self.generate_coverage_viz_data(
            accession_data, contig_data, read_data, max_num_bins_coverage)

        # Generate the summary data, which contains a dict of all taxons for which coverage viz data is available.
        # For each taxon, summary data for the best accessions, plus the number of total accessions, is included.
        coverage_viz_summary_data = self.generate_coverage_viz_summary_data(
            taxon_data, accession_data, coverage_viz_data)

        coverage_viz_summary = self.output_files_local()[0]
        # Write the summary JSON file which is initially loaded on the report page.
        with open(coverage_viz_summary, 'w') as cvs:
            json.dump(coverage_viz_summary_data, cvs)

        # Create a separate coverage viz JSON file for each accession.
        # This file will be passed to the front-end when the user views that particular accession.
        coverage_viz_dir = os.path.join(self.output_dir_local, "coverage_viz")
        command.make_dirs(coverage_viz_dir)
        for accession_id in coverage_viz_data:
            upload_file = os.path.join(coverage_viz_dir,
                                       f"{accession_id}_coverage_viz.json")

            with open(upload_file, 'w') as uf:
                json.dump(coverage_viz_data[accession_id], uf)

        self.additional_output_folders_hidden.append(coverage_viz_dir)
Example #15
    def run(self):
        input_fas = self.input_fas()
        output_fas = self.output_files_local()
        output_sam_file = os.path.join(self.output_dir_local,
                                       self.additional_attributes["output_sam_file"])
        self.additional_output_files_hidden.append(output_sam_file)

        genome_dir = fetch_reference(self.additional_files["gsnap_genome"],
                                     self.ref_dir_local,
                                     allow_s3mi=True,
                                     auto_untar=True)
        gsnap_base_dir = os.path.dirname(genome_dir)
        gsnap_index_name = os.path.basename(genome_dir)
        # Hack to determine gsnap vs gsnapl
        error_message = subprocess.run(
                ['gsnapl', '-D', gsnap_base_dir, '-d', gsnap_index_name],
                input='>'.encode('utf-8'),
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            ).stderr
        gsnap_exe = "gsnap" if 'please run gsnap instead' in error_message.decode('utf-8') else "gsnapl"
        # Run Gsnap
        gsnap_params = [
            '-A', 'sam', '--batch=0', '--use-shared-memory=0',
            '--gmap-mode=all', '--npaths=1', '--ordered', '-t', 32,
            '--max-mismatches=40', '-D', gsnap_base_dir, '-d', gsnap_index_name,
            '-o',
            output_sam_file
        ] + input_fas
        command.execute(
            command_patterns.SingleCommand(
                cmd=gsnap_exe,
                args=gsnap_params
            )
        )
        log.write("Finished GSNAP alignment.")

        # Extract out unmapped files from sam
        if len(input_fas) == 2:
            convert.generate_unmapped_pairs_from_sam(
                output_sam_file, output_fas)
        else:
            convert.generate_unmapped_singles_from_sam(
                output_sam_file, output_fas[0])
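The gsnap-vs-gsnapl probe above can stand on its own. A minimal sketch of the same idea, assuming the same 'please run gsnap instead' message in gsnapl's stderr that the step checks for.

import subprocess

def choose_gsnap_executable(gsnap_base_dir, gsnap_index_name):
    """Return 'gsnap' if the index was built for gsnap, else 'gsnapl'."""
    probe = subprocess.run(
        ['gsnapl', '-D', gsnap_base_dir, '-d', gsnap_index_name],
        input=b'>',  # empty query, just enough to trigger index loading
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE)
    if 'please run gsnap instead' in probe.stderr.decode('utf-8'):
        return 'gsnap'
    return 'gsnapl'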
    def get_accession_sequences(self, dest_dir, taxid, n=10):
        '''
        Retrieve NCBI NT references for the most-matched accession in each hitsummary2 file, up to a maximum of n references.
        Write each reference to a separate fasta file.
        '''
        if n == 0:
            return {}

        # Retrieve files
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)

        # Choose accessions to process.
        s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
        accessions = defaultdict(lambda: 0)
        # TODO: Address issue where accessions in nr can be chosen in the following code.
        # These accessions will not be found in nt_loc and will be subsequently omitted.
        for file_list in s3_hitsummary2_files:
            tally = defaultdict(lambda: 0)
            for s3_file in file_list:
                local_basename = s3_file.replace("/", "-").replace(":", "-")
                local_file = s3.fetch_from_s3(
                    s3_file,
                    os.path.join(self.output_dir_local, local_basename))
                if local_file is None:
                    continue
                with open(local_file, 'r') as f:
                    for line in f:
                        acc, species_taxid, genus_taxid, family_taxid = line.rstrip().split("\t")[3:7]
                        if any(int(hit_taxid) == taxid for hit_taxid in [species_taxid, genus_taxid, family_taxid]):
                            tally[acc] += 1
            if tally:
                best_acc, max_count = max(tally.items(), key=lambda x: x[1])
                accessions[best_acc] += max_count
        if len(accessions) > n:
            accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
        accessions = set(accessions.keys())

        # Make map of accession to sequence file
        accession2info = dict((acc, {}) for acc in accessions)
        with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # Put 1 fasta file per accession into the destination directory
        accession_fastas = {}
        for acc, info in accession2info.items():
            if 'seq_file' not in info or info['seq_file'] is None:
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue
            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
            command.execute(
                command_patterns.SingleCommand(
                    cmd="ln",
                    args=[
                        "-s",
                        info['seq_file'],
                        local_fasta
                    ]
                )
            )
            command.execute_with_output(
                command_patterns.ShellScriptCommand(
                    script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                    named_args={
                        'acc': acc,
                        'local_fasta': local_fasta
                    }
                )
            )
            command.move_file('temp_file', local_fasta)

            accession_fastas[acc] = local_fasta

        # Return kept accessions and paths of their fasta files
        return accession_fastas
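The accession selection above is a per-file tally followed by a global top-n cut. A toy sketch of the same pattern with made-up accession ids; it ignores the S3 downloads and taxid filtering.

from collections import defaultdict

def top_n_accessions(per_file_hits, n=10):
    """per_file_hits: one list of accession ids per hitsummary2 file."""
    accessions = defaultdict(int)
    for hits in per_file_hits:
        tally = defaultdict(int)
        for acc in hits:
            tally[acc] += 1
        if tally:
            best_acc, max_count = max(tally.items(), key=lambda kv: kv[1])
            accessions[best_acc] += max_count  # only the best accession per file is kept
    ranked = sorted(accessions.items(), key=lambda kv: kv[1], reverse=True)[:n]
    return {acc for acc, _count in ranked}

# top_n_accessions([["A1", "A1", "A2"], ["A3"]], n=1) == {"A1"}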
    def run(self):
        '''
            1. summarize hits
            2. build blast index
            3. blast assembled contigs to the index
            4. update the summary
        '''
        _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta, = self.input_files_local[2]
        duplicate_cluster_sizes_path, = self.input_files_local[3]

        blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()

        assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
        assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()

        db_type = self.additional_attributes["db_type"]
        no_assembled_results = (
            os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
            os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)

        if no_assembled_results:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
            command.write_text_to_file('[]', contig_summary_json)
            return  # return in the middle of the function

        (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8)
        read2contig = {}
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, duplicate_cluster_sizes_path)

        (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
            read2contig, blast_top_m8, read_dict, accession_dict, db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads, read2blastm8,
                                         hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi

        deuterostome_db = None
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
                                                 self.ref_dir_local, allow_s3mi=False)  # Too small for s3mi

        blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
        taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

        taxon_whitelist = None
        if self.additional_attributes.get("use_taxon_whitelist"):
            taxon_whitelist = s3.fetch_reference(self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
                                                 self.ref_dir_local)

        with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
            with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_count_json_from_m8", "db_type": db_type, "refined_counts": refined_counts_with_dcr}):
                m8.generate_taxon_count_json_from_m8(refined_m8, refined_hit_summary, db_type.upper(),
                                                     lineage_db, deuterostome_db, taxon_whitelist, taxon_blacklist,
                                                     duplicate_cluster_sizes_path, refined_counts_with_dcr)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig,
                contig2lineage,
                updated_read_dict,
                added_reads,
                db_type,
                duplicate_cluster_sizes_path,
                # same filter as applied in generate_taxon_count_json_from_m8
                m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
            )

        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary_json", "contig_summary_json": contig_summary_json}):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json), f"contig2lineage.{db_type}.json")
        with log.log_context("PipelineStepBlastContigs", {"substep": "contig2lineage_json", "contig2lineage_json": contig2lineage_json}):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)
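generate_info_from_sam builds the read2contig mapping from the bowtie SAM. A simplified sketch of the idea based on the standard SAM layout (query name in column 1, reference name in column 3); it is not the pipeline's implementation and ignores duplicate cluster sizes.

def read_to_contig_from_sam(sam_path):
    """Map each aligned read id to the contig it mapped to, skipping headers and unmapped reads."""
    read2contig = {}
    with open(sam_path) as sam:
        for line in sam:
            if line.startswith('@'):  # header lines
                continue
            fields = line.split('\t')
            read_id, contig = fields[0], fields[2]
            if contig != '*':  # '*' means unmapped in SAM
                read2contig[read_id] = contig
    return read2contig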
Example #18
    def run(self):
        # Setup
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            auto_unzip=True,  # This is default for reference download, just being explicit.
            allow_s3mi=True)
        db_type = "nt"  # Only NT supported for now
        # TODO: Design a way to map in/out files more robustly, e.g. by name/type
        annotated_m8 = self.input_files_local[0][0]
        annotated_fasta = self.input_files_local[1][0]
        output_json_dir = os.path.join(self.output_dir_local, "align_viz")

        # Go through annotated_fasta with a db_type (NT/NR match). Infer the
        # family/genus/species info
        read2seq = PipelineStepGenerateAlignmentViz.parse_reads(
            annotated_fasta, db_type)
        log.write(f"Read to Seq dictionary size: {len(read2seq)}")

        groups, line_count = self.process_reads_from_m8_file(
            annotated_m8, read2seq)

        # If nt_db has not been downloaded yet, download it here
        if nt_db.startswith("s3://"):
            # TODO: Handle this better.  We might be poorly provisioned to allow s3mi speed
            # for this step, on the instance where it is running.
            nt_db = s3.fetch_reference(
                nt_db,
                self.ref_dir_local,
                auto_unzip=True,  # this is default for reference downloads, just being explicit
                allow_s3mi=True)  # s3mi is probably okay here because we tend to download only NT and little else in this stage

        with open_file_db_by_extension(
                nt_loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as nt_loc_dict:
            log.write("Getting sequences by accession list from file...")
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_file(
                groups, nt_loc_dict, nt_db)

        for _accession_id, ad in groups.items():
            ad['coverage_summary'] = PipelineStepGenerateAlignmentViz.calculate_alignment_coverage(
                ad)

        result_dict, to_be_deleted = self.populate_reference_sequences(groups)

        # Delete temp files
        def safe_multi_delete(files):
            for f in files:
                try:
                    os.remove(f)
                except:
                    pass

        deleter_thread = threading.Thread(target=safe_multi_delete,
                                          args=[to_be_deleted])
        deleter_thread.start()

        self.dump_align_viz_json(output_json_dir, db_type, result_dict)

        deleter_thread.join()

        # Write summary file
        summary_msg = f"Read2Seq Size: {len(read2seq)}, M8 lines {line_count}, " \
            f"{len(groups)} unique accession ids "
        summary_file_name = f"{output_json_dir}.summary"
        with open(summary_file_name, 'w') as summary_f:
            summary_f.write(summary_msg)
Example #19
    def run(self):
        """Run STAR to filter out host reads."""
        # Setup
        if self.sequence_input_files is not None and self.validated_input_counts_file is not None:
            validated_input_counts_file = self.validated_input_counts_file
            input_files = self.sequence_input_files
        else:
            validated_input_counts_file = self.input_files_local[0][0]
            input_files = self.input_files_local[0][1:3]

        num_inputs = len(input_files)
        scratch_dir = os.path.join(self.output_dir_local, "scratch_star")

        output_files_local = self.output_files_local()
        output_gene_file = self.additional_attributes.get("output_gene_file")
        output_log_file = self.additional_attributes.get("output_log_file")

        genome_dir = s3.fetch_reference(
            self.additional_files["star_genome"],
            self.ref_dir_local,
            allow_s3mi=True,
            auto_untar=True)

        # Check parts file for the number of partitioned indexes
        parts_file = os.path.join(genome_dir, "parts.txt")
        assert os.path.isfile(parts_file)
        with open(parts_file, 'rb') as parts_f:
            num_parts = int(parts_f.read())

        # Don't compute insert size metrics if the STAR index has more than one part
        #   Logic for combining BAM output from STAR or insert size metrics not implemented
        if self.collect_insert_size_metrics_for and num_parts != 1:
            log.write("Insert size metrics were expected to be collected for sample but were not because the STAR index has more than one part")
            self.collect_insert_size_metrics_for = None

        # Run STAR on each partition and save the unmapped read info
        unmapped = input_files

        with open(validated_input_counts_file) as validated_input_counts_f:
            validated_input_counts = json.load(validated_input_counts_f)

        use_starlong = validated_input_counts[vc.BUCKET_LONG] > 1 or \
            validated_input_counts[vc.BUCKET_TOO_LONG] > 1

        for part_idx in range(num_parts):
            tmp = f"{scratch_dir}/star-part-{part_idx}"
            genome_part = f"{genome_dir}/part-{part_idx}"
            count_genes = part_idx == 0
            self.run_star_part(tmp, genome_part, unmapped, count_genes, use_starlong)

            unmapped, too_discrepant = PipelineStepRunStar.sync_pairs(
                PipelineStepRunStar.unmapped_files_in(tmp, num_inputs))

            if too_discrepant:
                raise BrokenReadPairError("Broken pairs")

            # Run part 0 in gene-counting mode:
            # (a) ERCCs are doped into part 0 and we want their counts.
            # (b) If there is only 1 part (e.g. human), the host gene counts also
            # make sense.
            if part_idx == 0:
                gene_count_file = os.path.join(tmp, "ReadsPerGene.out.tab")
                if os.path.isfile(gene_count_file) and output_gene_file:
                    moved = os.path.join(self.output_dir_local,
                                         output_gene_file)
                    command.move_file(gene_count_file, moved)
                    self.additional_output_files_hidden.append(moved)

                log_file = os.path.join(tmp, "Log.final.out")
                if os.path.isfile(log_file) and output_log_file:
                    moved = os.path.join(self.output_dir_local, output_log_file)
                    command.move_file(log_file, moved)

                # STAR names the output BAM file Aligned.out.bam without TranscriptomeSAM,
                #  and Aligned.toTranscriptome.out.bam with TranscriptomeSAM; this doesn't
                #  appear to be configurable.
                is_dna = self.collect_insert_size_metrics_for == "dna"
                bam_filename = "Aligned.out.bam" if is_dna else "Aligned.toTranscriptome.out.bam"
                if self.collect_insert_size_metrics_for:
                    bam_path = os.path.join(tmp, bam_filename)

                    # If this file wasn't generated but self.collect_insert_size_metrics_for has a value
                    #   something unexpected has gone wrong
                    assert os.path.isfile(bam_path), \
                        f"Expected STAR to generate {bam_filename} but it was not found"
                    try:
                        self.collect_insert_size_metrics(tmp, bam_path, self.output_metrics_file, self.output_histogram_file)
                        if os.path.exists(self.output_metrics_file):
                            self.additional_output_files_visible.append(self.output_metrics_file)
                        else:
                            message = "expected picard to generate a metrics file but none was found"
                            log.write(message=message, warning=True)
                        if os.path.exists(self.output_histogram_file):
                            self.additional_output_files_visible.append(self.output_histogram_file)
                        else:
                            message = "expected picard to generate a histogram file but none was found"
                            log.write(message=message, warning=True)
                    except Exception as e:
                        log.write(message=f"encountered error while running picard: {type(e).__name__}: {e}", warning=True)

        # Sort unmapped files for deterministic output
        for unmapped_file in unmapped:
            sort_fastx_by_entry_id(unmapped_file)
        # Cleanup
        for src, dst in zip(unmapped, output_files_local):
            command.move_file(src, dst)    # Move out of scratch dir
        command.remove_rf(f"{scratch_dir}/*")
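The loop above chains STAR over a partitioned host index: each part aligns only the reads left unmapped by the previous part. A schematic sketch of that chaining; run_star_part and unmapped_files_in are passed in as stand-ins for the step's helpers, with simplified signatures rather than the real ones.

def filter_through_parts(scratch_dir, genome_dir, num_parts, input_files,
                         run_star_part, unmapped_files_in):
    """Schematic chaining: each index part sees only reads the previous part left unmapped."""
    unmapped = input_files
    for part_idx in range(num_parts):
        tmp = f"{scratch_dir}/star-part-{part_idx}"
        genome_part = f"{genome_dir}/part-{part_idx}"
        # Only part 0 counts genes (ERCCs are doped into part 0).
        run_star_part(tmp, genome_part, unmapped, part_idx == 0)
        # Whatever is still unmapped becomes the input to the next part.
        unmapped = unmapped_files_in(tmp, len(input_files))
    return unmapped  # reads that matched no part of the host index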
Example #20
    def run(self):
        '''
            1. summarize hits
            2. build blast index
            3. blast assembled contigs to the index
            4. update the summary
        '''
        (_align_m8, deduped_m8, hit_summary,
         orig_counts) = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = \
            self.input_files_local[1]
        reference_fasta = self.input_files_local[2][0]

        (blast_m8, refined_m8, refined_hit_summary, refined_counts,
         contig_summary_json, blast_top_m8) = self.output_files_local()
        db_type = self.additional_attributes["db_type"]
        if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
           os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts, refined_counts)
            command.write_text_to_file('[]', contig_summary_json)
            return  # return in the middle of the function

        (read_dict, accession_dict,
         _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                           reference_fasta, blast_top_m8)
        read2contig = {}
        contig_stats = defaultdict(int)
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                       contig_stats)

        (updated_read_dict, read2blastm8, contig2lineage,
         added_reads) = self.update_read_dict(read2contig, blast_top_m8,
                                              read_dict, accession_dict,
                                              db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                         read2blastm8, hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi
        deuterostome_db = None
        evalue_type = 'raw'
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=False)  # Too small for s3mi
        with TraceLock("PipelineStepBlastContigs-CYA",
                       PipelineStepBlastContigs.cya_lock,
                       debug=False):
            with log.log_context(
                    "PipelineStepBlastContigs", {
                        "substep": "generate_taxon_count_json_from_m8",
                        "db_type": db_type,
                        "refined_counts": refined_counts
                    }):
                m8.generate_taxon_count_json_from_m8(
                    refined_m8, refined_hit_summary, evalue_type,
                    db_type.upper(), lineage_db, deuterostome_db,
                    refined_counts)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig, contig2lineage, updated_read_dict, added_reads,
                db_type)

        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "generate_taxon_summary_json",
                    "contig_summary_json": contig_summary_json
                }):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(
            os.path.dirname(contig_summary_json),
            f"contig2lineage.{db_type}.json")
        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "contig2lineage_json",
                    "contig2lineage_json": contig2lineage_json
                }):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)