Example #1
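This variant builds a reference FASTA from the accessions recorded in the hit summary: it fetches a location index (loc_db) and the alignment database itself from S3, then extracts the matching sequences into the step's output file via download_ref_sequences_from_file.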
 def run(self):
     (_align_m8, _deduped_m8, hit_summary,
      _orig_counts) = self.input_files_local[0]
     output_reference_fasta = self.output_files_local()[0]
     loc_db = s3.fetch_reference(
         self.additional_files["loc_db"],
         self.ref_dir_local,
         auto_unzip=True,  # This is the default for references, but let's be explicit.
         allow_s3mi=ALLOW_S3MI)
     db_s3_path = self.additional_attributes["db"]
     # db_type = self.additional_attributes["db_type"]
     (_read_dict, accession_dict,
      _selected_genera) = m8.summarize_hits(hit_summary)
     with open_file_db_by_extension(
             loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as loc_dict:
         db_path = s3.fetch_reference(
             db_s3_path,
             self.ref_dir_local,
             auto_unzip=True,  # This is the default for references, but let's be explicit.
             allow_s3mi=ALLOW_S3MI)
         self.download_ref_sequences_from_file(accession_dict, loc_dict,
                                               db_path,
                                               output_reference_fasta)
Example #2
 def run(self):
     (_align_m8, _deduped_m8, hit_summary, _orig_counts) = self.input_files_local[0]
     output_reference_fasta = self.output_files_local()[0]
     loc_db = s3.fetch_from_s3(
         self.additional_files["loc_db"],
         self.ref_dir_local,
         allow_s3mi=True)
     db_s3_path = self.additional_attributes["db"]
     db_type = self.additional_attributes["db_type"]
     lineage_db = s3.fetch_from_s3(
         self.additional_files["lineage_db"],
         self.ref_dir_local,
         allow_s3mi=True)
     (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
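     # Heuristic: with only a few hit accessions it is cheaper to fetch them
     # individually from S3 than to download the entire alignment database.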
     if len(accession_dict) < MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD:
         self.download_ref_sequences_from_s3(accession_dict, output_reference_fasta, db_type,
                                             loc_db, db_s3_path)
     else:
         # download the whole alignment db
         db_path = s3.fetch_from_s3(db_s3_path, self.ref_dir_local, allow_s3mi=True)
         self.download_ref_sequences_from_file(accession_dict, loc_db, db_path, output_reference_fasta)
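
The branch above encodes a transfer-size trade-off: fetch a handful of accessions individually, or pull the whole database once. A minimal standalone sketch of the same idea (not the pipeline's implementation; the fetch helpers and the threshold value here are hypothetical):

MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD = 300  # assumed threshold, not the pipeline's real value

def fetch_reference_sequences(accession_ids, fetch_one_accession, fetch_whole_db):
    """Map each accession to a local path, choosing the cheaper download strategy."""
    if len(accession_ids) < MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD:
        # Few hits: individual fetches move far less data than the full database.
        return {acc: fetch_one_accession(acc) for acc in accession_ids}
    # Many hits: a single bulk download amortizes per-request overhead.
    db_path = fetch_whole_db()
    return {acc: db_path for acc in accession_ids}
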
Example #3
    def run(self):
        '''
            1. summarize hits
            2. build a blast index
            3. blast assembled contigs to the index
            4. update the summary
        '''
        _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta, = self.input_files_local[2]
        duplicate_cluster_sizes_path, = self.input_files_local[3]

        blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()

        assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
        assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()

        db_type = self.additional_attributes["db_type"]
        no_assembled_results = (
            os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
            os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)

        if no_assembled_results:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
            command.write_text_to_file('[]', contig_summary_json)
            return  # Early exit: nothing to refine without assembled results.

        (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8)
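        # Map each read back to the contig it was assembled into, based on the
        # bowtie alignment recorded in bowtie_sam.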
        read2contig = {}
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, duplicate_cluster_sizes_path)

        (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
            read2contig, blast_top_m8, read_dict, accession_dict, db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads, read2blastm8,
                                         hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi

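        # The deuterostome list, taxon blacklist, and optional whitelist fetched
        # below all feed the should-keep filter applied during counting.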
        deuterostome_db = None
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
                                                 self.ref_dir_local, allow_s3mi=False)  # Too small for s3mi

        blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
        taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

        taxon_whitelist = None
        if self.additional_attributes.get("use_taxon_whitelist"):
            taxon_whitelist = s3.fetch_reference(self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
                                                 self.ref_dir_local)

        with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
            with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_count_json_from_m8", "db_type": db_type, "refined_counts": refined_counts_with_dcr}):
                m8.generate_taxon_count_json_from_m8(refined_m8, refined_hit_summary, db_type.upper(),
                                                     lineage_db, deuterostome_db, taxon_whitelist, taxon_blacklist,
                                                     duplicate_cluster_sizes_path, refined_counts_with_dcr)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig,
                contig2lineage,
                updated_read_dict,
                added_reads,
                db_type,
                duplicate_cluster_sizes_path,
                # same filter as applied in generate_taxon_count_json_from_m8
                m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
            )

        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary_json", "contig_summary_json": contig_summary_json}):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json), f"contig2lineage.{db_type}.json")
        with log.log_context("PipelineStepBlastContigs", {"substep": "contig2lineage_json", "contig2lineage_json": contig2lineage_json}):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)
Example #4
    def run(self):
        '''
            1. summarize hits
            2. build a blast index
            3. blast assembled contigs to the index
            4. update the summary
        '''
        (align_m8, deduped_m8, hit_summary,
         orig_counts) = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta = self.input_files_local[2][0]

        (blast_m8, refined_m8, refined_hit_summary, refined_counts,
         contig_summary_json) = self.output_files_local()
        db_type = self.additional_attributes["db_type"]
        if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
           os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
            # No assembled results or refseq fasta available
            command.execute(f"echo ' ' > {blast_m8}")
            command.execute(f"cp {deduped_m8} {refined_m8}")
            command.execute(f"cp {hit_summary} {refined_hit_summary}")
            command.execute(f"cp {orig_counts} {refined_counts}")
            command.execute(f"echo '[]' > {contig_summary_json}")
            return

        (read_dict, accession_dict,
         _selected_genera) = m8.summarize_hits(hit_summary)
        top_entry_m8 = blast_m8.replace(".m8", ".top.m8")
        PipelineStepBlastContigs.run_blast(assembled_contig, reference_fasta,
                                           db_type, blast_m8, top_entry_m8)
        read2contig = {}
        contig_stats = defaultdict(int)
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                       contig_stats)

        (updated_read_dict, read2blastm8, contig2lineage,
         added_reads) = self.update_read_dict(read2contig, top_entry_m8,
                                              read_dict, accession_dict)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                         read2blastm8, hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_from_s3(self.additional_files["lineage_db"],
                                      self.ref_dir_local,
                                      allow_s3mi=True)
        deuterostome_db = None
        evalue_type = 'raw'
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_from_s3(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=True)
        m8.generate_taxon_count_json_from_m8(refined_m8,
                                             refined_hit_summary, evalue_type,
                                             db_type.upper(), lineage_db,
                                             deuterostome_db, refined_counts)
        # generate contig stats at genus/species level
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig, contig2lineage, updated_read_dict, added_reads,
            db_type)
        with open(contig_summary_json, 'w') as contig_outf:
            json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(
            os.path.dirname(contig_summary_json),
            f"contig2lineage.{db_type}.json")
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

        self.additional_files_to_upload.append(top_entry_m8)
        self.additional_files_to_upload.append(contig2lineage_json)
Example #5
    def run(self):
        '''
            1. summarize hits
            2. build a blast index
            3. blast assembled contigs to the index
            4. update the summary
        '''
        (_align_m8, deduped_m8, hit_summary,
         orig_counts) = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta = self.input_files_local[2][0]

        (blast_m8, refined_m8, refined_hit_summary, refined_counts,
         contig_summary_json, blast_top_m8) = self.output_files_local()
        db_type = self.additional_attributes["db_type"]
        if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
           os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts, refined_counts)
            command.write_text_to_file('[]', contig_summary_json)
            return  # Early exit: nothing to refine without assembled results.

        (read_dict, accession_dict,
         _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                           reference_fasta, blast_top_m8)
        read2contig = {}
        contig_stats = defaultdict(int)
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                       contig_stats)

        (updated_read_dict, read2blastm8, contig2lineage,
         added_reads) = self.update_read_dict(read2contig, blast_top_m8,
                                              read_dict, accession_dict,
                                              db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                         read2blastm8, hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi
        deuterostome_db = None
        evalue_type = 'raw'
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=False)  # Too small for s3mi
        with TraceLock("PipelineStepBlastContigs-CYA",
                       PipelineStepBlastContigs.cya_lock,
                       debug=False):
            with log.log_context(
                    "PipelineStepBlastContigs", {
                        "substep": "generate_taxon_count_json_from_m8",
                        "db_type": db_type,
                        "refined_counts": refined_counts
                    }):
                m8.generate_taxon_count_json_from_m8(
                    refined_m8, refined_hit_summary, evalue_type,
                    db_type.upper(), lineage_db, deuterostome_db,
                    refined_counts)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig, contig2lineage, updated_read_dict, added_reads,
                db_type)

        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "generate_taxon_summary_json",
                    "contig_summary_json": contig_summary_json
                }):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(
            os.path.dirname(contig_summary_json),
            f"contig2lineage.{db_type}.json")
        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "contig2lineage_json",
                    "contig2lineage_json": contig2lineage_json
                }):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)
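
Examples #3 through #5 appear to be successive revisions of the same PipelineStepBlastContigs.run: #4 shells out with command.execute and passes an evalue_type to the taxon counter, #5 swaps the shell calls for command.copy_file/command.write_text_to_file and serializes the counting step behind a TraceLock, and #3 additionally threads duplicate-cluster sizes and taxon blacklist/whitelist filtering through the count and summary generation.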