def run(self):
    (_align_m8, _deduped_m8, hit_summary, _orig_counts) = self.input_files_local[0]
    output_reference_fasta = self.output_files_local()[0]
    loc_db = s3.fetch_reference(
        self.additional_files["loc_db"],
        self.ref_dir_local,
        auto_unzip=True,  # This is default for references, but let's be explicit.
        allow_s3mi=ALLOW_S3MI)
    db_s3_path = self.additional_attributes["db"]
    # db_type = self.additional_attributes["db_type"]

    (_read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)

    with open_file_db_by_extension(loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as loc_dict:
        db_path = s3.fetch_reference(
            db_s3_path,
            self.ref_dir_local,
            auto_unzip=True,  # This is default for references, but let's be explicit.
            allow_s3mi=ALLOW_S3MI)
        self.download_ref_sequences_from_file(accession_dict, loc_dict, db_path,
                                              output_reference_fasta)
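# --- Illustrative sketch, not part of the step above ---
# download_ref_sequences_from_file is defined elsewhere in the codebase; this
# stand-in shows the idea: the loc dict maps an accession to byte offsets in
# the local alignment db, so each reference sequence can be copied out with a
# seek and a single read. The [start, header_len, seq_len] entry layout is an
# assumption for illustration, not the confirmed on-disk format.
def extract_ref_sequences(accession_ids, loc_dict, db_path, output_fasta):
    """Copy each accession's FASTA record out of db_path by byte offset."""
    with open(db_path, "rb") as db, open(output_fasta, "wb") as out:
        for accession_id in accession_ids:
            entry = loc_dict.get(accession_id)
            if entry is None:
                continue  # accession missing from this db build
            start, header_len, seq_len = entry
            db.seek(start)
            out.write(db.read(header_len + seq_len))  # header line + sequence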
def run(self):
    (_align_m8, _deduped_m8, hit_summary, _orig_counts) = self.input_files_local[0]
    output_reference_fasta = self.output_files_local()[0]
    loc_db = s3.fetch_from_s3(
        self.additional_files["loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)
    db_s3_path = self.additional_attributes["db"]
    db_type = self.additional_attributes["db_type"]
    lineage_db = s3.fetch_from_s3(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    if len(accession_dict) < MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD:
        self.download_ref_sequences_from_s3(accession_dict, output_reference_fasta,
                                            db_type, loc_db, db_s3_path)
    else:
        # Download the whole alignment db
        db_path = s3.fetch_from_s3(db_s3_path, self.ref_dir_local, allow_s3mi=True)
        self.download_ref_sequences_from_file(accession_dict, loc_db, db_path,
                                              output_reference_fasta)
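# --- Illustrative sketch, not part of the step above ---
# Rationale for the MIN_ACCESSIONS_WHOLE_DB_DOWNLOAD branch: with only a few
# accessions, one ranged S3 GET per accession is far cheaper than pulling the
# multi-gigabyte alignment db; past the threshold, the whole-db download
# amortizes better. The boto3 call below is a hypothetical stand-in for the
# ranged-read path, and the [start, header_len, seq_len] loc entry layout is
# assumed for illustration.
import boto3

def fetch_accession_record(bucket, key, loc_entry):
    """Fetch one accession's FASTA record via an HTTP Range request."""
    start, header_len, seq_len = loc_entry
    end = start + header_len + seq_len - 1  # Range bounds are inclusive
    resp = boto3.client("s3").get_object(
        Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
    return resp["Body"].read()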
def run(self):
    '''
        1. summarize hits
        2. build blast index
        3. blast assembled contigs to the index
        4. update the summary
    '''
    _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta, = self.input_files_local[2]
    duplicate_cluster_sizes_path, = self.input_files_local[3]

    blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()
    assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
    assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()

    db_type = self.additional_attributes["db_type"]
    no_assembled_results = (
        os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
        os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)

    if no_assembled_results:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                       reference_fasta, blast_top_m8)
    read2contig = {}
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                   duplicate_cluster_sizes_path)

    (updated_read_dict, read2blastm8,
     contig2lineage, added_reads) = self.update_read_dict(
        read2contig, blast_top_m8, read_dict, accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                     read2blastm8, hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi

    deuterostome_db = None
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small for s3mi

    blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
    taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

    taxon_whitelist = None
    if self.additional_attributes.get("use_taxon_whitelist"):
        taxon_whitelist = s3.fetch_reference(
            self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
            self.ref_dir_local)

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_count_json_from_m8",
                              "db_type": db_type,
                              "refined_counts": refined_counts_with_dcr}):
            m8.generate_taxon_count_json_from_m8(
                refined_m8, refined_hit_summary, db_type.upper(), lineage_db,
                deuterostome_db, taxon_whitelist, taxon_blacklist,
                duplicate_cluster_sizes_path, refined_counts_with_dcr)

    # Generate contig stats at genus/species level
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "generate_taxon_summary"}):
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig,
            contig2lineage,
            updated_read_dict,
            added_reads,
            db_type,
            duplicate_cluster_sizes_path,
            # same filter as applied in generate_taxon_count_json_from_m8
            m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
        )

    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "generate_taxon_summary_json",
                          "contig_summary_json": contig_summary_json}):
        with open(contig_summary_json, 'w') as contig_outf:
            json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(
        os.path.dirname(contig_summary_json),
        f"contig2lineage.{db_type}.json")
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "contig2lineage_json",
                          "contig2lineage_json": contig2lineage_json}):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)
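# --- Illustrative sketch, not part of the step above ---
# The inline comment in generate_taxon_summary's argument list matters: the
# filter from m8.build_should_keep_filter is passed in so contig-level counts
# drop exactly the taxa that generate_taxon_count_json_from_m8 drops. Below is
# a minimal stand-in with the same calling convention; the real helper also
# consults deuterostome_db, and everything here is assumed for illustration.
def build_should_keep_filter_sketch(deuterostome_db, taxon_whitelist, taxon_blacklist):
    blacklist = set(taxon_blacklist or [])
    whitelist = set(taxon_whitelist) if taxon_whitelist else None

    def should_keep(taxids):
        # Reject blacklisted taxa; if a whitelist exists, require membership.
        if blacklist.intersection(taxids):
            return False
        return whitelist is None or bool(whitelist.intersection(taxids))

    return should_keep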
def run(self):
    '''
        1. summarize hits
        2. build blast index
        3. blast assembled contigs to the index
        4. update the summary
    '''
    (align_m8, deduped_m8, hit_summary, orig_counts) = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta = self.input_files_local[2][0]
    (blast_m8, refined_m8, refined_hit_summary, refined_counts,
     contig_summary_json) = self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
       os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
        # No assembled results or refseq fasta available
        command.execute(f"echo ' ' > {blast_m8}")
        command.execute(f"cp {deduped_m8} {refined_m8}")
        command.execute(f"cp {hit_summary} {refined_hit_summary}")
        command.execute(f"cp {orig_counts} {refined_counts}")
        command.execute(f"echo '[]' > {contig_summary_json}")
        return

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    top_entry_m8 = blast_m8.replace(".m8", ".top.m8")
    PipelineStepBlastContigs.run_blast(assembled_contig, reference_fasta,
                                       db_type, blast_m8, top_entry_m8)
    read2contig = {}
    contig_stats = defaultdict(int)
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                   contig_stats)

    (updated_read_dict, read2blastm8,
     contig2lineage, added_reads) = self.update_read_dict(
        read2contig, top_entry_m8, read_dict, accession_dict)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                     read2blastm8, hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_from_s3(self.additional_files["lineage_db"],
                                  self.ref_dir_local,
                                  allow_s3mi=True)
    deuterostome_db = None
    evalue_type = 'raw'
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_from_s3(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=True)
    m8.generate_taxon_count_json_from_m8(refined_m8, refined_hit_summary,
                                         evalue_type, db_type.upper(),
                                         lineage_db, deuterostome_db,
                                         refined_counts)

    # Generate contig stats at genus/species level
    contig_taxon_summary = self.generate_taxon_summary(
        read2contig, contig2lineage, updated_read_dict, added_reads, db_type)
    with open(contig_summary_json, 'w') as contig_outf:
        json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(
        os.path.dirname(contig_summary_json),
        f"contig2lineage.{db_type}.json")
    with open(contig2lineage_json, 'w') as c2lf:
        json.dump(contig2lineage, c2lf)

    self.additional_files_to_upload.append(top_entry_m8)
    self.additional_files_to_upload.append(contig2lineage_json)
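# --- Illustrative sketch, not part of the step above ---
# The .m8 files passed around above are standard BLAST tabular output
# (-outfmt 6): twelve tab-separated columns per alignment. A minimal parser
# for one line; the field list is the documented BLAST default.
M8_FIELDS = ["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
             "qstart", "qend", "sstart", "send", "evalue", "bitscore"]

def parse_m8_line(line):
    """Map one outfmt-6 line to a dict, converting the numeric score fields."""
    row = dict(zip(M8_FIELDS, line.rstrip("\n").split("\t")))
    for key in ("pident", "evalue", "bitscore"):
        row[key] = float(row[key])
    return row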
def run(self):
    '''
        1. summarize hits
        2. build blast index
        3. blast assembled contigs to the index
        4. update the summary
    '''
    (_align_m8, deduped_m8, hit_summary, orig_counts) = self.input_files_local[0]
    assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
    reference_fasta = self.input_files_local[2][0]
    (blast_m8, refined_m8, refined_hit_summary, refined_counts,
     contig_summary_json, blast_top_m8) = self.output_files_local()
    db_type = self.additional_attributes["db_type"]

    if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
       os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
        # No assembled results or refseq fasta available.
        # Create empty output files.
        command.write_text_to_file(' ', blast_m8)
        command.write_text_to_file(' ', blast_top_m8)
        command.copy_file(deduped_m8, refined_m8)
        command.copy_file(hit_summary, refined_hit_summary)
        command.copy_file(orig_counts, refined_counts)
        command.write_text_to_file('[]', contig_summary_json)
        return  # return in the middle of the function

    (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
    PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                       reference_fasta, blast_top_m8)
    read2contig = {}
    contig_stats = defaultdict(int)
    PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                   contig_stats)

    (updated_read_dict, read2blastm8,
     contig2lineage, added_reads) = self.update_read_dict(
        read2contig, blast_top_m8, read_dict, accession_dict, db_type)
    self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                     read2blastm8, hit_summary, deduped_m8,
                                     refined_hit_summary, refined_m8)

    # Generating taxon counts based on updated results
    lineage_db = s3.fetch_reference(
        self.additional_files["lineage_db"],
        self.ref_dir_local,
        allow_s3mi=False)  # Too small to waste s3mi

    deuterostome_db = None
    evalue_type = 'raw'
    if self.additional_files.get("deuterostome_db"):
        deuterostome_db = s3.fetch_reference(
            self.additional_files["deuterostome_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small for s3mi

    with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_count_json_from_m8",
                              "db_type": db_type,
                              "refined_counts": refined_counts}):
            m8.generate_taxon_count_json_from_m8(
                refined_m8, refined_hit_summary, evalue_type, db_type.upper(),
                lineage_db, deuterostome_db, refined_counts)

    # Generate contig stats at genus/species level
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "generate_taxon_summary"}):
        contig_taxon_summary = self.generate_taxon_summary(
            read2contig, contig2lineage, updated_read_dict, added_reads,
            db_type)

    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "generate_taxon_summary_json",
                          "contig_summary_json": contig_summary_json}):
        with open(contig_summary_json, 'w') as contig_outf:
            json.dump(contig_taxon_summary, contig_outf)

    # Upload additional file
    contig2lineage_json = os.path.join(
        os.path.dirname(contig_summary_json),
        f"contig2lineage.{db_type}.json")
    with log.log_context("PipelineStepBlastContigs",
                         {"substep": "contig2lineage_json",
                          "contig2lineage_json": contig2lineage_json}):
        with open(contig2lineage_json, 'w') as c2lf:
            json.dump(contig2lineage, c2lf)

    self.additional_output_files_hidden.append(contig2lineage_json)
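# --- Illustrative sketch, not part of the step above ---
# The class-level cya_lock serializes generate_taxon_count_json_from_m8 across
# the NT and NR instances of this step, which otherwise run concurrently,
# presumably to bound peak memory and contention while lineage_db is in use.
# Below is a plain-threading analogue of the pattern; that TraceLock also
# logs how long a thread waited is an assumption based on its name.
import threading

class CountingStep:
    _count_lock = threading.Lock()  # shared by every instance of the step

    def count(self, label):
        with self._count_lock:  # only one memory-heavy substep at a time
            print(f"counting {label}")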