def run(self):
    """
    Write the reference sequences of all hit accessions to the output FASTA.

    The hit summary (third entry of the first input file group) is reduced
    to an accession dictionary by ``m8.summarize_hits``. The location
    database and the sequence database are fetched with
    ``s3.fetch_reference`` and handed, together with the accession
    dictionary, to ``self.download_ref_sequences_from_file``.
    """
    # The first input group must contain exactly four files; only the hit
    # summary is used by this step.
    _, _, hit_summary_path, _ = self.input_files_local[0]
    reference_fasta_out = self.output_files_local()[0]

    loc_db_path = s3.fetch_reference(
        self.additional_files["loc_db"],
        self.ref_dir_local,
        auto_unzip=True,  # This is default for references, but let's be explicit.
        allow_s3mi=ALLOW_S3MI)
    db_s3_path = self.additional_attributes["db"]

    _, accession_dict, _ = m8.summarize_hits(hit_summary_path)

    with open_file_db_by_extension(
            loc_db_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as loc_dict:
        # The sequence database is only fetched once the location db is open.
        db_path = s3.fetch_reference(
            db_s3_path,
            self.ref_dir_local,
            auto_unzip=True,  # This is default for references, but let's be explicit
            allow_s3mi=ALLOW_S3MI)
        self.download_ref_sequences_from_file(
            accession_dict, loc_dict, db_path, reference_fasta_out)
def run(self):
    """
    Rewrite the input FASTA so each header carries its NR/NT lineage taxids.

    The input is read as consecutive (header line, sequence line) pairs.
    Every header is replaced by a colon-joined record of the family, genus
    and species taxids for NR and NT followed by the original annotated read
    id; the sequence line is copied through untouched.
    """
    # Setup
    if len(self.input_files_local) <= 1:
        # TODO(yf): Old implementation. TO BE DEPRECATED once 3.1 is fully deployed
        legacy_inputs = self.input_files_local[0]
        input_fa_name = legacy_inputs[0]
        hit_summary_files = {'NT': legacy_inputs[1], 'NR': legacy_inputs[2]}
    else:
        input_fa_name = self.input_files_local[0][0]
        hit_summary_files = {
            'NT': self.input_files_local[1][2],
            'NR': self.input_files_local[2][2]
        }

    # Open lineage db
    lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                    self.ref_dir_local,
                                    allow_s3mi=True)

    # Get primary hit mappings
    valid_hits = PipelineStepGenerateTaxidFasta.parse_hits(hit_summary_files)

    with open(input_fa_name, 'rb') as input_fa, \
            open(self.output_files_local()[0], 'wb') as output_fa, \
            open_file_db_by_extension(lineage_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        # Zipping the handle with itself yields (header, sequence) pairs and
        # stops as soon as either line of a pair is missing.
        for header_line, sequence_line in zip(input_fa, input_fa):
            # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
            # :12720:8743/2"
            # Translate the read information into our custom format with fake
            # taxids at non-specific hit levels.
            annotated_read_id = header_line.decode("utf-8").rstrip().lstrip('>')
            read_id = annotated_read_id.split(":", 4)[-1]

            nr_species, nr_genus, nr_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                valid_hits, lineage_map, read_id, 'NR')
            nt_species, nt_genus, nt_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                valid_hits, lineage_map, read_id, 'NT')

            header_fields = [
                "family_nr", nr_family, "family_nt", nt_family,
                "genus_nr", nr_genus, "genus_nt", nt_genus,
                "species_nr", nr_species, "species_nt", nt_species,
                annotated_read_id,
            ]
            new_header = '>' + ':'.join(header_fields) + '\n'

            output_fa.write(new_header.encode())
            output_fa.write(sequence_line)
def run(self):
    """
    Rewrite the input FASTA so each header carries its NR/NT lineage taxids.

    Hit summaries for NT and NR are loaded into ``read_id -> (taxid, level)``
    tables. Each read yielded by ``fasta.iterator`` then gets a new header
    made of the family, genus and species taxids for NR and NT followed by
    the original annotated read id.
    """
    inputs = self.input_files_local
    input_fa_name = inputs[0][0]
    if len(inputs) > 1:
        nt_hit_summary_path = inputs[1][2]
        nr_hit_summary_path = inputs[2][2]
    else:
        # This is used in `short-read-mngs/experimental.wdl`
        nt_hit_summary_path = inputs[0][1]
        nr_hit_summary_path = inputs[0][2]

    # Open lineage db
    lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                    self.ref_dir_local,
                                    allow_s3mi=True)

    def load_hits(summary_handle):
        # Later rows for the same read_id overwrite earlier ones.
        hits = {}
        for row in HitSummaryMergedReader(summary_handle):
            hits[row["read_id"]] = (row["taxid"], row["level"])
        return hits

    with open(nt_hit_summary_path) as nt_hit_summary_f, \
            open(nr_hit_summary_path) as nr_hit_summary_f:
        nr_hits_by_read_id = load_hits(nr_hit_summary_f)
        nt_hits_by_read_id = load_hits(nt_hit_summary_f)

    with open(self.output_files_local()[0], "w") as output_fa, \
            open_file_db_by_extension(lineage_db) as lineage_map:  # noqa
        for read in fasta.iterator(input_fa_name):
            # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
            # :12720:8743/2"
            # Translate the read information into our custom format with fake
            # taxids at non-specific hit levels.
            # TODO: (tmorse) fasta parsing
            annotated_read_id = read.header.lstrip('>')
            read_id = annotated_read_id.split(":", 4)[-1]

            nr_species, nr_genus, nr_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                nr_hits_by_read_id, lineage_map, read_id)
            nt_species, nt_genus, nt_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                nt_hits_by_read_id, lineage_map, read_id)

            header_fields = [
                "family_nr", nr_family, "family_nt", nt_family,
                "genus_nr", nr_genus, "genus_nt", nt_genus,
                "species_nr", nr_species, "species_nt", nt_species,
                annotated_read_id,
            ]

            output_fa.write('>' + ':'.join(header_fields) + '\n')
            output_fa.write(read.sequence + "\n")
def run(self):
    """
    Build the coverage-viz JSON outputs.

    Steps:
      1. Extract taxon, accession, contig and read data from the inputs.
      2. Generate coverage viz data per accession and the summary data.
      3. Write the summary JSON to the first output file and one JSON file
         per accession into a ``coverage_viz`` folder under the output dir.
    """
    attributes = self.additional_attributes
    max_num_bins_coverage = attributes.get("max_num_bins_coverage", MAX_NUM_BINS_COVERAGE)
    num_accessions_per_taxon = attributes.get("num_accessions_per_taxon", NUM_ACCESSIONS_PER_TAXON)
    min_contig_size = attributes.get("min_contig_size", MIN_CONTIG_SIZE)

    info_db = s3.fetch_reference(self.additional_files["info_db"],
                                 self.ref_dir_local,
                                 allow_s3mi=True)
    with open_file_db_by_extension(info_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as info_dict:
        # Extract data from input files.
        taxon_data, accession_data, contig_data, read_data = self.prepare_data(
            self.input_files_local, info_dict, min_contig_size, num_accessions_per_taxon)

    # Generate the coverage viz data for each accession.
    coverage_viz_data = self.generate_coverage_viz_data(
        accession_data, contig_data, read_data, max_num_bins_coverage)

    # Generate the summary data, which contains a dict of all taxons for which
    # coverage viz data is available. For each taxon, summary data for the best
    # accessions, plus the number of total accessions, is included.
    coverage_viz_summary_data = self.generate_coverage_viz_summary_data(
        taxon_data, accession_data, coverage_viz_data)

    # Write the summary JSON file which is initially loaded on the report page.
    summary_path = self.output_files_local()[0]
    with open(summary_path, 'w') as summary_out:
        json.dump(coverage_viz_summary_data, summary_out)

    # Create a separate coverage viz JSON file for each accession.
    # This file will be passed to the front-end when the user views that
    # particular accession.
    coverage_viz_dir = os.path.join(self.output_dir_local, "coverage_viz")
    command.make_dirs(coverage_viz_dir)
    for accession_id in coverage_viz_data:
        accession_json = os.path.join(coverage_viz_dir, f"{accession_id}_coverage_viz.json")
        with open(accession_json, 'w') as accession_out:
            json.dump(coverage_viz_data[accession_id], accession_out)

    self.additional_output_folders_hidden.append(coverage_viz_dir)
def run(self):
    """
    Generate the alignment-viz JSON files and a one-line summary file.

    Steps:
      1. Fetch the NT location db and parse the annotated FASTA into a
         read -> sequence dictionary.
      2. Group the annotated m8 rows via ``process_reads_from_m8_file``.
      3. Fetch NT itself if ``nt_db`` is still an ``s3://`` path, then pull
         the reference sequences for every group.
      4. Compute the coverage summary per group, populate the reference
         sequences and dump the JSON under ``<output_dir>/align_viz``.
      5. Delete temporary files on a background thread while the JSON is
         written, then write ``<output_dir>/align_viz.summary``.
    """
    # Setup
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_reference(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        auto_unzip=True,  # This is default for reference download, just being explicit.
        allow_s3mi=True)
    db_type = "nt"  # Only NT supported for now

    # TODO: Design a way to map in/out files more robustly, e.g. by name/type
    annotated_m8 = self.input_files_local[0][0]
    annotated_fasta = self.input_files_local[1][0]
    output_json_dir = os.path.join(self.output_dir_local, "align_viz")

    # Go through annotated_fasta with a db_type (NT/NR match). Infer the
    # family/genus/species info
    read2seq = PipelineStepGenerateAlignmentViz.parse_reads(
        annotated_fasta, db_type)
    log.write(f"Read to Seq dictionary size: {len(read2seq)}")

    groups, line_count = self.process_reads_from_m8_file(
        annotated_m8, read2seq)

    # If nt_db is not yet downloaded, then do download nt_db here
    if nt_db.startswith("s3://"):
        # TODO: Handle this better. We might be poorly provisioned to allow s3mi speed
        # for this step, on the instance where it is running.
        nt_db = s3.fetch_reference(
            nt_db,
            self.ref_dir_local,
            auto_unzip=True,  # this is default for reference uploads, just being explicit
            allow_s3mi=True
        )  # s3mi probably okay here because we tend to download only NT and little else in this stage

    with open_file_db_by_extension(
            nt_loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as nt_loc_dict:
        log.write("Getting sequences by accession list from file...")
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_file(
            groups, nt_loc_dict, nt_db)

    for ad in groups.values():
        ad['coverage_summary'] = PipelineStepGenerateAlignmentViz.calculate_alignment_coverage(
            ad)

    result_dict, to_be_deleted = self.populate_reference_sequences(groups)

    # Delete temp files
    def safe_multi_delete(files):
        # Best-effort cleanup: one failing file must not stop the others.
        for f in files:
            try:
                os.remove(f)
            except FileNotFoundError:
                # Already gone, nothing to clean up.
                pass
            except Exception as e:
                # Unlike a bare `except:`, this lets KeyboardInterrupt and
                # SystemExit propagate, and it leaves a trace of real
                # failures (permissions, bad path values, ...).
                log.write(f"WARNING: Failed to delete temp file {f}: {e}")

    deleter_thread = threading.Thread(target=safe_multi_delete,
                                      args=[to_be_deleted])
    deleter_thread.start()

    self.dump_align_viz_json(output_json_dir, db_type, result_dict)

    deleter_thread.join()

    # Write summary file
    summary_msg = f"Read2Seq Size: {len(read2seq)}, M8 lines {line_count}, " \
        f"{len(groups)} unique accession ids "
    summary_file_name = f"{output_json_dir}.summary"
    with open(summary_file_name, 'w') as summary_f:
        summary_f.write(summary_msg)
def get_accession_sequences(self, dest_dir, taxid, n=10):
    '''
    Retrieve NCBI NT references for the most-matched accession in each
    hitsummary2 file list, keeping at most n references, and write each
    reference to its own fasta file in dest_dir.

    Returns a dict mapping each kept accession to the path of its fasta
    file. Accessions for which no sequence file was retrieved are logged
    and left out. Returns an empty dict when n is 0.
    '''
    if n == 0:
        return {}

    # Retrieve files
    nt_db = self.additional_attributes["nt_db"]
    nt_loc_db = s3.fetch_reference(
        self.additional_files["nt_loc_db"],
        self.ref_dir_local,
        allow_s3mi=True)

    # Choose accessions to process.
    s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
    accessions = defaultdict(int)
    # TODO: Address issue where accessions in nr can be chosen in the following code.
    # These accessions will not be found in nt_loc and will be subsequently omitted.
    for file_list in s3_hitsummary2_files:
        tally = defaultdict(int)
        for s3_file in file_list:
            local_basename = s3_file.replace("/", "-").replace(":", "-")
            local_file = s3.fetch_from_s3(
                s3_file,
                os.path.join(self.output_dir_local, local_basename))
            if local_file is None:
                continue
            with open(local_file, 'r') as f:
                for line in f:
                    columns = line.rstrip().split("\t")
                    acc, species_taxid, genus_taxid, family_taxid = columns[3:7]
                    # Count the accession when the taxid matches at any rank.
                    if (int(species_taxid) == taxid
                            or int(genus_taxid) == taxid
                            or int(family_taxid) == taxid):
                        tally[acc] += 1
        if tally:
            # Only the single best accession of each file list is kept.
            best_acc = max(tally, key=tally.get)
            accessions[best_acc] += tally[best_acc]

    if len(accessions) > n:
        ranked = sorted(accessions.items(), key=lambda item: item[1], reverse=True)
        accessions = dict(ranked[:n])
    accessions = set(accessions.keys())

    # Make map of accession to sequence file
    accession2info = {acc: {} for acc in accessions}
    with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
        PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
            accession2info, nt_loc_dict, nt_db)

    # Put 1 fasta file per accession into the destination directory
    accession_fastas = {}
    for acc, info in accession2info.items():
        if info.get('seq_file') is None:
            log.write(f"WARNING: No sequence retrieved for {acc}")
            continue
        clean_accession = self.clean_name_for_ksnp3(acc)
        local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
        link_command = command_patterns.SingleCommand(
            cmd="ln",
            args=[
                "-s",
                info['seq_file'],
                local_fasta
            ]
        )
        command.execute(link_command)
        # Prepend a ">accession" header line to the sequence; the result
        # then replaces the file at local_fasta.
        add_header_command = command_patterns.ShellScriptCommand(
            script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
            named_args={
                'acc': acc,
                'local_fasta': local_fasta
            }
        )
        command.execute_with_output(add_header_command)
        command.move_file('temp_file', local_fasta)
        accession_fastas[acc] = local_fasta

    # Return kept accessions and paths of their fasta files
    return accession_fastas
def call_hits_m8(input_m8, lineage_map_path, accession2taxid_dict_path,
                 output_m8, output_summary, min_alignment_length,
                 deuterostome_path, taxon_whitelist_path, taxon_blacklist_path):
    """
    Determine the optimal taxon assignment for each read from the alignment
    results ('hit calling').

    When a read aligns to several distinct references, we need to find the
    level of the taxonomic hierarchy at which those alignments reach
    consensus, so that specificity is controlled for.

    Input:
      - m8 file of multiple alignments per read

    Outputs:
      - cleaned m8 file with a single, optimal alignment per read
      - summary file, including the taxonomy level at which specificity is
        reached

    Details:
      - A taxon is a group of any rank (e.g. species, genus, family, etc.).
      - A hit is a match of a read to a known reference labeled with an
        accession ID. NCBI's mapping of accession IDs to taxonomy IDs is
        used to retrieve the full taxonomic hierarchy for the accession ID.
      - The full taxonomy hierarchy of a hit is its "lineage" (species,
        genus, family, etc.). A hit normally has (positive) NCBI taxon IDs
        at all levels, with these exceptions:
          - An artificial negative taxon ID is used when the alignment is
            not specific at the level under consideration, i.e. when a
            read's reference matches disagree on the taxon ID at that level.
            Example: a read matches 5 references from different species
            (Escherichia albertii, Escherichia vulneris, Escherichia coli,
            ...) of the same genus (Escherichia). The genus level then gets
            the Escherichia taxon ID and the species level gets an
            artificial negative ID. That ID is derived from a negative base
            (INVALID_CALL_BASE_ID), the taxon level (e.g. 2 for genus) and
            the valid parent ID (e.g. genus Escherichia's taxon ID); see
            helper function _cleaned_taxid_lineage for the precise formula.
          - Some NCBI entries lack a full lineage classification, e.g.
            species and family are defined but genus is not. The undefined
            level is populated with an artificial negative ID defined in the
            same manner as above.
      - m8 files correspond to BLAST tabular output format 6:
            Columns: read_id | ref_id | percent_identity | alignment_length...

            * read_id = query (e.g., gene) sequence id
            * ref_id = subject (e.g., reference genome) sequence id
            * percent_identity = percentage of identical matches
            * alignment_length = length of the alignments
            * e_value = the expect value

            See:
              * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
              * http://www.metagenomics.wiki/tools/blast/evalue
    """
    # Both databases stay open only for the duration of the actual work.
    with open_file_db_by_extension(lineage_map_path) as lineage_map:
        with open_file_db_by_extension(accession2taxid_dict_path) as accession2taxid_dict:  # noqa
            _call_hits_m8_work(
                input_m8,
                lineage_map,
                accession2taxid_dict,
                output_m8,
                output_summary,
                min_alignment_length,
                deuterostome_path,
                taxon_whitelist_path,
                taxon_blacklist_path,
            )
def generate_taxon_count_json_from_m8(
        blastn_6_path, hit_level_path, count_type, lineage_map_path,
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path,
        duplicate_cluster_sizes_path, output_json_file):
    """
    Aggregate per-read hits into per-taxon statistics and dump them as JSON.

    The hit summary and the m8 file are read in lockstep (row i of one
    corresponds to row i of the other, same read_id). For every kept read
    the statistics are accumulated for its lineage and for each higher-rank
    suffix of that lineage.

    Args:
        blastn_6_path: m8 file, read with BlastnOutput6NTRerankedReader.
        hit_level_path: hit summary file, read with HitSummaryMergedReader.
        count_type: stored as "count_type" in every output row; "NT" and
            "merged_NT_NR" additionally affect e-value and alignment-length
            handling (see inline comments).
        lineage_map_path: lineage db opened with open_file_db_by_extension.
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path:
            forwarded to build_should_keep_filter.
        duplicate_cluster_sizes_path: forwarded to
            load_duplicate_cluster_sizes.
        output_json_file: destination path of the JSON document
            ``{"pipeline_output": {"taxon_counts_attributes": [...]}}``.

    Raises:
        AssertionError: if the read ids of the two inputs disagree or a row
            violates the expected value ranges.
    """
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.
    duplicate_cluster_sizes = load_duplicate_cluster_sizes(duplicate_cluster_sizes_path)

    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_path) as hit_level_f, \
            open(blastn_6_path) as blastn_6_f, \
            open_file_db_by_extension(lineage_map_path) as lineage_map:

        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            # Lines in m8_file and hit_level_file correspond (same read_id)
            for hit_row, blastn_6_row in zip(HitSummaryMergedReader(hit_level_f), BlastnOutput6NTRerankedReader(blastn_6_f)):
                # Retrieve data values from files
                read_id = hit_row["read_id"]
                hit_level = hit_row["level"]
                hit_taxid = hit_row["taxid"]
                if hit_level < 0:
                    log.write('hit_level < 0', debug=True)
                hit_source_count_type = hit_row.get("source_count_type")

                # The message is only formatted when the assertion fails,
                # instead of once per row.
                assert blastn_6_row["qseqid"] == read_id, \
                    "read_ids in %s and %s do not match: %s vs. %s" % (
                        os.path.basename(blastn_6_path),
                        os.path.basename(hit_level_path),
                        blastn_6_row["qseqid"], read_id)
                percent_identity = blastn_6_row["pident"]
                alignment_length = blastn_6_row["length"]

                if count_type == 'merged_NT_NR' and hit_source_count_type == 'NR':
                    # NOTE: At the moment of the change, applied ONLY in the scope of the prototype of NT/NR consensus project.
                    # Protein alignments (NR) are done at amino acid level. Each amino acid is composed of 3 nucleotides.
                    # To make alignment length values comparable across NT and NR alignments (for combined statistics),
                    # the NR alignment lengths are multiplied by 3.
                    alignment_length *= 3
                e_value = blastn_6_row["evalue"]

                # These have been filtered out before the creation of blastn_6_f and hit_level_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # rejects NaN

                if count_type == "NT" or hit_source_count_type == "NT":
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                # lineage_map expects string ids
                hit_taxids_all_levels = lineage_map.get(
                    str(hit_taxid), lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            duplicate_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        if hit_source_count_type:
                            agg_bucket.setdefault('source_count_type', set()).add(hit_source_count_type)
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_row = {
                "tax_id":
                agg_key[0],
                "tax_level":
                tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid':
                agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid':
                agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count":
                nonunique_count,
                "unique_count":
                unique_count,
                "dcr":
                nonunique_count / unique_count,
                "percent_identity":
                agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length":
                agg_bucket['sum_alignment_length'] / unique_count,
                "e_value":
                agg_bucket['sum_e_value'] / unique_count,
                "count_type":
                count_type
            }
            if agg_bucket.get('source_count_type'):
                # Sorted so the emitted JSON does not depend on set
                # iteration order, which varies with string-hash
                # randomisation between interpreter runs.
                taxon_counts_row['source_count_type'] = sorted(agg_bucket['source_count_type'])

            taxon_counts_attributes.append(taxon_counts_row)
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context(
        "generate_taxon_count_json_from_m8",
        {"substep": "json_dump", "output_json_file": output_json_file}
    ):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, taxon_whitelist_path,
                                      taxon_blacklist_path,
                                      cdhit_cluster_sizes_path,
                                      output_json_file):
    """
    Aggregate per-read hits into per-taxon statistics and dump them as JSON.

    ``hit_level_file`` and ``m8_file`` are tab-separated and are consumed in
    lockstep: line i of one describes the same read as line i of the other.
    Reads with a negative hit level are skipped. Every kept read contributes
    to its full lineage and to each higher-rank suffix of that lineage.

    m8 files correspond to BLAST tabular output format 6:
        Columns: read_id | _ref_id | percent_identity | alignment_length...

        * read_id = query (e.g., gene) sequence id
        * _ref_id = subject (e.g., reference genome) sequence id
        * percent_identity = percentage of identical matches
        * alignment_length = length of the alignments
        * e_value = the expect value

        See:
          * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
          * http://www.metagenomics.wiki/tools/blast/evalue

    Unless ``e_value_type`` is 'log10', e-values are converted to log10
    before being summed. The result is written to ``output_json_file`` as
    ``{"pipeline_output": {"taxon_counts_attributes": [...]}}``.
    """
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.
    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)

    should_keep = build_should_keep_filter(deuterostome_path,
                                           taxon_whitelist_path,
                                           taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
            open(m8_file, 'r', encoding='utf-8') as m8_f, \
            open_file_db_by_extension(lineage_map_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            # Lines in m8_file and hit_level_file correspond (same read_id);
            # iteration ends as soon as either file runs out of lines.
            for hit_line, m8_line in zip(hit_f, m8_f):
                # Retrieve data values from files
                hit_line_columns = hit_line.rstrip("\n").split("\t")
                read_id = hit_line_columns[0]
                hit_level = hit_line_columns[1]
                hit_taxid = hit_line_columns[2]
                if int(hit_level) < 0:
                    # Skip negative levels and continue
                    continue

                m8_line_columns = m8_line.split("\t")
                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(m8_file),
                    os.path.basename(hit_level_file), m8_line_columns[0],
                    hit_line_columns[0])
                assert m8_line_columns[0] == hit_line_columns[0], msg
                percent_identity = float(m8_line_columns[2])
                alignment_length = float(m8_line_columns[3])
                e_value = float(m8_line_columns[10])

                # These have been filtered out before the creation of m8_f and hit_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value
                if e_value_type != 'log10':
                    # e_value could be 0 when large contigs are mapped, so
                    # clamp it to the smallest normal double before log10.
                    e_value = math.log10(max(e_value, MIN_NORMAL_POSITIVE_DOUBLE))

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                hit_taxids_all_levels = lineage_map.get(
                    hit_taxid, lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if not should_keep(cleaned_hit_taxids_all_levels):
                    continue

                # Aggregate each level and collect statistics
                full_key = tuple(cleaned_hit_taxids_all_levels)
                for lowest_rank in range(len(full_key)):
                    # Chomp off the lowest ranks as we aggregate up the tree
                    agg_key = full_key[lowest_rank:]
                    agg_bucket = aggregation.get(agg_key)
                    if agg_bucket is None:
                        agg_bucket = {
                            'nonunique_count': 0,
                            'unique_count': 0,
                            'sum_percent_identity': 0.0,
                            'sum_alignment_length': 0.0,
                            'sum_e_value': 0.0
                        }
                        aggregation[agg_key] = agg_bucket
                    agg_bucket['nonunique_count'] += get_read_cluster_size(
                        cdhit_cluster_sizes, read_id)
                    agg_bucket['unique_count'] += 1
                    agg_bucket['sum_percent_identity'] += percent_identity
                    agg_bucket['sum_alignment_length'] += alignment_length
                    agg_bucket['sum_e_value'] += e_value

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_row = {
                "tax_id":
                agg_key[0],
                "tax_level":
                tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid':
                agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid':
                agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count":
                nonunique_count,
                "unique_count":
                unique_count,
                "dcr":
                nonunique_count / unique_count,
                "percent_identity":
                agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length":
                agg_bucket['sum_alignment_length'] / unique_count,
                "e_value":
                agg_bucket['sum_e_value'] / unique_count,
                "count_type":
                count_type
            }
            taxon_counts_attributes.append(taxon_row)
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    json_dump_context = {
        "substep": "json_dump",
        "output_json_file": output_json_file
    }
    with log.log_context("generate_taxon_count_json_from_m8", json_dump_context):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()