def test_read_error_wrong_data_type(self):
    """A data row holding the token "not_num" in place of a number must make the reader raise."""
    malformed_lines = [
        "# a comment",  # a comment
        "1 MK468611.1 not_num 126 0 0 1 126 8433 8308 1.1e-74 290.5",
    ]
    # The reader is built inside the context manager so that an error raised
    # at construction time also satisfies the assertion.
    with self.assertRaises(Exception):
        reader = BlastnOutput6Reader(malformed_lines)
        next(reader)
def optimal_hit_for_each_query_nr(blast_output_path, max_evalue):
    """Yield one alignment row per query from a BLAST tabular output file.

    Rows whose ``evalue`` exceeds ``max_evalue`` are ignored. For every
    remaining query (``qseqid``) the rows with the highest ``bitscore`` are
    kept, ties included. Among those, the row whose subject (``sseqid``)
    appears most often across all queries' best rows is yielded; when that is
    still a tie, the row encountered first wins.

    Args:
        blast_output_path: path of the file handed to ``BlastnOutput6Reader``.
        max_evalue: largest ``evalue`` a row may have and still be considered.

    Yields:
        The selected row for each query, in order of first appearance of the
        query among the rows that passed the e-value filter.
    """
    best_by_query = defaultdict(list)

    with open(blast_output_path) as blast_file:
        for hit in BlastnOutput6Reader(blast_file):
            if hit["evalue"] > max_evalue:
                continue
            query_id = hit["qseqid"]
            current_best = best_by_query[query_id]
            if not current_best or current_best[0]["bitscore"] < hit["bitscore"]:
                # First hit for this query, or a strictly better score: start over.
                best_by_query[query_id] = [hit]
            elif current_best[0]["bitscore"] == hit["bitscore"]:
                # Same score as the current best: keep it as a tie.
                current_best.append(hit)

    # Count how many best alignments each accession collected over all queries.
    hits_per_accession = defaultdict(int)
    for tied_hits in best_by_query.values():
        for hit in tied_hits:
            hits_per_accession[hit["sseqid"]] += 1

    # max() returns the first maximal element, so remaining ties resolve to
    # the earliest row (later we could factor in which taxid has the most
    # blast candidates).
    for tied_hits in best_by_query.values():
        yield max(tied_hits, key=lambda hit: hits_per_accession[hit["sseqid"]])
def generate_m8_and_hit_summary(consolidated_dict, added_reads, read2blastm8, hit_summary_path, deduped_blastn_6_path, refined_hit_summary_path, refined_blastn_6_path):
    """Write the refined hit summary and the refined m8 file.

    Args:
        consolidated_dict: mapping from read id to the summary row written for
            each read found in ``hit_summary_path``.
        added_reads: mapping from read id to the summary row of reads that are
            not in the original hit summary; appended after the existing reads.
        read2blastm8: mapping from read id to the m8 row that replaces the
            read's row from ``deduped_blastn_6_path``. Rows taken from this
            mapping have their ``qseqid`` overwritten in place.
        hit_summary_path: existing hit summary to read.
        deduped_blastn_6_path: existing m8 file to read.
        refined_hit_summary_path: hit summary file to write.
        refined_blastn_6_path: m8 file to write.
    """
    # Refined hit summary: existing reads first, then the newly added ones.
    with open(hit_summary_path) as summary_in, open(refined_hit_summary_path, "w") as summary_out:
        summary_writer = HitSummaryMergedWriter(summary_out)
        for hit in HitSummaryReader(summary_in):
            summary_writer.writerow(consolidated_dict[hit["read_id"]])
        for added_row in added_reads.values():
            summary_writer.writerow(added_row)

    # Refined m8: same ordering, existing rows first, then the added reads.
    with open(deduped_blastn_6_path) as m8_in, open(refined_blastn_6_path, "w") as m8_out:
        m8_writer = BlastnOutput6NTRerankedWriter(m8_out)
        for original_row in BlastnOutput6Reader(m8_in):
            read_id = original_row["qseqid"]
            # Fall back to the original row when no replacement is recorded.
            out_row = read2blastm8.get(read_id, original_row)
            out_row["qseqid"] = read_id
            m8_writer.writerow(out_row)
        for read_id in added_reads:
            out_row = read2blastm8.get(read_id)
            out_row["qseqid"] = read_id
            m8_writer.writerow(out_row)
def test_read_error_too_many_columns(self):
    """A data row carrying one extra trailing field must make the reader raise."""
    oversized_lines = [
        "# a comment",  # a comment
        "1 MK468611.1 100.0 126 0 0 1 126 8433 8308 1.1e-74 290.5 1",
    ]
    # The reader is built inside the context manager so that an error raised
    # at construction time also satisfies the assertion.
    with self.assertRaises(Exception):
        reader = BlastnOutput6Reader(oversized_lines)
        next(reader)
def test_read(self):
    """Both data rows are parsed; the comment line is not returned as a row."""
    # NOTE(review): the second row is meant to have an empty evalue field,
    # which needs two adjacent separators; confirm the separators in these
    # literals are intact.
    input_lines = [
        "# a comment",  # a comment
        "1 MK468611.1 100.0 126 0 0 1 126 8433 8308 1.1e-74 290.5",
        "2 MK468611.1 90.0 126 0 0 1 126 8433 8308 290.5",  # missing evalue
    ]
    parsed_rows = list(BlastnOutput6Reader(input_lines))
    self.assertEqual(len(parsed_rows), 2)
    # pident of the first row comes back as a number, not as text.
    self.assertEqual(parsed_rows[0]["pident"], 100.0)
    # The missing evalue of the second row is reported as an empty string.
    self.assertEqual(parsed_rows[1]["evalue"], "")
def test_filtration(self):
    """With filter_invalid=True only the row with qseqid "1" is returned."""
    input_lines = [
        "# a comment",  # a comment
        "1 MK468611.1 100.0 126 0 0 1 126 8433 8308 1.1e-74 290.5",
        "2 MK468611.1 135.0 126 0 0 1 126 8433 8308 1.1e-74 290.5",  # pident too high
        "3 MK468611.1 -0.25 126 0 0 1 126 8433 8308 1.1e-74 290.5",  # pident too low
        "4 MK468611.1 -0.25 126 0 0 1 126 8433 8308 NaN 290.5",  # NaN error
        "5 MK468611.1 -0.25 126 0 0 1 126 8433 8308 1 290.5",  # error too high
    ]
    kept_rows = list(BlastnOutput6Reader(input_lines, filter_invalid=True))
    self.assertEqual(len(kept_rows), 1)
    self.assertEqual(kept_rows[0]["qseqid"], "1")
def update_read_dict(read2contig, blast_top_blastn_6_path, read_dict, accession_dict, db_type):
    """Fold the contig-level top hits into the per-read hit information.

    Args:
        read2contig: mapping from read id to the id of the contig it belongs to.
        blast_top_blastn_6_path: m8 file of contig hits; read with
            ``BlastnOutput6NTRerankedReader`` when ``db_type`` is ``'nt'`` and
            with ``BlastnOutput6Reader`` otherwise.
        read_dict: mapping from read id to that read's summary row. It is
            updated in place and returned as the first tuple element.
        accession_dict: mapping from accession id to a
            ``(species_taxid, genus_taxid, family_taxid)`` triple.
        db_type: database type; only the value ``'nt'`` is treated specially.

    Returns:
        A tuple ``(read_dict, read2blastm8, contig2lineage, added_reads)``:
        ``read2blastm8`` maps read id to the m8 row of its contig,
        ``contig2lineage`` maps contig id to its ``accession_dict`` entry and
        ``added_reads`` holds summary rows for reads absent from ``read_dict``.
    """
    contig_hits = {}
    contig2lineage = {}

    with open(blast_top_blastn_6_path) as top_hits_file:
        reader_cls = BlastnOutput6NTRerankedReader if db_type == 'nt' else BlastnOutput6Reader
        for hit in reader_cls(top_hits_file):
            contig = hit["qseqid"]
            hit_accession = hit["sseqid"]
            contig_hits[contig] = (hit_accession, hit)
            # NOTE(review): unguarded lookup; for a plain dict this raises
            # KeyError when the accession is absent, unlike the guarded lookup
            # in the per-read loop below. Confirm accession_dict always covers
            # the top hits.
            contig2lineage[contig] = accession_dict[hit_accession]

    read2blastm8 = {}
    added_reads = {}
    for read_id, contig in read2contig.items():
        accession, m8_row = contig_hits.get(contig, (None, None))
        # accession_dict comes from hit_summary, which comes from alignment and
        # is filtered for taxids. This means that we don't need to filter here
        # because we will never get unfiltered taxids from accession_dict;
        # however it may be missing accessions so we must handle that case.
        if accession and accession in accession_dict:
            species_taxid, genus_taxid, family_taxid = accession_dict[accession]
            if read_id in read_dict:
                # Known read: overwrite its call with the contig's assignment.
                entry = read_dict[read_id]
                entry["taxid"] = species_taxid
                entry["contig_id"] = contig
                entry["contig_accession_id"] = accession
                entry["contig_species_taxid"] = species_taxid
                entry["contig_genus_taxid"] = genus_taxid
                entry["contig_family_taxid"] = family_taxid
            else:
                # Read without a previous hit: build a fresh summary row.
                added_reads[read_id] = {
                    "read_id": read_id,
                    "level": 1,
                    "taxid": species_taxid,
                    "accession_id": accession,
                    "species_taxid": species_taxid,
                    "genus_taxid": genus_taxid,
                    "family_taxid": family_taxid,
                    "contig_id": contig,
                    "contig_accession_id": accession,
                    "contig_species_taxid": species_taxid,
                    "contig_genus_taxid": genus_taxid,
                    "contig_family_taxid": family_taxid,
                    "from_assembly": "from_assembly",
                }
        # NOTE(review): recorded independently of the accession check above;
        # confirm this nesting is the intended one.
        if m8_row:
            read2blastm8[read_id] = m8_row

    return (read_dict, read2blastm8, contig2lineage, added_reads)
def _call_hits_m8_work(input_blastn_6_path, lineage_map, accession2taxid_dict,
                       output_blastn_6_path, output_summary, min_alignment_length,
                       deuterostome_path, taxon_whitelist_path, taxon_blacklist_path):
    """Deduplicate an m8 file to one row per read and write a hit summary.

    The input file is read twice. The first pass collects, for every read,
    the accessions seen at that read's lowest e-value and calls a
    species-level hit from them. The second pass writes, for every read, the
    first row that carries the best e-value, matches the called accession and
    passes the taxon filter, together with a matching summary row.

    Args:
        input_blastn_6_path: path of the m8 file to read (read twice).
        lineage_map: mapping (``.get`` is used) from taxid to a
            ``(species_taxid, genus_taxid, family_taxid)`` lineage; unknown
            taxids fall back to ``lineage.NULL_LINEAGE``.
        accession2taxid_dict: mapping (``.get`` is used) from the accession id
            with everything after the first ``"."`` removed to a taxid;
            unknown accessions fall back to ``"NA"``.
        output_blastn_6_path: path of the deduplicated m8 file to write.
        output_summary: path of the hit summary file to write.
        min_alignment_length: forwarded to ``BlastnOutput6Reader``.
        deuterostome_path: forwarded to ``build_should_keep_filter``.
        taxon_whitelist_path: forwarded to ``build_should_keep_filter``.
        taxon_blacklist_path: forwarded to ``build_should_keep_filter``.
    """
    lineage_cache = {}
    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)

    # Helper functions
    def get_lineage(accession_id):
        """Find the lineage of the accession ID and utilize a cache for
        performance by reducing random IOPS, ameliorating a key
        performance bottleneck
        """
        if accession_id in lineage_cache:
            return lineage_cache[accession_id]
        accession_taxid = accession2taxid_dict.get(
            accession_id.split(".")[0], "NA")
        result = lineage_map.get(accession_taxid, lineage.NULL_LINEAGE)
        lineage_cache[accession_id] = result
        return result

    def accumulate(hits, accession_id):
        """Accumulate hits for summarizing hit information and specificity at
        each taxonomy level.

        ``hits`` is a list with one dict per lineage level; each dict maps a
        taxid to the list of accessions seen for it and is updated in place.
        """
        lineage_taxids = get_lineage(accession_id)
        for level, taxid_at_level in enumerate(lineage_taxids):
            if int(taxid_at_level) < 0:
                # Skip if we have a negative taxid. When an accession doesn't
                # provide species level info, it doesn't contradict any info
                # provided by other accessions. This occurs a lot and
                # handling it in this way seems to work well.
                continue
            # Append in place instead of rebuilding the list for every hit,
            # which copied the whole list each time.
            hits[level].setdefault(taxid_at_level, []).append(accession_id)

    def most_frequent_accession(accession_list):
        """Return the accession that occurs most often in ``accession_list``."""
        counts = Counter(accession_list)
        return counts.most_common(1)[0][0]

    # FIXME: https://jira.czi.team/browse/IDSEQ-2738
    #  We want to move towards a general randomness solution in which
    #  all randomness is seeded based on the content of the original input.
    #  This is currently introducing non-determinism and hard coding
    #  an arbitrary seed here shouldn't impact correctness. This is only used
    #  to break ties.
    randgen = random.Random(x=4)  # chosen by fair dice roll, guaranteed to be random

    def call_hit_level_v2(hits):
        """Always call hit at the species level with the taxid with most matches.

        Returns ``(1, taxid, accession_id)`` when at least one species-level
        accession was accumulated and ``(-1, "-1", None)`` otherwise. Ties
        between taxids are broken with ``randgen``.
        """
        species_level_hits = hits[0]
        max_match = 0
        taxid_candidates = []
        for taxid, accession_list in species_level_hits.items():
            accession_len = len(accession_list)
            if accession_len > max_match:
                taxid_candidates = [taxid]
                max_match = accession_len
            elif accession_len == max_match:
                taxid_candidates.append(taxid)
        if max_match > 0:
            selected_taxid = taxid_candidates[0]
            if len(taxid_candidates) > 1:
                selected_taxid = randgen.sample(taxid_candidates, 1)[0]
            accession_id = most_frequent_accession(
                species_level_hits[selected_taxid])
            return 1, selected_taxid, accession_id
        return -1, "-1", None

    # Deduplicate m8 and summarize hits
    summary = {}
    count = 0
    LOG_INCREMENT = 50000
    log.write(f"Starting to summarize hits from {input_blastn_6_path}.")
    with open(input_blastn_6_path) as input_blastn_6_f:
        for row in BlastnOutput6Reader(input_blastn_6_f, filter_invalid=True, min_alignment_length=min_alignment_length):
            read_id, accession_id, e_value = row["qseqid"], row["sseqid"], row["evalue"]
            # The Expect value (E) is a parameter that describes the number of
            # hits one can 'expect' to see by chance when searching a database of
            # a particular size. It decreases exponentially as the Score (S) of
            # the match increases. Essentially, the E value describes the random
            # background noise.
            # https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ
            my_best_evalue, hits, _ = summary.get(read_id, (float("inf"), [{}, {}, {}], None))
            if my_best_evalue > e_value:
                # If we find a new better e value we want to start accumulation over
                hits = [{}, {}, {}]
                accumulate(hits, accession_id)
                my_best_evalue = e_value
            elif my_best_evalue == e_value:
                # If we find another accession with the same e value we want to accumulate it
                accumulate(hits, accession_id)
            # The hit is re-called for every row, including rows that did not
            # change `hits`. This is kept on purpose: each call may advance
            # `randgen`, so skipping calls would change how ties are broken.
            summary[read_id] = my_best_evalue, hits, call_hit_level_v2(hits)
            count += 1
            if count % LOG_INCREMENT == 0:
                log.write(f"Summarized hits for {count} read ids from {input_blastn_6_path}, and counting.")
    log.write(f"Summarized hits for all {count} read ids from {input_blastn_6_path}.")

    # Generate output files. blastn_6_out_f is the main output m8 file and
    # hit_summary_out_f is the summary level info.
    emitted = set()
    with open(output_blastn_6_path, "w") as blastn_6_out_f, open(output_summary, "w") as hit_summary_out_f, open(input_blastn_6_path) as input_blastn_6_f:
        blastn_6_writer = BlastnOutput6Writer(blastn_6_out_f)
        hit_summary_writer = HitSummaryWriter(hit_summary_out_f)
        # Iterator over the lines of the m8 file. Emit the hit with the
        # best value that provides the most specific taxonomy
        # information. If there are multiple hits (also called multiple
        # accession IDs) for a given read that all have the same e-value,
        # some may provide species information and some may only provide
        # genus information. We want to emit the one that provides the
        # species information because from that we can infer the rest of
        # the lineage. If we accidentally emitted the one that provided
        # only genus info, downstream steps may have difficulty
        # recovering the species.
        # TODO: Consider all hits within a fixed margin of the best e-value.
        # This change may need to be accompanied by a change to
        # GSNAP/RAPSearch2 parameters.
        for row in BlastnOutput6Reader(input_blastn_6_f, filter_invalid=True, min_alignment_length=min_alignment_length):
            read_id, accession_id, e_value = row["qseqid"], row["sseqid"], row["evalue"]
            if read_id in emitted:
                continue

            # Read the fields from the summary level info
            best_e_value, _, (hit_level, taxid, best_accession_id) = summary[read_id]
            if best_e_value == e_value and best_accession_id in (None, accession_id) and should_keep([taxid]):
                # Read out the hit with the best value that provides the
                # most specific taxonomy information.
                emitted.add(read_id)
                blastn_6_writer.writerow(row)
                # -1 marks every level as unknown when no accession was called.
                species_taxid = -1
                genus_taxid = -1
                family_taxid = -1
                if best_accession_id is not None:
                    (species_taxid, genus_taxid, family_taxid) = get_lineage(
                        best_accession_id)
                hit_summary_writer.writerow({
                    "read_id": read_id,
                    "level": hit_level,
                    "taxid": taxid,
                    "accession_id": best_accession_id,
                    "species_taxid": species_taxid,
                    "genus_taxid": genus_taxid,
                    "family_taxid": family_taxid,
                })
def test_read_error_empty_line(self):
    """An input consisting of a single empty line must make the reader raise."""
    only_blank_line = [""]
    # The reader is built inside the context manager so that an error raised
    # at construction time also satisfies the assertion.
    with self.assertRaises(Exception):
        reader = BlastnOutput6Reader(only_blank_line)
        next(reader)