Esempio n. 1
0
 def test_read_error_wrong_data_type(self):
     """The reader raises when a numeric column holds a non-numeric value."""
     comment_line = "# a comment"
     # Third field is "not_num" where the other fixtures carry a number.
     malformed_row = "1	MK468611.1	not_num	126	0	0	1	126	8433	8308	1.1e-74	290.5"
     lines = [comment_line, malformed_row]
     with self.assertRaises(Exception):
         next(BlastnOutput6Reader(lines))
    def optimal_hit_for_each_query_nr(blast_output_path, max_evalue):
        """Yield one optimal alignment per query from a BLAST outfmt-6 file.

        Selection happens in three steps:

        1. Rows whose ``evalue`` exceeds ``max_evalue`` are discarded.
        2. For every query (``qseqid``) only the alignments sharing the
           highest ``bitscore`` are kept (several if there are ties).
        3. Among those, the alignment whose accession (``sseqid``) occurs
           most often across all queries' best alignments wins. Remaining
           ties are resolved by taking the first one in file order.

        Args:
            blast_output_path: path of the outfmt-6 file to read.
            max_evalue: alignments with a larger e-value are ignored.

        Yields:
            The chosen alignment row for each query, in order of each
            query's first qualifying appearance in the file.
        """
        contigs_to_best_alignments = defaultdict(list)

        # The file is consumed completely before anything is yielded, so it is
        # closed here rather than being held open while the generator is
        # suspended (or abandoned) by the caller.
        with open(blast_output_path) as blastn_6_f:
            for alignment in BlastnOutput6Reader(blastn_6_f):
                # NOTE(review): the reader appears to yield "" for a missing
                # evalue when not filtering invalid rows; such a row would make
                # this comparison raise TypeError -- confirm inputs are clean.
                if alignment["evalue"] > max_evalue:
                    continue
                query = alignment["qseqid"]
                best_alignments = contigs_to_best_alignments[query]

                if not best_alignments or best_alignments[0]["bitscore"] < alignment["bitscore"]:
                    # First hit for this query, or a strictly better score.
                    contigs_to_best_alignments[query] = [alignment]
                elif best_alignments[0]["bitscore"] == alignment["bitscore"]:
                    # Keep every alignment tied for the best score.
                    best_alignments.append(alignment)

        # Count how many best alignments each accession received overall.
        accession_counts = defaultdict(int)
        for alignments in contigs_to_best_alignments.values():
            for alignment in alignments:
                accession_counts[alignment["sseqid"]] += 1

        # For each query pick the alignment with the most popular accession.
        # max() returns the first maximal item, so ties still resolve to the
        # earliest alignment (later we could factor in which taxid has the
        # most blast candidates).
        for alignments in contigs_to_best_alignments.values():
            yield max(alignments, key=lambda aln: accession_counts[aln["sseqid"]])
    def generate_m8_and_hit_summary(consolidated_dict, added_reads, read2blastm8,
                                    hit_summary_path, deduped_blastn_6_path,
                                    refined_hit_summary_path, refined_blastn_6_path):
        """Write the refined hit summary and refined m8 files.

        The refined hit summary contains, for every read in
        ``hit_summary_path``, its entry from ``consolidated_dict``, followed
        by every entry of ``added_reads``.

        The refined m8 contains, for every row in ``deduped_blastn_6_path``,
        the row registered for that read in ``read2blastm8`` (or the original
        row when there is none), followed by the ``read2blastm8`` rows of the
        reads in ``added_reads``. In both cases the row's ``qseqid`` is
        overwritten in place with the read id before it is written.
        """
        added_read_ids = added_reads.keys()

        # --- refined hit summary -------------------------------------------
        with open(hit_summary_path) as summary_in:
            with open(refined_hit_summary_path, "w") as summary_out:
                summary_writer = HitSummaryMergedWriter(summary_out)
                for original_hit in HitSummaryReader(summary_in):
                    summary_writer.writerow(consolidated_dict[original_hit["read_id"]])
                # Reads that only gained a hit through the new blast run.
                for read_id in added_read_ids:
                    summary_writer.writerow(added_reads[read_id])

        # --- refined m8 ------------------------------------------------------
        with open(deduped_blastn_6_path) as m8_in:
            with open(refined_blastn_6_path, "w") as m8_out:
                m8_writer = BlastnOutput6NTRerankedWriter(m8_out)
                for original_row in BlastnOutput6Reader(m8_in):
                    query_id = original_row["qseqid"]
                    refined_row = read2blastm8.get(query_id, original_row)
                    refined_row["qseqid"] = query_id
                    m8_writer.writerow(refined_row)

                # Reads that only gained a hit through the new blast run.
                for read_id in added_read_ids:
                    refined_row = read2blastm8.get(read_id)
                    refined_row["qseqid"] = read_id
                    m8_writer.writerow(refined_row)
Esempio n. 4
0
 def test_read_error_too_many_columns(self):
     """The reader raises when a row has one field more than the other fixtures."""
     comment_line = "# a comment"
     # Same as a valid row, with one extra trailing field ("1").
     oversized_row = "1	MK468611.1	100.0	126	0	0	1	126	8433	8308	1.1e-74	290.5	1"
     lines = [comment_line, oversized_row]
     with self.assertRaises(Exception):
         next(BlastnOutput6Reader(lines))
Esempio n. 5
0
 def test_read(self):
     """Comment lines are skipped, numbers are parsed, and a missing evalue reads as ""."""
     comment_line = "# a comment"
     complete_row = "1	MK468611.1	100.0	126	0	0	1	126	8433	8308	1.1e-74	290.5"
     # The evalue field (second to last) is left empty on purpose.
     row_without_evalue = "2	MK468611.1	90.0	126	0	0	1	126	8433	8308		290.5"
     lines = [comment_line, complete_row, row_without_evalue]

     parsed = list(BlastnOutput6Reader(lines))

     self.assertEqual(len(parsed), 2)
     self.assertEqual(parsed[0]["pident"], 100.0)
     self.assertEqual(parsed[1]["evalue"], "")
Esempio n. 6
0
    def test_filtration(self):
        """With filter_invalid=True only the single valid row survives."""
        valid_row = "1	MK468611.1	100.0	126	0	0	1	126	8433	8308	1.1e-74	290.5"
        rejected_rows = [
            "2	MK468611.1	135.0	126	0	0	1	126	8433	8308	1.1e-74	290.5",  # pident too high
            "3 	MK468611.1	-0.25	126	0	0	1	126	8433	8308	1.1e-74	290.5",  # pident too low
            "4	MK468611.1	-0.25	126	0	0	1	126	8433	8308	NaN	290.5",  # NaN error
            "5	MK468611.1	-0.25	126	0	0	1	126	8433	8308	1	290.5",  # error too high
        ]
        lines = ["# a comment", valid_row] + rejected_rows

        surviving = list(BlastnOutput6Reader(lines, filter_invalid=True))

        self.assertEqual(len(surviving), 1)
        self.assertEqual(surviving[0]["qseqid"], "1")
    def update_read_dict(read2contig, blast_top_blastn_6_path, read_dict, accession_dict, db_type):
        """Re-assign reads to the accession/lineage of the contig they belong to.

        Args:
            read2contig: mapping of read id -> contig id.
            blast_top_blastn_6_path: m8 file holding the blast rows of the
                contigs (``qseqid`` is the contig id, ``sseqid`` the accession).
            read_dict: mapping of read id -> hit-summary entry. It is updated
                IN PLACE and returned as ``consolidated_dict``.
            accession_dict: mapping of accession id ->
                ``(species_taxid, genus_taxid, family_taxid)``. It may lack
                some accessions.
            db_type: ``'nt'`` selects ``BlastnOutput6NTRerankedReader``; any
                other value selects ``BlastnOutput6Reader``.

        Returns:
            A tuple ``(consolidated_dict, read2blastm8, contig2lineage,
            added_reads)`` where ``read2blastm8`` maps read id -> the m8 row of
            its contig, ``contig2lineage`` maps contig id -> lineage for
            contigs whose accession is known, and ``added_reads`` holds new
            hit-summary entries for reads that were not in ``read_dict``.
        """
        consolidated_dict = read_dict  # alias: the caller's dict is mutated
        read2blastm8 = {}
        contig2accession = {}
        contig2lineage = {}
        added_reads = {}

        with open(blast_top_blastn_6_path) as blast_top_blastn_6_f:
            blastn_6_reader = BlastnOutput6NTRerankedReader(blast_top_blastn_6_f) if db_type == 'nt' else BlastnOutput6Reader(blast_top_blastn_6_f)
            for row in blastn_6_reader:
                contig_id = row["qseqid"]
                accession_id = row["sseqid"]
                contig2accession[contig_id] = (accession_id, row)
                # accession_dict may be missing accessions (see below), so do
                # not index it blindly: a missing accession used to raise
                # KeyError here even though the read loop tolerates it.
                if accession_id in accession_dict:
                    contig2lineage[contig_id] = accession_dict[accession_id]
                else:
                    # Keep contig2lineage in sync with contig2accession if the
                    # contig was seen earlier with a known accession.
                    contig2lineage.pop(contig_id, None)

        # The file is fully consumed; the rest works on in-memory data only.
        for read_id, contig_id in read2contig.items():
            (accession, m8_row) = contig2accession.get(contig_id, (None, None))
            # accession_dict comes from hit_summary, which comes from alignment and is filtered for taxids
            # this means that we don't need to filter here because we will never get unfiltered taxids from
            # accession_dict, however it may be missing accessions so we must handle that case.
            if accession and accession in accession_dict:
                (species_taxid, genus_taxid, family_taxid) = accession_dict[accession]
                if read_id in consolidated_dict:
                    # Existing read: overwrite its call with the contig's.
                    entry = consolidated_dict[read_id]
                    entry["taxid"] = species_taxid
                    entry["contig_id"] = contig_id
                    entry["contig_accession_id"] = accession
                    entry["contig_species_taxid"] = species_taxid
                    entry["contig_genus_taxid"] = genus_taxid
                    entry["contig_family_taxid"] = family_taxid
                else:
                    # Read without a previous hit: create a fresh entry.
                    added_reads[read_id] = {
                        "read_id": read_id,
                        "level": 1,
                        "taxid": species_taxid,
                        "accession_id": accession,
                        "species_taxid": species_taxid,
                        "genus_taxid": genus_taxid,
                        "family_taxid": family_taxid,
                        "contig_id": contig_id,
                        "contig_accession_id": accession,
                        "contig_species_taxid": species_taxid,
                        "contig_genus_taxid": genus_taxid,
                        "contig_family_taxid": family_taxid,
                        "from_assembly": "from_assembly",
                    }
            if m8_row:
                read2blastm8[read_id] = m8_row
        return (consolidated_dict, read2blastm8, contig2lineage, added_reads)
Esempio n. 8
0
def _call_hits_m8_work(input_blastn_6_path, lineage_map, accession2taxid_dict,
                       output_blastn_6_path, output_summary, min_alignment_length,
                       deuterostome_path, taxon_whitelist_path, taxon_blacklist_path):
    """Deduplicate an m8 file to one hit per read and write a hit summary.

    The input file is read twice. The first pass finds, for every read, the
    best e-value and the accessions that achieve it, and calls a taxid from
    them. The second pass emits, for every read, the first row that has the
    best e-value, matches the called accession and passes the taxon filter.

    Args:
        input_blastn_6_path: m8 file to summarize.
        lineage_map: mapping of taxid -> lineage (species, genus, family);
            unknown taxids fall back to ``lineage.NULL_LINEAGE``.
        accession2taxid_dict: mapping of version-less accession -> taxid;
            unknown accessions fall back to ``"NA"``.
        output_blastn_6_path: path of the deduplicated m8 file to write.
        output_summary: path of the hit summary file to write.
        min_alignment_length: forwarded to ``BlastnOutput6Reader``.
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path:
            forwarded to ``build_should_keep_filter``.

    Returns:
        None. Results are written to the two output files.
    """
    lineage_cache = {}

    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)

    # Helper functions
    def get_lineage(accession_id):
        """Find the lineage of the accession ID and utilize a cache for
        performance by reducing random IOPS, ameliorating a key performance
        bottleneck
        """
        if accession_id in lineage_cache:
            return lineage_cache[accession_id]
        # The accession is looked up without its ".version" suffix.
        accession_taxid = accession2taxid_dict.get(
            accession_id.split(".")[0], "NA")
        result = lineage_map.get(accession_taxid, lineage.NULL_LINEAGE)
        lineage_cache[accession_id] = result
        return result

    def accumulate(hits, accession_id):
        """Accumulate hits for summarizing hit information and specificity at
        each taxonomy level.

        ``hits`` is a list with one dict per lineage level, each mapping
        taxid -> list of accession ids; it is updated in place.
        """
        lineage_taxids = get_lineage(accession_id)
        for level, taxid_at_level in enumerate(lineage_taxids):
            if int(taxid_at_level) < 0:
                # Skip if we have a negative taxid. When an accession doesn't
                # provide species level info, it doesn't contradict any info
                # provided by other accessions. This occurs a lot and
                # handling it in this way seems to work well.
                continue
            # Append in place instead of rebuilding the list on every hit.
            hits[level].setdefault(taxid_at_level, []).append(accession_id)

    def most_frequent_accession(accession_list):
        """Return the accession that occurs most often in ``accession_list``."""
        counts = Counter(accession_list)
        return counts.most_common(1)[0][0]

    # FIXME: https://jira.czi.team/browse/IDSEQ-2738
    #  We want to move towards a general randomness solution in which
    #  all randomness is seeded based on the content of the original input.
    #  This is currently introducing non-determinism and hard coding
    #  an arbitrary seed here shouldn't impact correctness. This is only used
    #  to break ties.
    randgen = random.Random(x=4)  # chosen by fair dice roll, guaranteed to be random

    def call_hit_level_v2(hits):
        """Always call hit at the species level with the taxid with most matches.

        Returns ``(1, taxid, accession_id)`` when a species-level taxid is
        available and ``(-1, "-1", None)`` otherwise. Ties between taxids are
        broken with ``randgen``.
        """
        species_level_hits = hits[0]
        max_match = 0
        taxid_candidates = []
        for taxid, accession_list in species_level_hits.items():
            accession_len = len(accession_list)
            if accession_len > max_match:
                taxid_candidates = [taxid]
                max_match = accession_len
            elif accession_len == max_match:
                taxid_candidates.append(taxid)
        if max_match > 0:
            selected_taxid = taxid_candidates[0]
            if len(taxid_candidates) > 1:
                # Kept as sample(): switching to another randgen call could
                # change which candidate the fixed seed selects.
                selected_taxid = randgen.sample(taxid_candidates, 1)[0]
            accession_id = most_frequent_accession(
                species_level_hits[selected_taxid])
            return 1, selected_taxid, accession_id
        return -1, "-1", None

    # Deduplicate m8 and summarize hits
    # summary maps read_id -> (best e-value, hits per level, called hit).
    summary = {}
    count = 0
    LOG_INCREMENT = 50000
    log.write(f"Starting to summarize hits from {input_blastn_6_path}.")
    with open(input_blastn_6_path) as input_blastn_6_f:
        for row in BlastnOutput6Reader(input_blastn_6_f, filter_invalid=True, min_alignment_length=min_alignment_length):
            read_id, accession_id, e_value = row["qseqid"], row["sseqid"], row["evalue"]
            # The Expect value (E) is a parameter that describes the number of
            # hits one can 'expect' to see by chance when searching a database of
            # a particular size. It decreases exponentially as the Score (S) of
            # the match increases. Essentially, the E value describes the random
            # background noise. https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web
            # &PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ
            my_best_evalue, hits, _ = summary.get(read_id, (float("inf"), [{}, {}, {}], None))
            if my_best_evalue > e_value:
                # If we find a new better e value we want to start accumulation over
                hits = [{}, {}, {}]
                accumulate(hits, accession_id)
                my_best_evalue = e_value
            elif my_best_evalue == e_value:
                # If we find another accession with the same e value we want to accumulate it
                accumulate(hits, accession_id)
            # The hit is re-called on every row (not only when hits changed);
            # this is kept as is because each call may consume from randgen.
            summary[read_id] = my_best_evalue, hits, call_hit_level_v2(hits)
            count += 1
            if count % LOG_INCREMENT == 0:
                log.write(f"Summarized hits for {count} read ids from {input_blastn_6_path}, and counting.")

    log.write(f"Summarized hits for all {count} read ids from {input_blastn_6_path}.")

    # Generate output files: the deduplicated m8 file and the hit summary
    # with the summary level info.
    emitted = set()
    with open(output_blastn_6_path, "w") as blastn_6_out_f, open(output_summary, "w") as hit_summary_out_f, open(input_blastn_6_path) as input_blastn_6_f:
        blastn_6_writer = BlastnOutput6Writer(blastn_6_out_f)
        hit_summary_writer = HitSummaryWriter(hit_summary_out_f)
        # Iterator over the lines of the m8 file. Emit the hit with the
        # best value that provides the most specific taxonomy
        # information. If there are multiple hits (also called multiple
        # accession IDs) for a given read that all have the same e-value,
        # some may provide species information and some may only provide
        # genus information. We want to emit the one that provides the
        # species information because from that we can infer the rest of
        # the lineage. If we accidentally emitted the one that provided
        # only genus info, downstream steps may have difficulty
        # recovering the species.

        # TODO: Consider all hits within a fixed margin of the best e-value.
        # This change may need to be accompanied by a change to
        # GSNAP/RAPSearch2 parameters.
        for row in BlastnOutput6Reader(input_blastn_6_f, filter_invalid=True, min_alignment_length=min_alignment_length):
            read_id, accession_id, e_value = row["qseqid"], row["sseqid"], row["evalue"]
            if read_id in emitted:
                continue

            # Read the fields from the summary level info
            best_e_value, _, (hit_level, taxid,
                              best_accession_id) = summary[read_id]
            if best_e_value == e_value and best_accession_id in (None, accession_id) and should_keep([taxid]):
                # Read out the hit with the best value that provides the
                # most specific taxonomy information.
                emitted.add(read_id)
                blastn_6_writer.writerow(row)
                species_taxid = -1
                genus_taxid = -1
                family_taxid = -1
                if best_accession_id is not None:
                    (species_taxid, genus_taxid, family_taxid) = get_lineage(
                        best_accession_id)

                hit_summary_writer.writerow({
                    "read_id": read_id,
                    "level": hit_level,
                    "taxid": taxid,
                    "accession_id": best_accession_id,
                    "species_taxid": species_taxid,
                    "genus_taxid": genus_taxid,
                    "family_taxid": family_taxid,
                })
Esempio n. 9
0
 def test_read_error_empty_line(self):
     """The reader raises when the only input line is empty."""
     empty_line = ""
     lines = [empty_line]
     with self.assertRaises(Exception):
         next(BlastnOutput6Reader(lines))