Example #1
    def run(self):
        (_align_m8, _deduped_m8, hit_summary,
         _orig_counts) = self.input_files_local[0]
        output_reference_fasta = self.output_files_local()[0]
        loc_db = s3.fetch_reference(
            self.additional_files["loc_db"],
            self.ref_dir_local,
            auto_unzip=True,  # This is default for references, but let's be explicit.
            allow_s3mi=ALLOW_S3MI)
        db_s3_path = self.additional_attributes["db"]
        # db_type = self.additional_attributes["db_type"]
        (_read_dict, accession_dict,
         _selected_genera) = m8.summarize_hits(hit_summary)
        with open_file_db_by_extension(
                loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as loc_dict:
            db_path = s3.fetch_reference(
                db_s3_path,
                self.ref_dir_local,
                auto_unzip=True,  # This is default for references, but let's be explicit.
                allow_s3mi=ALLOW_S3MI)
            self.download_ref_sequences_from_file(accession_dict, loc_dict,
                                                  db_path,
                                                  output_reference_fasta)
Example #2
    def run(self):
        # Setup
        if len(self.input_files_local) > 1:
            input_fa_name = self.input_files_local[0][0]
            hit_summary_files = {
                'NT': self.input_files_local[1][2],
                'NR': self.input_files_local[2][2]
            }
        else:
            # TODO(yf): Old implementation. TO BE DEPRECATED once 3.1 is fully deployed
            input_files = self.input_files_local[0]
            input_fa_name = input_files[0]
            hit_summary_files = {'NT': input_files[1], 'NR': input_files[2]}

        # Open lineage db
        lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                        self.ref_dir_local,
                                        allow_s3mi=True)

        # Get primary hit mappings
        valid_hits = PipelineStepGenerateTaxidFasta.parse_hits(
            hit_summary_files)

        with open(input_fa_name, 'rb') as input_fa, \
             open(self.output_files_local()[0], 'wb') as output_fa, \
             open_file_db_by_extension(lineage_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
            seq_name = input_fa.readline()
            seq_data = input_fa.readline()
            while len(seq_name) > 0 and len(seq_data) > 0:
                # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
                # :12720:8743/2"
                # Translate the read information into our custom format with fake
                # taxids at non-specific hit levels.
                annotated_read_id = seq_name.decode("utf-8").rstrip().lstrip('>')
                read_id = annotated_read_id.split(":", 4)[-1]

                nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    valid_hits, lineage_map, read_id, 'NR')
                nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    valid_hits, lineage_map, read_id, 'NT')

                fields = [
                    "family_nr", nr_taxid_family, "family_nt", nt_taxid_family
                ]
                fields += [
                    "genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus
                ]
                fields += [
                    "species_nr", nr_taxid_species, "species_nt",
                    nt_taxid_species
                ]
                fields += [annotated_read_id]
                new_read_name = ('>' + ':'.join(fields) + '\n').encode()

                output_fa.write(new_read_name)
                output_fa.write(seq_data)
                seq_name = input_fa.readline()
                seq_data = input_fa.readline()
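The header rewrite above is easier to see on a single concrete read. Below is a minimal standalone sketch using the example read id from the comment; the taxid values are placeholders (the real ones come from get_valid_lineage and the lineage database):

# Illustrative only: mirrors the header rewriting loop in Example #2.
annotated_read_id = "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"
# Drop the four annotation fields prepended to the original read id.
read_id = annotated_read_id.split(":", 4)[-1]
assert read_id == "NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"

# Placeholder taxids; the pipeline fills these from the lineage database,
# using artificial negative ids at non-specific levels.
nr_family, nr_genus, nr_species = "543", "561", "-100"
nt_family, nt_genus, nt_species = "543", "561", "562"

fields = ["family_nr", nr_family, "family_nt", nt_family,
          "genus_nr", nr_genus, "genus_nt", nt_genus,
          "species_nr", nr_species, "species_nt", nt_species,
          annotated_read_id]
new_read_name = '>' + ':'.join(fields) + '\n'
# '>family_nr:543:family_nt:543:genus_nr:561:genus_nt:561:species_nr:-100:species_nt:562:NR::NT:CP010376.2:...'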
Example #3
    def run(self):
        input_fa_name = self.input_files_local[0][0]
        if len(self.input_files_local) > 1:
            nt_hit_summary_path = self.input_files_local[1][2]
            nr_hit_summary_path = self.input_files_local[2][2]
        else:
            # This is used in `short-read-mngs/experimental.wdl`
            nt_hit_summary_path = self.input_files_local[0][1]
            nr_hit_summary_path = self.input_files_local[0][2]

        # Open lineage db
        lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                        self.ref_dir_local,
                                        allow_s3mi=True)

        with open(nt_hit_summary_path) as nt_hit_summary_f, open(
                nr_hit_summary_path) as nr_hit_summary_f:
            nr_hits_by_read_id = {
                row["read_id"]: (row["taxid"], row["level"])
                for row in HitSummaryMergedReader(nr_hit_summary_f)
            }
            nt_hits_by_read_id = {
                row["read_id"]: (row["taxid"], row["level"])
                for row in HitSummaryMergedReader(nt_hit_summary_f)
            }

        with open(self.output_files_local()[0], "w") as output_fa, \
             open_file_db_by_extension(lineage_db) as lineage_map:  # noqa
            for read in fasta.iterator(input_fa_name):
                # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
                # :12720:8743/2"
                # Translate the read information into our custom format with fake
                # taxids at non-specific hit levels.
                # TODO: (tmorse) fasta parsing
                annotated_read_id = read.header.lstrip('>')
                read_id = annotated_read_id.split(":", 4)[-1]

                nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    nr_hits_by_read_id, lineage_map, read_id)
                nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    nt_hits_by_read_id, lineage_map, read_id)

                fields = [
                    "family_nr", nr_taxid_family, "family_nt", nt_taxid_family
                ]
                fields += [
                    "genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus
                ]
                fields += [
                    "species_nr", nr_taxid_species, "species_nt",
                    nt_taxid_species
                ]
                fields += [annotated_read_id]
                new_read_name = ('>' + ':'.join(fields) + '\n')

                output_fa.write(new_read_name)
                output_fa.write(read.sequence + "\n")
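Example #3 is the newer variant of Example #2: it pre-builds one hits dict per database, keyed directly by read id. A small self-contained sketch of that shape follows; HitSummaryMergedReader is assumed to yield dict-like rows with read_id, taxid and level fields (shown as strings here, the actual value types depend on the reader):

# Rows shaped the way the dict comprehensions above expect them.
nr_rows = [
    {"read_id": "read_1/1", "taxid": "562", "level": "1"},
    {"read_id": "read_2/1", "taxid": "561", "level": "2"},
]
nr_hits_by_read_id = {
    row["read_id"]: (row["taxid"], row["level"]) for row in nr_rows
}

# The per-read lookup that get_valid_lineage needs is then O(1):
taxid, level = nr_hits_by_read_id["read_2/1"]
assert (taxid, level) == ("561", "2")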
Example #4
    def run(self):
        """
        Extract data from input files.
        Generate coverage viz data.
        Write JSON output files.
        """
        max_num_bins_coverage = self.additional_attributes.get(
            "max_num_bins_coverage", MAX_NUM_BINS_COVERAGE)
        num_accessions_per_taxon = self.additional_attributes.get(
            "num_accessions_per_taxon", NUM_ACCESSIONS_PER_TAXON)
        min_contig_size = self.additional_attributes.get(
            "min_contig_size", MIN_CONTIG_SIZE)

        info_db = s3.fetch_reference(self.additional_files["info_db"],
                                     self.ref_dir_local,
                                     allow_s3mi=True)
        with open_file_db_by_extension(
                info_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as info_dict:
            # Extract data from input files.
            (taxon_data, accession_data, contig_data,
             read_data) = self.prepare_data(self.input_files_local, info_dict,
                                            min_contig_size,
                                            num_accessions_per_taxon)

        # Generate the coverage viz data for each accession.
        coverage_viz_data = self.generate_coverage_viz_data(
            accession_data, contig_data, read_data, max_num_bins_coverage)

        # Generate the summary data, which contains a dict of all taxons for which coverage viz data is available.
        # For each taxon, summary data for the best accessions, plus the number of total accessions, is included.
        coverage_viz_summary_data = self.generate_coverage_viz_summary_data(
            taxon_data, accession_data, coverage_viz_data)

        coverage_viz_summary = self.output_files_local()[0]
        # Write the summary JSON file which is initially loaded on the report page.
        with open(coverage_viz_summary, 'w') as cvs:
            json.dump(coverage_viz_summary_data, cvs)

        # Create a separate coverage viz JSON file for each accession.
        # This file will be passed to the front-end when the user views that particular accession.
        coverage_viz_dir = os.path.join(self.output_dir_local, "coverage_viz")
        command.make_dirs(coverage_viz_dir)
        for accession_id in coverage_viz_data:
            upload_file = os.path.join(coverage_viz_dir,
                                       f"{accession_id}_coverage_viz.json")

            with open(upload_file, 'w') as uf:
                json.dump(coverage_viz_data[accession_id], uf)

        self.additional_output_folders_hidden.append(coverage_viz_dir)
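The per-accession output layout above (one summary JSON plus coverage_viz/{accession_id}_coverage_viz.json per accession) is easy to mimic outside the pipeline. A minimal sketch with a temporary directory and a made-up payload; the real payload structure comes from generate_coverage_viz_data and is not assumed here:

import json
import os
import tempfile

# Hypothetical stand-ins for self.output_dir_local and the per-accession data.
output_dir_local = tempfile.mkdtemp()
coverage_viz_data = {"CP010376.2": {"coverage": [0, 1, 2]}}  # made-up payload

coverage_viz_dir = os.path.join(output_dir_local, "coverage_viz")
os.makedirs(coverage_viz_dir, exist_ok=True)
for accession_id, data in coverage_viz_data.items():
    path = os.path.join(coverage_viz_dir, f"{accession_id}_coverage_viz.json")
    with open(path, "w") as f:
        json.dump(data, f)

# A consumer (e.g. the report front end) can later load one accession's file:
with open(os.path.join(coverage_viz_dir, "CP010376.2_coverage_viz.json")) as f:
    assert json.load(f) == {"coverage": [0, 1, 2]}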
Example #5
    def run(self):
        # Setup
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            auto_unzip=True,  # This is default for reference download, just being explicit.
            allow_s3mi=True)
        db_type = "nt"  # Only NT supported for now
        # TODO: Design a way to map in/out files more robustly, e.g. by name/type
        annotated_m8 = self.input_files_local[0][0]
        annotated_fasta = self.input_files_local[1][0]
        output_json_dir = os.path.join(self.output_dir_local, "align_viz")

        # Go through annotated_fasta with a db_type (NT/NR match). Infer the
        # family/genus/species info
        read2seq = PipelineStepGenerateAlignmentViz.parse_reads(
            annotated_fasta, db_type)
        log.write(f"Read to Seq dictionary size: {len(read2seq)}")

        groups, line_count = self.process_reads_from_m8_file(
            annotated_m8, read2seq)

        # If nt_db is not yet downloaded, then do download nt_db here
        if nt_db.startswith("s3://"):
            # TODO: Handle this better.  We might be poorly provisioned to allow s3mi speed
            # for this step, on the instance where it is running.
            nt_db = s3.fetch_reference(
                nt_db,
                self.ref_dir_local,
                auto_unzip=True,  # this is default for reference downloads, just being explicit
                allow_s3mi=True
            )  # s3mi probably okay here because we tend to download only NT and little else in this stage

        with open_file_db_by_extension(
                nt_loc_db, IdSeqDictValue.VALUE_TYPE_ARRAY) as nt_loc_dict:
            log.write("Getting sequences by accession list from file...")
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_file(
                groups, nt_loc_dict, nt_db)

        for _accession_id, ad in groups.items():
            ad['coverage_summary'] = PipelineStepGenerateAlignmentViz.calculate_alignment_coverage(
                ad)

        result_dict, to_be_deleted = self.populate_reference_sequences(groups)

        # Delete temp files
        def safe_multi_delete(files):
            for f in files:
                try:
                    os.remove(f)
                except OSError:
                    pass

        deleter_thread = threading.Thread(target=safe_multi_delete,
                                          args=[to_be_deleted])
        deleter_thread.start()

        self.dump_align_viz_json(output_json_dir, db_type, result_dict)

        deleter_thread.join()

        # Write summary file
        summary_msg = f"Read2Seq Size: {len(read2seq)}, M8 lines {line_count}, " \
            f"{len(groups)} unique accession ids "
        summary_file_name = f"{output_json_dir}.summary"
        with open(summary_file_name, 'w') as summary_f:
            summary_f.write(summary_msg)
Example #6
    def get_accession_sequences(self, dest_dir, taxid, n=10):
        '''
        Retrieve NCBI NT references for the most-matched accession in each hitsummary2 file, up to a maximum of n references.
        Write each reference to a separate fasta file.
        '''
        if n == 0:
            return {}

        # Retrieve files
        nt_db = self.additional_attributes["nt_db"]
        nt_loc_db = s3.fetch_reference(
            self.additional_files["nt_loc_db"],
            self.ref_dir_local,
            allow_s3mi=True)

        # Choose accessions to process.
        s3_hitsummary2_files = self.additional_attributes["hitsummary2_files"].values()
        accessions = defaultdict(lambda: 0)
        # TODO: Address issue where accessions in nr can be chosen in the following code.
        # These accessions will not be found in nt_loc and will be subsequently omitted.
        for file_list in s3_hitsummary2_files:
            tally = defaultdict(lambda: 0)
            for s3_file in file_list:
                local_basename = s3_file.replace("/", "-").replace(":", "-")
                local_file = s3.fetch_from_s3(
                    s3_file,
                    os.path.join(self.output_dir_local, local_basename))
                if local_file is None:
                    continue
                with open(local_file, 'r') as f:
                    for line in f:
                        acc, species_taxid, genus_taxid, family_taxid = line.rstrip().split("\t")[3:7]
                        if any(int(hit_taxid) == taxid for hit_taxid in [species_taxid, genus_taxid, family_taxid]):
                            tally[acc] += 1
            if tally:
                best_acc, max_count = max(tally.items(), key=lambda x: x[1])
                accessions[best_acc] += max_count
        if len(accessions) > n:
            accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
        accessions = set(accessions.keys())

        # Make map of accession to sequence file
        accession2info = dict((acc, {}) for acc in accessions)
        with open_file_db_by_extension(nt_loc_db) as nt_loc_dict:
            PipelineStepGenerateAlignmentViz.get_sequences_by_accession_list_from_s3(
                accession2info, nt_loc_dict, nt_db)

        # Put 1 fasta file per accession into the destination directory
        accession_fastas = {}
        for acc, info in accession2info.items():
            if 'seq_file' not in info or info['seq_file'] is None:
                log.write(f"WARNING: No sequence retrieved for {acc}")
                continue
            clean_accession = self.clean_name_for_ksnp3(acc)
            local_fasta = f"{dest_dir}/NCBI_NT_accession_{clean_accession}.fasta"
            command.execute(
                command_patterns.SingleCommand(
                    cmd="ln",
                    args=[
                        "-s",
                        info['seq_file'],
                        local_fasta
                    ]
                )
            )
            command.execute_with_output(
                command_patterns.ShellScriptCommand(
                    script=r'''echo ">${acc}" | cat - "${local_fasta}" > temp_file;''',
                    named_args={
                        'acc': acc,
                        'local_fasta': local_fasta
                    }
                )
            )
            command.move_file('temp_file', local_fasta)

            accession_fastas[acc] = local_fasta

        # Return kept accessions and paths of their fasta files
        return accession_fastas
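The accession selection above (best accession per hitsummary2 file by tally, then cap the union at n) can be exercised on toy data. A minimal sketch with made-up tallies and hypothetical accession ids:

from collections import defaultdict

n = 2
per_file_tallies = [                      # hypothetical per-file hit counts
    {"CP010376.2": 7, "NC_000913.3": 3},
    {"NC_000913.3": 9},
    {"MN908947.3": 1},
]

accessions = defaultdict(int)
for tally in per_file_tallies:
    best_acc, max_count = max(tally.items(), key=lambda x: x[1])
    accessions[best_acc] += max_count     # CP010376.2: 7, NC_000913.3: 9, MN908947.3: 1

if len(accessions) > n:
    accessions = dict(sorted(accessions.items(), key=lambda x: x[1], reverse=True)[:n])
accessions = set(accessions.keys())
assert accessions == {"CP010376.2", "NC_000913.3"}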
Example #7
def call_hits_m8(input_m8, lineage_map_path, accession2taxid_dict_path,
                 output_m8, output_summary, min_alignment_length,
                 deuterostome_path, taxon_whitelist_path, taxon_blacklist_path):
    """
    Determine the optimal taxon assignment for each read from the alignment
    results. When a read aligns to multiple distinct references, we need to
    assess at which level in the taxonomic hierarchy the multiple alignments
    reach consensus. We refer to this process of controlling for specificity
    as 'hit calling'.

    Input:
    - m8 file of multiple alignments per read

    Outputs:
    - cleaned m8 file with a single, optimal alignment per read
    - file with summary information, including taxonomy level at which
    specificity is reached

    Details:
    - A taxon is a group of any rank (e.g. species, genus, family, etc.).

    - A hit is a match of a read to a known reference labeled with an
    accession ID. We use NCBI's mapping of accession IDs to taxonomy IDs in
    order to retrieve the full taxonomic hierarchy for the accession ID.

    - The full taxonomy hierarchy for a hit is called its "lineage" (species,
    genus, family, etc.). A hit will normally have (positive) NCBI taxon IDs
    at all levels of the hierarchy, but there are some exceptions:

        - We use an artificial negative taxon ID if we have determined that
        the alignment is not specific at the taxonomy level under
        consideration. This happens when a read's multiple reference matches
        do not agree on taxon ID at the given level.

        For example, a read may match 5 references that all belong to
        different species (e.g. Escherichia albertii, Escherichia vulneris,
        Escherichia coli, ...), but to the same genus (Escherichia). In this
        case, we use the taxon ID for the genus (Escherichia) at the
        genus-level, but we populate the species-level with an artificial
        negative ID. The artificial ID is defined based on a negative base (
        INVALID_CALL_BASE_ID), the taxon level (e.g. 2 for genus), and the
        valid parent ID (e.g. genus Escherichia's taxon ID): see helper
        function _cleaned_taxid_lineage for the precise formula.

        - Certain entries in NCBI may not have a full lineage classification;
        for example species and family will be defined but genus will be
        undefined. In this case, we populate the undefined taxonomic level
        with an artificial negative ID defined in the same manner as above.

    - m8 files correspond to BLAST tabular output format 6:
        Columns: read_id | _ref_id | percent_identity | alignment_length...

        * read_id = query (e.g., gene) sequence id
        * ref_id = subject (e.g., reference genome) sequence id
        * percent_identity = percentage of identical matches
        * alignment_length = length of the alignments
        * e_value = the expect value

        See:
        * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
        * http://www.metagenomics.wiki/tools/blast/evalue
    """
    with open_file_db_by_extension(lineage_map_path) as lineage_map, \
         open_file_db_by_extension(accession2taxid_dict_path) as accession2taxid_dict:  # noqa
        _call_hits_m8_work(input_m8, lineage_map, accession2taxid_dict,
                           output_m8, output_summary, min_alignment_length,
                           deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)
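The genus-level consensus example in the docstring can be made concrete with a toy agreement check. This sketch is illustrative only; it does not reproduce the real artificial-ID formula in _cleaned_taxid_lineage, and the taxids below are placeholders:

def consensus_rank(lineages):
    """Return the index of the most specific rank (0=species, 1=genus, 2=family)
    at which every candidate lineage agrees, or None if they never agree."""
    for rank in range(3):
        if len({lineage[rank] for lineage in lineages}) == 1:
            return rank
    return None

# Three hits with distinct species taxids that share the same genus and family.
hits = [
    ("100001", "561", "543"),
    ("100002", "561", "543"),
    ("100003", "561", "543"),
]
assert consensus_rank(hits) == 1  # the call is made at genus level; the species
# slot would then be filled with an artificial negative taxid.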
Example #8
def generate_taxon_count_json_from_m8(
        blastn_6_path, hit_level_path, count_type, lineage_map_path,
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path,
        duplicate_cluster_sizes_path, output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    duplicate_cluster_sizes = load_duplicate_cluster_sizes(duplicate_cluster_sizes_path)

    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_path) as hit_level_f, \
         open(blastn_6_path) as blastn_6_f, \
         open_file_db_by_extension(lineage_map_path) as lineage_map:

        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            # Lines in m8_file and hit_level_file correspond (same read_id)
            for hit_row, blastn_6_row in zip(HitSummaryMergedReader(hit_level_f), BlastnOutput6NTRerankedReader(blastn_6_f)):
                # Retrieve data values from files
                read_id = hit_row["read_id"]
                hit_level = hit_row["level"]
                hit_taxid = hit_row["taxid"]
                if hit_level < 0:
                    log.write('hit_level < 0', debug=True)
                hit_source_count_type = hit_row.get("source_count_type")

                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(blastn_6_path), os.path.basename(hit_level_path),
                    blastn_6_row["qseqid"], read_id)
                assert blastn_6_row["qseqid"] == read_id, msg
                percent_identity = blastn_6_row["pident"]
                alignment_length = blastn_6_row["length"]

                if count_type == 'merged_NT_NR' and hit_source_count_type == 'NR':
                    # NOTE: At the moment of the change, applied ONLY in the scope of the prototype of NT/NR consensus project.
                    # Protein alignments (NR) are done at amino acid level. Each amino acid is composed of 3 nucleotides.
                    # To make alignment length values comparable across NT and NR alignments (for combined statistics),
                    # the NR alignment lengths are multiplied by 3.
                    alignment_length *= 3
                e_value = blastn_6_row["evalue"]

                # These have been filtered out before the creation of blastn_6_f and hit_level_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # sanity check: fails only if e_value is NaN

                if count_type == "NT" or hit_source_count_type == "NT":
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                # lineage_map expects string ids
                hit_taxids_all_levels = lineage_map.get(
                    str(hit_taxid), lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            duplicate_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        if hit_source_count_type:
                            agg_bucket.setdefault('source_count_type', set()).add(hit_source_count_type)
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_row = {
                "tax_id": agg_key[0],
                "tax_level": tax_level,
                # 'species_taxid': agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid': agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid': agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid': agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid': agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid': agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid': agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid': agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                # The "count" field is consumed by the webapp.
                "count": nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count": nonunique_count,
                "unique_count": unique_count,
                "dcr": nonunique_count / unique_count,
                "percent_identity": agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length": agg_bucket['sum_alignment_length'] / unique_count,
                "e_value": agg_bucket['sum_e_value'] / unique_count,
                "count_type": count_type
            }
            if agg_bucket.get('source_count_type'):
                taxon_counts_row['source_count_type'] = list(agg_bucket['source_count_type'])

            taxon_counts_attributes.append(taxon_counts_row)
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context(
        "generate_taxon_count_json_from_m8",
        {"substep": "json_dump", "output_json_file": output_json_file}
    ):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
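The "chomp off the lowest rank" aggregation above is easiest to follow on a single cleaned lineage. A minimal sketch with three ranks, one kept read, and placeholder taxids, showing how agg_key shrinks and how tax_level is later recovered:

num_ranks = 3                             # species, genus, family in this toy example
agg_key = ("562", "561", "543")           # cleaned lineage tuple for one kept read
aggregation = {}

while agg_key:
    bucket = aggregation.setdefault(agg_key, {"unique_count": 0})
    bucket["unique_count"] += 1           # the real code also updates the running sums
    agg_key = agg_key[1:]                 # drop the most specific remaining rank

# Buckets now exist for ("562", "561", "543"), ("561", "543") and ("543",).
for key in aggregation:
    tax_level = num_ranks - len(key) + 1  # 1 = species, 2 = genus, 3 = family
    assert key[0] == {1: "562", 2: "561", 3: "543"}[tax_level]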
Example #9
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, taxon_whitelist_path,
                                      taxon_blacklist_path,
                                      cdhit_cluster_sizes_path,
                                      output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)

    should_keep = build_should_keep_filter(deuterostome_path,
                                           taxon_whitelist_path,
                                           taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
         open(m8_file, 'r', encoding='utf-8') as m8_f, \
         open_file_db_by_extension(lineage_map_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        # Lines in m8_file and hit_level_file correspond (same read_id)
        hit_line = hit_f.readline()
        m8_line = m8_f.readline()
        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8",
                             {"substep": "loop_1"}):
            while hit_line and m8_line:
                # Retrieve data values from files
                hit_line_columns = hit_line.rstrip("\n").split("\t")
                read_id = hit_line_columns[0]
                hit_level = hit_line_columns[1]
                hit_taxid = hit_line_columns[2]
                if int(hit_level) < 0:  # Skip negative levels and continue
                    hit_line = hit_f.readline()
                    m8_line = m8_f.readline()
                    continue

                # m8 files correspond to BLAST tabular output format 6:
                # Columns: read_id | _ref_id | percent_identity | alignment_length...
                #
                # * read_id = query (e.g., gene) sequence id
                # * _ref_id = subject (e.g., reference genome) sequence id
                # * percent_identity = percentage of identical matches
                # * alignment_length = length of the alignments
                # * e_value = the expect value
                #
                # See:
                # * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
                # * http://www.metagenomics.wiki/tools/blast/evalue

                m8_line_columns = m8_line.split("\t")
                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(m8_file),
                    os.path.basename(hit_level_file), m8_line_columns[0],
                    hit_line_columns[0])
                assert m8_line_columns[0] == hit_line_columns[0], msg
                percent_identity = float(m8_line_columns[2])
                alignment_length = float(m8_line_columns[3])
                e_value = float(m8_line_columns[10])

                # These have been filtered out before the creation of m8_f and hit_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # sanity check: fails only if e_value is NaN
                if e_value_type != 'log10':
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                hit_taxids_all_levels = lineage_map.get(
                    hit_taxid, lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            cdhit_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

                hit_line = hit_f.readline()
                m8_line = m8_f.readline()

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8",
                         {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_attributes.append({
                "tax_id": agg_key[0],
                "tax_level": tax_level,
                # 'species_taxid': agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid': agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid': agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid': agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid': agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid': agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid': agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid': agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                # The "count" field is consumed by the webapp.
                "count": nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count": nonunique_count,
                "unique_count": unique_count,
                "dcr": nonunique_count / unique_count,
                "percent_identity": agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length": agg_bucket['sum_alignment_length'] / unique_count,
                "e_value": agg_bucket['sum_e_value'] / unique_count,
                "count_type": count_type
            })
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context("generate_taxon_count_json_from_m8", {
            "substep": "json_dump",
            "output_json_file": output_json_file
    }):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
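The column positions read out of each m8 line above (0 = read id, 2 = percent identity, 3 = alignment length, 10 = e-value) correspond to BLAST tabular output format 6. A standalone sketch against a made-up 12-column record:

# A hypothetical BLAST outfmt-6 line: 12 tab-separated columns
# (qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore).
m8_line = "read_1/1\tCP010376.2\t98.0\t100\t2\t0\t1\t100\t5000\t5099\t1e-30\t180.0\n"

m8_line_columns = m8_line.split("\t")
read_id = m8_line_columns[0]                  # query sequence id
percent_identity = float(m8_line_columns[2])  # percentage of identical matches
alignment_length = float(m8_line_columns[3])  # length of the alignment
e_value = float(m8_line_columns[10])          # expect value

assert read_id == "read_1/1"
assert (percent_identity, alignment_length, e_value) == (98.0, 100.0, 1e-30)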