def generate_info_from_sam(bowtie_sam_file, read2contig,
                            cdhitdup_cluster_sizes_path):
    contig_stats = defaultdict(int)
    contig_unique_counts = defaultdict(int)
    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhitdup_cluster_sizes_path)
    with open(bowtie_sam_file, "r", encoding='utf-8') as samf:
        for line in samf:
            if line[0] == '@':
                continue
            fields = line.split("\t")
            read = fields[0]
            contig = fields[2]
            # these are non-unique read counts now (cd-hit-dup clusters expanded)
            contig_stats[contig] += get_read_cluster_size(cdhit_cluster_sizes, read)
            contig_unique_counts[contig] += 1
            if contig != '*':
                read2contig[read] = contig
    # TODO: can't we just filter these out after SPAdes, in one place?
    for contig, unique_count in contig_unique_counts.items():
        if unique_count < MIN_CONTIG_SIZE:
            del contig_stats[contig]
        elif READ_COUNTING_MODE == ReadCountingMode.COUNT_UNIQUE:
            contig_stats[contig] = unique_count
    return contig_stats
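
# Illustrative sketch only: the two SAM columns the loop above relies on are
# field 0 (QNAME, the read id) and field 2 (RNAME, the contig the read aligned
# to, or '*' if unmapped), per the SAM specification. The record below is made up.
_example_sam_line = "read_1\t0\tcontig_42\t100\t60\t150M\t*\t0\t0\tACGT\tIIII\n"
_fields = _example_sam_line.split("\t")
assert (_fields[0], _fields[2]) == ("read_1", "contig_42")
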
def _count_reads_work(self, cluster_key, counter_name, fasta_files):
    # Count reads including duplicates (expanding cd-hit-dup clusters).
    self.should_count_reads = True
    self.counts_dict[counter_name] = count.reads_in_group(
        file_group=fasta_files,
        cluster_sizes=load_cdhit_cluster_sizes(self.input_cluster_sizes_path()),
        cluster_key=cluster_key)
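
# A hedged sketch (not the actual idseq-dag implementation) of the helper used
# throughout these examples: cd-hit-dup collapses duplicate reads, and the
# cluster-sizes table maps a representative read id to the number of raw reads
# it stands for, so "expanding" a count means adding the cluster size instead of 1.
def _get_read_cluster_size_sketch(cluster_sizes, read_id):
    # Assumes cluster_sizes behaves like a dict; reads that were never collapsed
    # may be absent from the table and are taken to represent only themselves.
    return cluster_sizes.get(read_id, 1)
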
def generate_taxon_summary(read2contig, contig2lineage, read_dict,
                           added_reads_dict, db_type,
                           cdhit_cluster_sizes_path, should_keep):
    # Return an array with
    # { taxid: , tax_level: , contig_counts: { 'contig_name': <count>, ... } }
    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)

    def new_summary():
        return defaultdict(lambda: defaultdict(lambda: [0, 0]))

    genus_summary = new_summary()
    species_summary = new_summary()

    def record_read(species_taxid, genus_taxid, contig, read_id):
        cluster_size = get_read_cluster_size(cdhit_cluster_sizes, read_id)

        def increment(counters):
            counters[0] += 1
            counters[1] += cluster_size

        increment(species_summary[species_taxid][contig])
        increment(genus_summary[genus_taxid][contig])

    for read_id, read_info in read_dict.items():
        contig = read2contig.get(read_id, '*')
        lineage = contig2lineage.get(contig)
        if contig != '*' and lineage:
            species_taxid, genus_taxid, _family_taxid = lineage
        else:
            # not mapping to a contig, or missing contig lineage
            species_taxid, genus_taxid = read_info[4:6]
            contig = '*'
        if should_keep((species_taxid, genus_taxid)):
            record_read(species_taxid, genus_taxid, contig, read_id)

    for read_id, read_info in added_reads_dict.items():
        contig = read2contig[read_id]
        species_taxid, genus_taxid, _family_taxid = contig2lineage[contig]
        if should_keep((species_taxid, genus_taxid)):
            record_read(species_taxid, genus_taxid, contig, read_id)

    # Filter out contigs that contain too few unique reads.
    # This used to happen in db_loader in idseq-web.  Any code left there that still appears to
    # do this filtering is effectively a no-op and the filtering cannot be done there because
    # the non-unique read counts are no longer output by the pipeline.
    for summary in [species_summary, genus_summary]:
        for taxid in list(summary.keys()):
            contig_counts = summary[taxid]
            for contig in list(contig_counts.keys()):
                unique_count, nonunique_count = contig_counts[contig]
                if unique_count < MIN_CONTIG_SIZE:
                    del contig_counts[contig]
                else:
                    contig_counts[contig] = (
                        nonunique_count
                        if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL
                        else unique_count)
            if not contig_counts:
                del summary[taxid]

    # construct the array for output
    output_array = []
    for idx, summary in enumerate([species_summary, genus_summary]):
        tax_level = idx + 1
        for taxid, contig_counts in summary.items():
            entry = {
                'taxid': taxid,
                'tax_level': tax_level,
                'count_type': db_type.upper(),
                'contig_counts': contig_counts
            }
            output_array.append(entry)

    return output_array
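
# For illustration only (taxids and counts invented): with db_type='nt', one
# element of the array returned above might look like
#
#     {
#         'taxid': '562',
#         'tax_level': 1,            # 1 = species summary, 2 = genus summary
#         'count_type': 'NT',
#         'contig_counts': {'contig_7': 12, '*': 3},
#     }
#
# where each per-contig value is the unique or cluster-expanded read count,
# depending on READ_COUNTING_MODE.
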
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, taxon_whitelist_path,
                                      taxon_blacklist_path,
                                      cdhit_cluster_sizes_path,
                                      output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)

    should_keep = build_should_keep_filter(deuterostome_path,
                                           taxon_whitelist_path,
                                           taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
         open(m8_file, 'r', encoding='utf-8') as m8_f, \
         open_file_db_by_extension(lineage_map_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        # Lines in m8_file and hit_level_file correspond (same read_id)
        hit_line = hit_f.readline()
        m8_line = m8_f.readline()
        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022
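        # 2.0**-1022 is the smallest positive normal double, so the log10 floor
        # applied below bottoms out at roughly -307.65.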

        with log.log_context("generate_taxon_count_json_from_m8",
                             {"substep": "loop_1"}):
            while hit_line and m8_line:
                # Retrieve data values from files
                hit_line_columns = hit_line.rstrip("\n").split("\t")
                read_id = hit_line_columns[0]
                hit_level = hit_line_columns[1]
                hit_taxid = hit_line_columns[2]
                if int(hit_level) < 0:  # Skip negative levels and continue
                    hit_line = hit_f.readline()
                    m8_line = m8_f.readline()
                    continue

                # m8 files correspond to BLAST tabular output format 6:
                # Columns: read_id | _ref_id | percent_identity | alignment_length...
                #
                # * read_id = query (e.g., gene) sequence id
                # * _ref_id = subject (e.g., reference genome) sequence id
                # * percent_identity = percentage of identical matches
                # * alignment_length = length of the alignments
                # * e_value = the expect value
                #
                # See:
                # * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
                # * http://www.metagenomics.wiki/tools/blast/evalue
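                #
                # For orientation, a made-up outfmt-6 record (12 tab-separated
                # columns, indexed 0..11):
                #
                #   read_1  NC_000913.3  98.50  150  2  0  1  150  1  150  1e-70  270
                #
                # The parsing below uses columns 0 (read_id), 2 (percent_identity),
                # 3 (alignment_length) and 10 (e_value).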

                m8_line_columns = m8_line.split("\t")
                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(m8_file),
                    os.path.basename(hit_level_file), m8_line_columns[0],
                    hit_line_columns[0])
                assert m8_line_columns[0] == hit_line_columns[0], msg
                percent_identity = float(m8_line_columns[2])
                alignment_length = float(m8_line_columns[3])
                e_value = float(m8_line_columns[10])

                # These have been filtered out before the creation of m8_f and hit_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # self-comparison fails only if e_value is NaN
                if e_value_type != 'log10':
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                hit_taxids_all_levels = lineage_map.get(
                    hit_taxid, lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            cdhit_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]
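
                    # Worked example (assuming the usual three-rank lineage of
                    # species, genus, family): a hit whose cleaned lineage is
                    # ('562', '561', '543') is counted once per ancestor as agg_key
                    # shrinks to ('561', '543'), then ('543',), then (), which ends
                    # the while loop above.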

                hit_line = hit_f.readline()
                m8_line = m8_f.readline()

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8",
                         {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_attributes.append({
                "tax_id": agg_key[0],
                "tax_level": tax_level,
                # 'species_taxid': agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid': agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid': agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid': agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid': agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid': agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid': agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid': agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                # "count" is the field consumed by the webapp
                "count": nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count": nonunique_count,
                "unique_count": unique_count,
                "dcr": nonunique_count / unique_count,
                "percent_identity": agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length": agg_bucket['sum_alignment_length'] / unique_count,
                "e_value": agg_bucket['sum_e_value'] / unique_count,
                "count_type": count_type
            })
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context("generate_taxon_count_json_from_m8", {
            "substep": "json_dump",
            "output_json_file": output_json_file
    }):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
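
# Illustrative only (every value invented): the JSON written above has the shape
#
#     {
#       "pipeline_output": {
#         "taxon_counts_attributes": [
#           {"tax_id": "562", "tax_level": 1,
#            "genus_taxid": "561", "family_taxid": "543",
#            "count": 42, "nonunique_count": 42, "unique_count": 17,
#            "dcr": 2.47, "percent_identity": 98.2, "alignment_length": 149.6,
#            "e_value": -70.3, "count_type": "NT"},
#           ...
#         ]
#       }
#     }
#
# "count" mirrors nonunique_count or unique_count depending on READ_COUNTING_MODE,
# and e_value is the mean of the per-read e-values on a log10 scale.
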
def count_reads(self):
    self.should_count_reads = True
    self.counts_dict[self.name] = reads_in_group(
        file_group=self.output_files_local()[0:2],
        cluster_sizes=load_cdhit_cluster_sizes(self.input_cluster_sizes_path()),
        cluster_key=lambda x: x)
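
# Note on cluster_key: in _count_reads_work above the caller chooses how a read id
# in the fasta maps to its entry in the cluster-sizes table; here the identity
# lambda presumably means the output read ids already match the table keys. This
# is an inference from the call sites shown, not from the reads_in_group source.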