Example 1
def _count_reads_expanding_duplicates(local_file_path, cluster_sizes,
                                      cluster_key):
    # See documentation for reads_in_group use case with cluster_sizes, below.
    unique_count, nonunique_count = 0, 0
    for read in fasta.iterator(local_file_path):
        # A read header looks something like
        #
        #    >M05295:357:000000000-CRPNR:1:1101:22051:10534 OPTIONAL RANDOM STUFF
        #     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        #
        # where the first character on the line is '>' and the read ID (underlined above)
        # extends from '>' to the first whitespace character, not including '>' itself.
        #
        # The fasta iterator already asserts that read.header[0] is '>'.
        #
        # As we proceed down along the pipeline, read IDs get annotated with taxonomic information,
        # changing the above into something like
        #
        #   >NT:ABC2433.1:NR:ABC5656.2:M05295:357:000000000-CRPNR:1:1101:22051:10534 OPTIONAL RANDOM STUFF"
        #    ^^^^^^^^^^^^^^^^^^^^^^^^^^
        #
        # The underlined annotation has to be stripped out by the cluster_key function,
        # so that we can use the original read ID to look up the cluster size.
        #
        read_id = read.header.split(None, 1)[0][1:]
        unique_count += 1
        nonunique_count += get_read_cluster_size(cluster_sizes,
                                                 cluster_key(read_id))
    return unique_count, nonunique_count
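
The comments above note that annotated read IDs must be reduced back to the original sequencer ID before the cluster-size lookup. A minimal sketch of what such a cluster_key callable could look like for the annotated format shown above (an illustration, not the pipeline's own implementation):

def example_cluster_key(read_id):
    # Hypothetical helper, for illustration only.  Assumes annotations, when
    # present, take the form "NT:<accession>:NR:<accession>:<original read ID>",
    # as described in the comment above; unannotated IDs pass through unchanged.
    parts = read_id.split(":")
    if len(parts) >= 5 and parts[0] == "NT" and parts[2] == "NR":
        return ":".join(parts[4:])
    return read_id

print(example_cluster_key(
    "NT:ABC2433.1:NR:ABC5656.2:M05295:357:000000000-CRPNR:1:1101:22051:10534"))
# -> M05295:357:000000000-CRPNR:1:1101:22051:10534
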
Example 2
    def run(self):
        input_fa_name = self.input_files_local[0][0]
        if len(self.input_files_local) > 1:
            nt_hit_summary_path = self.input_files_local[1][2]
            nr_hit_summary_path = self.input_files_local[2][2]
        else:
            # This is used in `short-read-mngs/experimental.wdl`
            nt_hit_summary_path = self.input_files_local[0][1]
            nr_hit_summary_path = self.input_files_local[0][2]

        # Open lineage db
        lineage_db = s3.fetch_reference(self.additional_files["lineage_db"],
                                        self.ref_dir_local,
                                        allow_s3mi=True)

        with open(nt_hit_summary_path) as nt_hit_summary_f, \
             open(nr_hit_summary_path) as nr_hit_summary_f:
            nr_hits_by_read_id = {
                row["read_id"]: (row["taxid"], row["level"])
                for row in HitSummaryMergedReader(nr_hit_summary_f)
            }
            nt_hits_by_read_id = {
                row["read_id"]: (row["taxid"], row["level"])
                for row in HitSummaryMergedReader(nt_hit_summary_f)
            }

        with open(self.output_files_local()[0], "w") as output_fa, \
             open_file_db_by_extension(lineage_db) as lineage_map:  # noqa
            for read in fasta.iterator(input_fa_name):
                # Example read_id: "NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109
                # :12720:8743/2"
                # Translate the read information into our custom format with fake
                # taxids at non-specific hit levels.
                # TODO: (tmorse) fasta parsing
                annotated_read_id = read.header.lstrip('>')
                read_id = annotated_read_id.split(":", 4)[-1]

                nr_taxid_species, nr_taxid_genus, nr_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    nr_hits_by_read_id, lineage_map, read_id)
                nt_taxid_species, nt_taxid_genus, nt_taxid_family = PipelineStepGenerateTaxidFasta.get_valid_lineage(
                    nt_hits_by_read_id, lineage_map, read_id)

                fields = [
                    "family_nr", nr_taxid_family, "family_nt", nt_taxid_family
                ]
                fields += [
                    "genus_nr", nr_taxid_genus, "genus_nt", nt_taxid_genus
                ]
                fields += [
                    "species_nr", nr_taxid_species, "species_nt",
                    nt_taxid_species
                ]
                fields += [annotated_read_id]
                new_read_name = ('>' + ':'.join(fields) + '\n')

                output_fa.write(new_read_name)
                output_fa.write(read.sequence + "\n")
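
To make the output format concrete, here is what the header construction above produces for one hypothetical read (the taxid values are made up for illustration; the real values come from get_valid_lineage, and the annotated read ID comes from the input fasta):

# Illustration only, with hypothetical taxid strings:
fields = ["family_nr", "5654", "family_nt", "5654"]
fields += ["genus_nr", "5690", "genus_nt", "5690"]
fields += ["species_nr", "5693", "species_nt", "5693"]
fields += ["NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109:12720:8743/2"]
print('>' + ':'.join(fields))
# >family_nr:5654:family_nt:5654:genus_nr:5690:genus_nt:5690:species_nr:5693:species_nt:5693:NR::NT:CP010376.2:NB501961:14:HM7TLBGX2:1:23109:12720:8743/2
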
    def generate_unidentified_fasta(self,
                                    input_fa,
                                    output_fa,
                                    clusters_dict=None,
                                    unique_output_fa=None):
        """
        Generates files with all unmapped reads. If COUNT_ALL, which was added
        in v4, then include non-unique reads extracted upstream by idseq-dedup.

        unique_output_fa exists primarily for counting. See count_reads above.
        """
        unique_output_file = open(unique_output_fa, "w") if clusters_dict else None
        with open(output_fa, "w") as output_file:
            for read in fasta.iterator(input_fa):
                if not read.header.startswith(UNMAPPED_HEADER_PREFIX):
                    continue

                output_file.write(read.header + "\n")
                output_file.write(read.sequence + "\n")
                if unique_output_file:
                    unique_output_file.write(read.header + "\n")
                    unique_output_file.write(read.sequence + "\n")

                if clusters_dict:
                    # get inner part of header like
                    # '>NR::NT::NB501961:14:HM7TLBGX2:4:23511:18703:20079/2'
                    line = read.header
                    header_suffix = ""
                    if line[-2:-1] == "/":  # /1 or /2
                        line, header_suffix = line[:-2], line[-2:]
                        assert header_suffix in ('/1', '/2')
                        assert len(read.header) == len(line) + len(header_suffix)

                    key = line.split(UNMAPPED_HEADER_PREFIX)[1]
                    other_keys = clusters_dict[key][1:]  # key should always be present
                    for other_key in other_keys:
                        other_header = UNMAPPED_HEADER_PREFIX + other_key + header_suffix
                        output_file.write(other_header + "\n")
                        output_file.write(read.sequence + "\n")  # write duplicate seq
def parse_clusters_file(
    cdhit_clusters_path: str,
    deduped_fasta_path: str,
) -> Dict[str, Optional[Tuple]]:
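    # The returned mapping has one entry per read in the deduped fasta, keyed by
    # read ID.  Each cluster representative's entry becomes a tuple whose first
    # element is the cluster size and whose remaining elements are the other read
    # IDs in that cluster, e.g. (hypothetical IDs, for illustration only):
    #
    #     {"read_A": (3, "read_B", "read_C"),   # cluster of three
    #      "read_D": (1,)}                      # singleton cluster
    #
    # generate_unidentified_fasta above relies on exactly this shape when it
    # re-emits duplicate reads for a cluster representative.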
    # First identify the cluster representative reads emitted by cd-hit-dup.  Originally we
    # used the ".clstr" output of cd-hit-dup for this, but it turns out that for unpaired
    # reads the actual deduped output of cdhit contains different representatives.
    clusters_dict: Dict[str, Optional[Tuple]] = {}
    for read in iterator(deduped_fasta_path):
        # A read header looks something like
        #
        #    >M05295:357:000000000-CRPNR:1:1101:22051:10534 OPTIONAL RANDOM STUFF
        #     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        #
        # where the first character on the line is '>' and the read ID (underlined above)
        # extends from '>' to the first whitespace character, not including '>' itself.
        #
        # The fasta iterator already asserts that read.header[0] is '>'.
        #
        read_id = read.header.split(None, 1)[0][1:]
        clusters_dict[read_id] = None  # not yet known

    def record_cluster_size(
        cluster_size: int,
        emitted_reads_from_cluster: set,
        other_reads_from_cluster: set,
        line_number: int,
    ):
        assert emitted_reads_from_cluster, f"""If this assertion fails,
        CD-HIT-DUP has forgotten to emit a read for this cluster.  In that case,
        just use the current read_id as cluster_representative.  Everything will
        work fine, aside from reduced sensitivity. {line_number}"""

        assert len(emitted_reads_from_cluster) == 1, f"""If this assertion
        fails, CD-HIT-DUP has emitted multiple reads from the same cluster.
        Feel free to comment out this assertion if that happens a lot in
        practice.  Everything will run fine, but read counts contributed by that
        cluster will be exaggerated.  If you want to fix that, make the cluster
        sizes a float --- divide the actual cluster size by the number of reads
        emitted for the cluster, i.e. by len(emitted_reads_from_cluster).
        Probably an even better way of fixing it would be to emit your own fasta
        based on the .clstr file if that's reliable, or use a tool other than
        cdhit that doesn't have this bug.  {line_number}:
        {emitted_reads_from_cluster}"""

        cluster_representative = emitted_reads_from_cluster.pop()

        assert cluster_representative in clusters_dict, "If this fails, it's a bug in this code."

        assert cluster_size - 1 == len(other_reads_from_cluster), """other_reads_from_cluster
        should contain every read in the cluster except the representative, i.e.
        cluster_size - 1 reads: {}, {}""".format(cluster_size, other_reads_from_cluster)

        clusters_dict[cluster_representative] = (cluster_size,) + tuple(other_reads_from_cluster)
        return

    with open(cdhit_clusters_path, "r") as clusters_file:
        # set of reads in both dedup1.fa and current cluster; cardinality 1!
        emitted_reads_from_cluster: Set[str] = set()
        other_reads_from_cluster: Set[str] = set()
        cluster_size = 0
        read_id = None
        line_number = 0
        for line in clusters_file:
            line_number += 1
            if line.startswith(">"):
                continue
            parts = line.strip().split()
            serial = int(parts[0])
            assert parts[2][0] == ">", line
            assert parts[2].endswith("..."), line
            if serial == 0 and cluster_size > 0:
                # We've just encountered the first read of a new cluster.  Emit
                # all data held for the old cluster.
                record_cluster_size(
                    cluster_size,
                    emitted_reads_from_cluster,
                    other_reads_from_cluster,
                    line_number,
                )
                emitted_reads_from_cluster = set()
                other_reads_from_cluster = set()
                cluster_size = 0
            assert cluster_size == serial, f"{line_number}: {cluster_size}, {serial}, {line}"
            read_id = parts[2][1:-3]
            cluster_size += 1
            if read_id in clusters_dict:
                emitted_reads_from_cluster.add(read_id)
            else:
                other_reads_from_cluster.add(read_id)
        # record last cluster
        if cluster_size > 0:
            record_cluster_size(
                cluster_size,
                emitted_reads_from_cluster,
                other_reads_from_cluster,
                line_number,
            )

    return clusters_dict
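
For reference, a cd-hit-dup ".clstr" file groups reads under ">Cluster N" header lines, one member per line, with a serial number that restarts at 0 in each cluster and the member's read ID embedded as ">read_id...". A minimal usage sketch with made-up reads and files, assuming the iterator helper used above is importable:

if __name__ == "__main__":
    # Hypothetical inputs, for illustration only; real files come from cd-hit-dup.
    with open("dedup1.fa", "w") as f:
        f.write(">read_A\nACGTACGT\n>read_C\nACCTACCT\n")
    with open("dedup1.fa.clstr", "w") as f:
        f.write(">Cluster 0\n"
                "0\t8nt, >read_A... *\n"
                "1\t8nt, >read_B... at 100.00%\n"
                ">Cluster 1\n"
                "0\t8nt, >read_C... *\n")
    print(parse_clusters_file("dedup1.fa.clstr", "dedup1.fa"))
    # Expected: {'read_A': (2, 'read_B'), 'read_C': (1,)}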