def align_to_haplotype(self, this_haplotype, haplotypes, prefix, suffix,
                       reads, contig, ref_start):
        """Realign reads and report them in one haplotype's coordinates.

        The reads are aligned against a graph built from every haplotype in
        `haplotypes`, while the sequence `prefix + this_haplotype + suffix`
        is installed as the aligner's reference. This makes it possible to
        treat any alternate allele as if it were the reference.

        Note: this method writes `read_size` and `force_alignment` through
        `self.config.aln_config` before handing it to the aligner.

        Args:
          this_haplotype: string. Sequence of the haplotype whose coordinates
            the resulting alignments are reported in.
          haplotypes: list of strings. Every haplotype to place in the graph;
            expected to include this_haplotype.
          prefix: string. Sequence to the left of where the haplotypes differ.
          suffix: string. Sequence to the right of where the haplotypes
            differ.
          reads: reads to align. The first read's `aligned_sequence` length
            is used as the aligner's read size.
          contig: string. Name of the 'reference' to report in the read
            alignments.
          ref_start: integer. Start position of the region to report in the
            read alignments; should mark the beginning of the prefix.

        Returns:
          An empty list when `reads` is empty; otherwise whatever the
          aligner's `realign_reads` returns for `reads`.
        """
        if not reads:
            return []

        aligner = fast_pass_aligner.FastPassAligner()

        # Alignment is forced here, and the read size is taken from the
        # first read only.
        options = self.config.aln_config
        options.read_size = len(reads[0].aligned_sequence)
        options.force_alignment = True
        aligner.set_options(options)

        chosen_reference = prefix + this_haplotype + suffix
        aligner.set_reference(chosen_reference)
        aligner.set_ref_start(contig, ref_start)

        # When the prefix and suffix run right up to the ref/alt variants the
        # alignment was found not to work well, so up to 100 bases on each
        # side of the variant are kept as part of the central region instead
        # of being counted as prefix/suffix. Smaller margins looked viable in
        # later testing, but 100 is the only value that has been through a
        # full training experiment.
        prefix_len = len(prefix)
        suffix_len = len(suffix)
        flank = min(prefix_len, suffix_len, 100)
        aligner.set_ref_prefix_len(prefix_len - flank)
        aligner.set_ref_suffix_len(suffix_len - flank)

        # Every haplotype is wrapped in the same flanking sequence.
        padded_haplotypes = []
        for allele in haplotypes:
            padded_haplotypes.append(prefix + allele + suffix)
        aligner.set_haplotypes(padded_haplotypes)

        return aligner.realign_reads(reads)
Exemple #2
0
    def call_fast_pass_aligner(self, assembled_region):
        """Realign an assembled region's reads with the fast pass aligner.

        The reference window spans the union of the region and its read
        span, widened by `_REF_ALIGN_MARGIN` on both sides and clamped to
        `[0, contig length]`. The reference is split into prefix, region,
        and suffix, and each haplotype of the region is wrapped in the same
        prefix and suffix before being handed to the aligner.

        Note: this method writes `read_size` and `force_alignment` on
        `self.config.aln_config` before passing it to the aligner.

        Args:
          assembled_region: object exposing `reads`, `region`, `read_span`
            and `haplotypes`. The first read's `aligned_sequence` length is
            used as the aligner's read size.

        Returns:
          An empty list when the region has no reads; the region's original
          reads when no reference suffix can be built (the window ends at or
          before the region's end); otherwise whatever the aligner's
          `realign_reads` returns for the region's reads.
        """
        if not assembled_region.reads:
            return []

        region = assembled_region.region
        span = assembled_region.read_span
        contig = region.reference_name

        # Window boundaries: widen by the margin, then clamp to the contig.
        window_start = max(
            0, min(span.start, region.start) - _REF_ALIGN_MARGIN)
        window_end = min(
            self.ref_reader.contig(contig).n_bases,
            max(span.end, region.end) + _REF_ALIGN_MARGIN)

        left_flank = self.ref_reader.query(
            ranges.make_range(contig, window_start, region.start))
        region_seq = self.ref_reader.query(region)

        # If we can't create the ref suffix then return the original
        # alignments.
        if window_end <= region.end:
            return assembled_region.reads

        right_flank = self.ref_reader.query(
            ranges.make_range(contig, region.end, window_end))
        window_seq = left_flank + region_seq + right_flank

        aligner = fast_pass_aligner.FastPassAligner()

        # Read sizes may vary; the first read's length is what the aligner
        # gets for its initialization and sanity checks.
        options = self.config.aln_config
        options.read_size = len(assembled_region.reads[0].aligned_sequence)
        options.force_alignment = False

        aligner.set_normalize_reads(self.config.normalize_reads)
        aligner.set_options(options)
        aligner.set_reference(window_seq)
        aligner.set_ref_start(contig, window_start)
        aligner.set_ref_prefix_len(len(left_flank))
        aligner.set_ref_suffix_len(len(right_flank))

        padded_haplotypes = []
        for allele in assembled_region.haplotypes:
            padded_haplotypes.append(left_flank + allele + right_flank)
        aligner.set_haplotypes(padded_haplotypes)

        return aligner.realign_reads(assembled_region.reads)