def test_long_cigar_parsing_no_context(self):
        """Parse a long read carrying one 1-bp insertion, without context.

        The CIGAR string ``110M1I174M`` is parsed against ``self.long_ref``
        with ``context`` disabled. The insertion is expected as ``111:1I`` in
        the first entry of the returned indels and ``"None"`` in the other
        two entries.
        """
        cigar = "110M1I174M"
        query = (
            "AATCCAGCTAGCTGTGCAGCATCTGACAAGCTCTATTCAACTGCAGTAATGCTACCTCGTA"
            "CTCACGCTTTCCAAGTGCTTGGCGTCGCATCTCGGTCCTTTGTACGCCGAAAAAATGGCCTG"
            "ACAACTAAGCTACGGCACGCTGCCATGTTGGGTCATAACGATATCTCTGGTTCATCCGTGAC"
            "CGAACATGTCATGGAGTAGCAGGAGCTATTAATTCGCGGAGGACAATGAGGTTCGTAGTCAC"
            "TGTCTTCCGCAATCGTACATCGCTCCTGCAGGTGGCCT")
        cutsite_window = 12
        context = False

        intBC, indels = alignment_utilities.parse_cigar(
            cigar,
            query,
            self.long_ref,
            0,
            0,
            self.long_barcode_interval,
            self.long_cutsites,
            cutsite_window,
            context,
            context_size=0,
        )

        self.assertEqual(intBC, "ATCTGACAAGCTCT")

        expected_cuts = ["111:1I", "None", "None"]

        # Check the length explicitly: looping over the returned indels alone
        # would pass vacuously if fewer (or no) indels came back.
        self.assertEqual(len(indels), len(expected_cuts))

        for observed, expected in zip(indels, expected_cuts):
            self.assertEqual(observed, expected)
    def test_basic_cigar_string_deletion(self):
        """A 2-bp deletion (``6M2D2M``) is reported as ``7:2D``."""
        cigar_string = "6M2D2M"
        read_sequence = "ACGGTTTT"
        window = 1
        use_context = False

        parsed = alignment_utilities.parse_cigar(
            cigar_string,
            read_sequence,
            self.basic_ref,
            0,  # reference begin
            0,  # query begin
            self.basic_barcode_interval,
            self.basic_cutsites,
            window,
            use_context,
        )
        barcode, called_indels = parsed

        self.assertEqual(barcode, "GG")
        # One indel entry is expected per cut site.
        self.assertEqual(len(self.basic_cutsites), len(called_indels))
        self.assertEqual(called_indels[0], "7:2D")
    def test_basic_cigar_string_match(self):
        """A pure-match CIGAR (``10M``) yields ``"None"`` as the first indel."""
        read_sequence = "ACGGTTAATT"
        cigar_string = "10M"
        window = 1
        use_context = False

        parsed = alignment_utilities.parse_cigar(
            cigar_string,
            read_sequence,
            self.basic_ref,
            0,  # reference begin
            0,  # query begin
            self.basic_barcode_interval,
            self.basic_cutsites,
            window,
            use_context,
        )
        barcode, called_indels = parsed

        self.assertEqual(barcode, "GG")
        # One indel entry is expected per cut site.
        self.assertEqual(len(self.basic_cutsites), len(called_indels))
        self.assertEqual(called_indels[0], "None")
    def test_basic_cigar_string_insertion_with_context(self):
        """A 3-bp insertion (``8M3I2M``) is reported with one base of context.

        With ``context`` enabled and ``context_size=1`` the first indel is
        expected to be ``A[9:3I]GTGT``.
        """
        cigar_string = "8M3I2M"
        read_sequence = "ACGGTTAAGTGTT"
        window = 1
        use_context = True

        parsed = alignment_utilities.parse_cigar(
            cigar_string,
            read_sequence,
            self.basic_ref,
            0,  # reference begin
            0,  # query begin
            self.basic_barcode_interval,
            self.basic_cutsites,
            window,
            use_context,
            context_size=1,
        )
        barcode, called_indels = parsed

        self.assertEqual(barcode, "GG")
        # One indel entry is expected per cut site.
        self.assertEqual(len(self.basic_cutsites), len(called_indels))
        self.assertEqual(called_indels[0], "A[9:3I]GTGT")
    def test_complex_cigar_parsing_intersite_deletion(self):
        """Parse a long read carrying a 13-bp and a 54-bp deletion.

        The CIGAR string ``98M13D55M54D130M`` is parsed against
        ``self.long_ref`` with five bases of context. The 13-bp deletion is
        expected in the first indel entry, and the 54-bp deletion in both
        the second and the third entries.
        """
        cigar = "98M13D55M54D130M"
        query = (
            "AATCCAGCTAGCTGTGCAGCTGACAGGGAAGCAAATTCAACTGCAGTAATGCTACCTCGT"
            "ACTCACGCTTTCCAAGTGCTTGGCGTCGCATCTCGGTCAAAAAGGCCTTACAACAAAGCTACTGA"
            "ACGAAGCAATGTTAGGACATAAAGATATCGGAGGACAATGATGTAAGTAGTCACTGTCTTCCTAA"
            "ATAGTCAAACTCTCCAGAACATTGAATAGAGGGCCCGAAAAAACCATCTCAAAAGCCTCTACACA"
            "GACTTCTAGAATACAAACAACTGATCTT")
        cutsite_window = 12
        context = True

        intBC, indels = alignment_utilities.parse_cigar(
            cigar,
            query,
            self.long_ref,
            0,
            0,
            self.long_barcode_interval,
            self.long_cutsites,
            cutsite_window,
            context,
            context_size=5,
        )

        self.assertEqual(intBC, "TGACAGGGAAGCAA")

        expected_cuts = [
            "CGGTC[99:13D]AAAAA",
            "GATAT[167:54D]CGGAG",
            "GATAT[167:54D]CGGAG",
        ]

        # Check the length explicitly: looping over the returned indels alone
        # would pass vacuously if fewer (or no) indels came back.
        self.assertEqual(len(indels), len(expected_cuts))

        for observed, expected in zip(indels, expected_cuts):
            self.assertEqual(observed, expected)
    def test_intersite_deletion_parsing(self):
        """Parse a long read with a single 108-bp deletion.

        The CIGAR string ``112M108D173M`` is parsed against ``self.long_ref``
        with five bases of context. The same deletion,
        ``CCGAA[113:108D]CGGCT``, is expected in all three indel entries.
        """
        cigar = "112M108D173M"
        query = (
            "AATCCAGCTAGCTGTGCAGCTTGTTTTAAACCAGATTCAACTGCAGTAATGCTACCTCGT"
            "ACTCACGCTTTCCAAGTGCTTGGCGTAGCATCTAGGTCCTAAGTACGCCGAACGGCTGACAATGC"
            "GGTTCGTAGTCACTGTCTACCGCAAACGTCAATCGCTCATCCAGGTGGCCAAGAGGGCACGTTTA"
            "CACACGCTGATCATCCTCGACTGTGCCCTCTAGTAGCCAGCCAGAGGTTGTGTGCCCCTCCCCCG"
            "GGCCGTCCGTGACCCTGGAAGGTGCCACTC")
        cutsite_window = 12
        context = True

        intBC, indels = alignment_utilities.parse_cigar(
            cigar,
            query,
            self.long_ref,
            0,
            0,
            self.long_barcode_interval,
            self.long_cutsites,
            cutsite_window,
            context,
            context_size=5,
        )

        self.assertEqual(intBC, "TTGTTTTAAACCAG")

        expected_cuts = [
            "CCGAA[113:108D]CGGCT",
            "CCGAA[113:108D]CGGCT",
            "CCGAA[113:108D]CGGCT",
        ]

        # Check the length explicitly: looping over the returned indels alone
        # would pass vacuously if fewer (or no) indels came back.
        self.assertEqual(len(indels), len(expected_cuts))

        for observed, expected in zip(indels, expected_cuts):
            self.assertEqual(observed, expected)
# Example #7
0
def call_alleles(
    alignments: pd.DataFrame,
    ref_filepath: Optional[str] = None,
    ref: Optional[str] = None,
    barcode_interval: Tuple[int, int] = (20, 34),
    cutsite_locations: List[int] = [112, 166, 220],
    cutsite_width: int = 12,
    context: bool = True,
    context_size: int = 5,
) -> pd.DataFrame:
    """Call indels from CIGAR strings.

    Given many alignments, we extract the indels by comparing the CIGAR strings
    of each alignment to the reference sequence. The input DataFrame is not
    modified; a new DataFrame is returned.

    Args:
        alignments: Alignments provided in DataFrame. The columns
            ``readName``, ``CIGAR``, ``Seq``, ``ReferenceBegin`` and
            ``QueryBegin`` are read from each row.
        ref_filepath: Filepath to the reference sequence (FASTA). Only the
            first record in the file is used.
        ref: Nucleotide sequence of the reference
        barcode_interval: Interval in reference corresponding to the integration
            barcode
        cutsite_locations: A list of all cutsite positions in the reference
        cutsite_width: Number of nucleotides left and right of cutsite location
            that indels can appear in.
        context: Include sequence context around indels
        context_size: Number of bases to the right and left to include as
            context

    Returns:
        A DataFrame mapping each sequence alignment to the called indels: the
        input alignments joined (on ``readName``) with one column per cutsite
        (``r1`` ... ``rN``), an ``allele`` column holding the concatenation of
        those indels, and an ``intBC`` column.

    Raises:
        PreprocessError: if both or neither of ``ref_filepath`` and ``ref``
            are provided.

    Warns:
        PreprocessWarning: if any cut-site column of the result contains an
            empty string.
    """
    # Exactly one of the two reference sources must be given.
    if (ref is None) == (ref_filepath is None):
        raise PreprocessError(
            "Either `ref_filepath` or `ref` must be provided.")

    alignment_to_indel = {}
    alignment_to_intBC = {}

    if ref_filepath:
        ref = str(list(SeqIO.parse(ref_filepath, "fasta"))[0].seq)

    for _, row in tqdm(
            alignments.iterrows(),
            total=alignments.shape[0],
            desc="Parsing CIGAR strings into indels",
    ):

        intBC, indels = alignment_utilities.parse_cigar(
            row.CIGAR,
            row.Seq,
            ref,
            row.ReferenceBegin,
            row.QueryBegin,
            barcode_interval,
            cutsite_locations,
            cutsite_width,
            context=context,
            context_size=context_size,
        )

        alignment_to_indel[row.readName] = indels
        alignment_to_intBC[row.readName] = intBC

    # One column per cutsite, named r1..rN, indexed by read name.
    indel_df = pd.DataFrame.from_dict(
        alignment_to_indel,
        orient="index",
        columns=[f"r{i}" for i in range(1,
                                        len(cutsite_locations) + 1)],
    )

    indel_df["allele"] = indel_df.apply(
        lambda x: "".join([str(i) for i in x.values]), axis=1)
    indel_df["intBC"] = indel_df.index.map(alignment_to_intBC)

    # Join on read name using the non-inplace forms of set_index/reset_index
    # so that the DataFrame passed in by the caller is left untouched (an
    # in-place set_index would move its `readName` column into the index).
    alignments = (
        alignments.set_index("readName").join(indel_df).reset_index())

    # check cut-sites and raise a warning if any missing data is detected
    cutsites = utilities.get_default_cut_site_columns(alignments)
    if np.any((alignments[cutsites] == "").sum(axis=0) > 0):
        warnings.warn(
            "Detected missing data in alleles. You might"
            " consider re-running align_sequences with a"
            " lower gap-open penalty, or using a separate"
            " alignment strategy.",
            PreprocessWarning,
        )

    return alignments