Beispiel #1
 def trinucleotid_runs(data):
     """
     Calculate the drop chance caused by trinucleotide microsatellites.

     Microsatellites are repeats of two to six units that have a higher chance to mutate than regular polymers.
     The drop chances of the individual rules are additive (see shouldDrop()).

     Each rule is a ``(rule_string, chance)`` pair; the rule strings are evaluated by shouldDrop(), which is
     defined elsewhere.

     NOTE(review): the ``data, [`` opener and the closing ``])`` of the call were missing in this copy and were
     restored to match the ``shouldDrop(data, [...])`` call shape -- confirm that no rules follow the (3,100) entry.

     :param data: The DNA sequence to check for microsatellites with length 3.
     :return: The drop chance based on the occurrence of microsatellites with length 3.
     """
     return shouldDrop(
         data, [
             ("microsatelliteLongerThanX(3,10)", 0.001),
             ("microsatelliteLongerThanX(3,15)", 0.002),
             ("microsatelliteLongerThanX(3,20)", 0.003),
             ("microsatelliteLongerThanX(3,25)", 0.004),
             ("microsatelliteLongerThanX(3,30)", 0.005),
             ("microsatelliteLongerThanX(3,35)", 0.006),
             ("microsatelliteLongerThanX(3,40)", 0.007),
             ("microsatelliteLongerThanX(3,45)", 0.008),
             ("microsatelliteLongerThanX(3,50)", 0.009),
             ("microsatelliteLongerThanX(3,55)", 0.010),
             ("microsatelliteLongerThanX(3,60)", 0.011),
             ("microsatelliteLongerThanX(3,65)", 0.012),
             ("microsatelliteLongerThanX(3,70)", 0.013),
             ("microsatelliteLongerThanX(3,75)", 0.014),
             ("microsatelliteLongerThanX(3,80)", 0.015),
             ("microsatelliteLongerThanX(3,85)", 0.016),
             ("microsatelliteLongerThanX(3,90)", 0.017),
             ("microsatelliteLongerThanX(3,95)", 0.018),
             ("microsatelliteLongerThanX(3,100)", 0.019),
         ])
Beispiel #2
 def ch_at_permutation(data, ch):
     """
     Calculate the drop chance based on the number of occurrences of a nucleotide.

     Intended for A and T, since their chance to mutate is generally lower. The individual chances are additive.

     One rule is generated per count threshold (20, 40, ..., 200), each with a chance of 0.001. The rule strings
     are evaluated by shouldDrop(), which is defined elsewhere.

     NOTE(review): the ``data, [`` opener and the closing ``])`` of the call were missing in this copy and were
     restored to match the ``shouldDrop(data, [...])`` call shape -- confirm that no thresholds above 200 exist.

     :param data: The sequence to check for the number of occurrences of a nucleotide.
     :param ch: The nucleotide to check for (a string; it is concatenated into the rule text).
     :return: The drop chance based on the number of occurrences of the given nucleotide.
     """
     preamble = "charCountBiggerEqualThanX("
     # Thresholds run from 20 to 200 inclusive in steps of 20; every rule carries the same chance.
     rules = [(preamble + ch + "," + str(threshold) + ")", 0.001) for threshold in range(20, 201, 20)]
     return shouldDrop(data, rules)
Beispiel #3
 def random_permutations(data):
     """
     Calculate the drop chance for simulated random mutations in the DNA data.

     A single wildcard rule ``"*"`` with a chance of 0.02 is handed to shouldDrop(), which is defined elsewhere.

     :param data: The sequence to simulate random mutations for.
     :return: The drop chance based on random mutations.
     """
     return shouldDrop(data, [("*", 0.02)])
Beispiel #4
 def illegal_symbols(data):
     """
     Check the DNA data for illegal symbols.

     A single rule, ``strContainsIllegalChars(ACGT)`` with a chance of 1.0, is handed to shouldDrop(), which is
     defined elsewhere.

     :param data: The sequence to check for illegal symbols.
     :return: The drop chance based on the occurrence of illegal symbols (0.0 or 1.0).
     """
     return shouldDrop(data, [("strContainsIllegalChars(ACGT)", 1.0)])
Beispiel #5
    def motif_regex_search(data):
        """
        Compute the drop chance for sequence motifs that are matched as patterns.

        Each rule is a ``(rule_string, chance)`` pair handed to shouldDrop(), which is defined elsewhere. The
        motifs contain letters beyond A/C/G/T (N, Y, K, R, W) -- presumably IUPAC ambiguity codes that
        ``strContainsSubRegex`` expands; verify against its implementation.

        NOTE(review): the ``data, [`` opener and the closing ``])`` of the call were missing in this copy and were
        restored to match the ``shouldDrop(data, [...])`` call shape.

        :param data: The sequence that is passed to shouldDrop() together with the motif rules.
        :return: The result of shouldDrop() for the rules listed below.
        """
        return shouldDrop(
            data, [
                # Promoter recognition motif (Euk).
                ("strContainsSubRegex(CANYYY)", 0.01),
                ("strContainsSubRegex(ANCCAATCA)", 0.01),
                ("strContainsSubRegex(KGGGCGGRRY)", 0.01),
                ("strContainsSubRegex(KRGGCGKRRY)", 0.01),
                # Promoter recognition motifs (Prok).
                ("strContainsSubRegex(AAAWWTWTTTTNNNAAA)", 0.05),
                # Ribosomal binding site (Euk).
                ("strContainsSubRegex(RCCACCATGG)", 0.05),
                # Ribosomal binding site (Prok).
                ("strContainsSubRegex(AGGAGGACAGCTAUG)", 0.05),
                # Lox sites.
                # TODO(review): no Lox-site rules are present under this heading in this copy; confirm whether
                # entries are missing and restore them if so.
            ])
Beispiel #6
    def motif_search(data):

        :param data:
        return shouldDrop(
                # Promoter recognition motif (Euk).
                ("strContainsSub(TATAAA)", 0.01),
                # Promoter recognition motifs (Prok).
                ("strContainsSub(TTGACA)", 0.05),
                ("strContainsSub(TGTATAATG)", 0.05),
                # Polyadenylation signals (Euk).
                ("strContainsSub(AATAAA)", 0.01),
                ("strContainsSub(TTGTGTGTTG)", 0.01),
                # Lox sites.
                ("strContainsSub(ATAACTTCGTATAGCATACATTATACGAAGTTAT)", 1.01),
                ("strContainsSub(ATAACTTCGTATAGCATACATTATACGAACGGTA)", 1.01),
                ("strContainsSub(TACCGTTCGTATAGCATACATTATACGAAGTTAT)", 1.01),
                ("strContainsSub(TACCGTTCGTATAGCATACATTATACGAACGGTA)", 1.01),
                ("strContainsSub(TACCGTTCGTATATGGTATTATATACGAAGTTAT)", 1.01),
                ("strContainsSub(TACCGTTCGTATATTCTATCTTATACGAAGTTAT)", 1.01),
                ("strContainsSub(TACCGTTCGTATAGGATACTTTATACGAAGTTAT)", 1.01),
                ("strContainsSub(TACCGTTCGTATATACTATACTATACGAAGTTAT)", 1.01),
                ("strContainsSub(TACCGTTCGTATACTATAGCCTATACGAAGTTAT)", 1.01),
                ("strContainsSub(ATAACTTCGTATATGGTATTATATACGAACGGTA)", 1.01),
                ("strContainsSub(ATAACTTCGTATAGTATACCTTATACGAAGTTAT)", 1.01),
                # Lox site spacers not covered by the Lox sites.
                ("strContainsSub(AGGTATGC)", 1.01),
                ("strContainsSub(TTGTATGG)", 1.01),
                ("strContainsSub(GGATAGTA)", 1.01),
                ("strContainsSub(GTGTATTT)", 1.01),
                ("strContainsSub(GGTTACGG)", 1.01),
                ("strContainsSub(TTTTAGGT)", 1.01),
                ("strContainsSub(GTACACAT)", 1.01),
                # Restriction enzyme recognition motifs.
                # BpiI
                ("strContainsSub(GAAGAC)", 1.01),
                # inverse BpiI
                ("strContainsSub(CTTCTG)", 1.01),
                # BsaI
                ("strContainsSub(GGTCTC)", 1.01),
                # inverse BsaI
                ("strContainsSub(CCAGAG)", 1.01),
                ("strContainsSub(CGTCTC)", 0.01),
                ("strContainsSub(GCGATG)", 0.01),
                ("strContainsSub(GCTCTTC)", 0.01),
                # Oligo Adapters.
                ("strContainsSub(CTCGTAGACTGCGTACCA)", 0.01),
                ("strContainsSub(GACGATGAGTCCTGAGTA)", 0.01),
                # 5' extensions.
                ("strContainsSub(GGTTCCACGTAAGCTTCC)", 0.01),
                ("strContainsSub(GCGATTACCCTGTACACC)", 0.01),
                ("strContainsSub(GCCAGTACATCAATTGCC)", 0.01),
                # Twister Adapters:
                ("strContainsSub(GAAGTGCCATTCCGCCTGACCT)", 1.0
                 ),  # Twister 5' Adapter
                 1.0),  # Twister 3' Adapter