def trinucleotid_runs(data):
    """
    Calculates the dropchance contributed by trinucleotide microsatellites.

    Microsatellites are repeats of two to six units that have a higher chance to mutate than regular polymers.
    One rule is generated per length threshold: 10, 15, ..., 100. The first threshold carries a chance of 0.001
    and every following threshold carries 0.001 more than the previous one, up to 0.019 for a threshold of 100.
    The rules are evaluated by shouldDrop(), which treats the dropchances as additive.

    :param data: The DNA sequence to check for microsatellites with unit length 3.
    :return: The dropchance based on the occurrence of microsatellites with unit length 3.
    """
    unit_length = 3
    # Integer division by 1000 is correctly rounded, so rank / 1000 yields exactly
    # the same float as the literals 0.001, 0.002, ..., 0.019.
    rules = [
        ("microsatelliteLongerThanX({},{})".format(unit_length, min_length), rank / 1000)
        for rank, min_length in enumerate(range(10, 101, 5), start=1)
    ]
    return shouldDrop(data, rules)
def ch_at_permutation(data, ch):
    """
    Calculates the dropchance based on how often a single nucleotide occurs in the sequence.

    Intended for A and T, whose chance to mutate is generally lower. One rule is generated per count
    threshold (20, 40, ..., 200) and each rule carries a dropchance of 0.001. The rules are evaluated by
    shouldDrop(), which treats these errors as additive.

    :param data: The sequence to check for the number of occurrences of a nucleotide.
    :param ch: The nucleotide to check for.
    :return: The dropchance based on the number of occurrences of the given nucleotide.
    """
    chance_per_threshold = 0.001
    # Plain concatenation (rather than formatting) keeps the requirement that ch is a string.
    rule_head = "charCountBiggerEqualThanX(" + ch + ","
    rules = [
        (rule_head + str(min_count) + ")", chance_per_threshold)
        for min_count in range(20, 201, 20)
    ]
    return shouldDrop(data, rules)
def random_permutations(data):
    """
    Calculates the dropchance for simulated random mutations in the DNA data.

    A single rule "*" with a dropchance of 0.02 is handed to shouldDrop().
    NOTE(review): "*" presumably matches every sequence - confirm against the rule evaluation in shouldDrop().

    :param data: The sequence to simulate random mutations for.
    :return: The dropchance based on random mutations.
    """
    baseline_rules = [("*", 0.02)]
    return shouldDrop(data, baseline_rules)
def illegal_symbols(data):
    """
    Checks the DNA data for illegal symbols.

    The only rule passed to shouldDrop() is "strContainsIllegalChars(ACGT)" with a dropchance of 1.0, so a
    sequence containing illegal symbols is reported with a dropchance of 1.0.

    :param data: The sequence to check for illegal symbols.
    :return: The dropchance based on the occurrence of illegal symbols (0.0 or 1.0).
    """
    illegal_symbol_rule = ("strContainsIllegalChars(ACGT)", 1.0)
    return shouldDrop(data, [illegal_symbol_rule])
def motif_regex_search(data):
    """
    Calculates the dropchance based on undesired sequence motifs that are given as patterns.

    Every pattern below is wrapped into a "strContainsSubRegex(...)" rule and passed, together with its
    dropchance, to shouldDrop(). The order of the rules is the order of the table.
    NOTE(review): the letters N, Y, K, R and W look like IUPAC ambiguity codes that the rule evaluation
    expands - confirm against the implementation of strContainsSubRegex.

    :param data: The DNA sequence to search for the motifs.
    :return: The dropchance returned by shouldDrop() for the motif rules.
    """
    weighted_patterns = (
        # Promoter recognition motif (Euk).
        ("CANYYY", 0.01),
        ("ANCCAATCA", 0.01),
        ("KGGGCGGRRY", 0.01),
        ("KRGGCGKRRY", 0.01),
        # Promoter recognition motifs (Prok).
        ("AAAWWTWTTTTNNNAAA", 0.05),
        # Ribosomal binding site (Euk).
        ("RCCACCATGG", 0.05),
        # Ribosomal binding site (Prok).
        # NOTE(review): this pattern contains 'U' although the data is DNA - confirm that this is intended.
        ("AGGAGGACAGCTAUG", 0.05),
        # Lox sites.
        ("ATAACTTCGTATAGTAYACATTATACGAAGTTAT", 0.01),
    )
    rules = [
        ("strContainsSubRegex(" + pattern + ")", chance)
        for pattern, chance in weighted_patterns
    ]
    return shouldDrop(data, rules)
def motif_search(data):
    """
    Calculates the dropchance based on undesired sequence motifs that are given as literal subsequences.

    Every motif below is wrapped into a "strContainsSub(...)" rule and passed, together with its
    dropchance, to shouldDrop(). The order of the rules is the order of the table.
    NOTE(review): several motifs carry a chance of 1.01, i.e. more than 1.0, while the Twister adapters use
    1.0 - presumably 1.01 is meant to force a drop; confirm against shouldDrop().

    :param data: The DNA sequence to search for the motifs.
    :return: The dropchance returned by shouldDrop() for the motif rules.
    """
    weighted_motifs = (
        # Promoter recognition motif (Euk).
        ("TATAAA", 0.01),
        # Promoter recognition motifs (Prok).
        ("TTGACA", 0.05),
        ("TGTATAATG", 0.05),
        # Polyadenylation signals (Euk).
        ("AATAAA", 0.01),
        ("TTGTGTGTTG", 0.01),
        # Lox sites.
        ("ATAACTTCGTATAGCATACATTATACGAAGTTAT", 1.01),
        ("ATAACTTCGTATAGCATACATTATACGAACGGTA", 1.01),
        ("TACCGTTCGTATAGCATACATTATACGAAGTTAT", 1.01),
        ("TACCGTTCGTATAGCATACATTATACGAACGGTA", 1.01),
        ("TACCGTTCGTATATGGTATTATATACGAAGTTAT", 1.01),
        ("TACCGTTCGTATATTCTATCTTATACGAAGTTAT", 1.01),
        ("TACCGTTCGTATAGGATACTTTATACGAAGTTAT", 1.01),
        ("TACCGTTCGTATATACTATACTATACGAAGTTAT", 1.01),
        ("TACCGTTCGTATACTATAGCCTATACGAAGTTAT", 1.01),
        ("ATAACTTCGTATATGGTATTATATACGAACGGTA", 1.01),
        ("ATAACTTCGTATAGTATACCTTATACGAAGTTAT", 1.01),
        # Lox site spacers not covered by the Lox sites.
        ("AGGTATGC", 1.01),
        ("TTGTATGG", 1.01),
        ("GGATAGTA", 1.01),
        ("GTGTATTT", 1.01),
        ("GGTTACGG", 1.01),
        ("TTTTAGGT", 1.01),
        ("GTACACAT", 1.01),
        # Restriction enzyme recognition motifs.
        # BpiI
        ("GAAGAC", 1.01),
        # inverse BpiI
        ("CTTCTG", 1.01),
        # BsaI
        ("GGTCTC", 1.01),
        # inverse BsaI
        ("CCAGAG", 1.01),
        ("CGTCTC", 0.01),
        ("GCGATG", 0.01),
        ("GCTCTTC", 0.01),
        # Oligo Adapters.
        ("CTCGTAGACTGCGTACCA", 0.01),
        ("GACGATGAGTCCTGAGTA", 0.01),
        # 5' extensions.
        ("GGTTCCACGTAAGCTTCC", 0.01),
        ("GCGATTACCCTGTACACC", 0.01),
        ("GCCAGTACATCAATTGCC", 0.01),
        # Twister Adapters:
        ("GAAGTGCCATTCCGCCTGACCT", 1.0),  # Twister 5' Adapter
        ("AGGCTAGGTGGAGGCTCAGTG", 1.0),  # Twister 3' Adapter
    )
    rules = [
        ("strContainsSub(" + motif + ")", chance)
        for motif, chance in weighted_motifs
    ]
    return shouldDrop(data, rules)