def CyclicWithRC():
    """Generate a cyclic superstring covering every read or its reverse complement.

    Reads k-mers (one per line) from the rosalind_gasm input file, walks the
    graph returned by ``p.DeBruijnRC``, merges the visited nodes with
    ``p.Combine`` and writes the result to ``rosalind_gasm_output.txt`` after
    removing the longest stretch where the end of the string repeats its
    beginning.  If no such stretch exists the merged string is written as is.

    The start node is picked with ``random.choice``, so the output may differ
    between runs.
    """
    kmers = f.LoadFile('\\rosalind_gasm.txt').splitlines()
    Graph = p.DeBruijnRC(kmers)

    # Walk from a random start node, always following the first successor.
    cycle = []
    node = random.choice(list(Graph))
    for _ in range(len(Graph) // 2):
        cycle.append(node)
        successor = Graph[node][0]
        if successor in Graph:
            node = successor
        else:
            # The successor is not a key of the graph; continue from the
            # node MaxOverlap selects for it instead.
            node = MaxOverlap(successor, Graph)

    # Merge the visited nodes into one string based on their overlaps.
    superstring = cycle[0]
    for piece in cycle[1:]:
        superstring = p.Combine(superstring, piece)

    # Trim the wrap-around overlap: try the longest candidate first and stop
    # at the first match so the file is written exactly once.
    k = len(superstring)
    trimmed = superstring
    for i in range(k - 1, 0, -1):
        if superstring[:i] == superstring[k - i:]:
            trimmed = superstring[:k - i]
            break

    f.ExportToFile('rosalind_gasm_output.txt', trimmed)
    return
def TT():
    """Write the transition/transversion ratio of two FASTA sequences.

    The first two sequences parsed from the rosalind_tran input file are
    compared position by position over the length of the first one.
    Transitions:   A<->G, C<->T
    Transversions: every other mismatch
    The ratio ``transitions / transversions`` is written to
    ``rosalind_tran_output.txt``.
    """
    raw = f.LoadFile('\\rosalind_tran.txt')
    [Labels, DNA] = f.FASTA(raw)
    first, second = DNA[0], DNA[1]

    transition = 0
    transversion = 0
    for pos, base in enumerate(first):
        other = second[pos]
        if base == other:
            continue
        both_purines = (base in 'AG') and (other in 'AG')
        both_pyrimidines = (base in 'CT') and (other in 'CT')
        if both_purines or both_pyrimidines:
            transition += 1
        else:
            transversion += 1

    f.ExportToFile('rosalind_tran_output.txt', str(transition / transversion))
    return
def Lex():
    """Write all strings of length n over an ordered alphabet.

    The first line of the rosalind_lexf input file holds the symbols
    (whitespace separated, in alphabet order), the second line the length.
    Strings are written one per line to ``rosalind_lexf_output.txt`` in
    lexicographic order *with respect to the order in which the symbols
    were given*.
    """
    from itertools import product

    lines = f.LoadFile('\\rosalind_lexf.txt').splitlines()
    sym = lines[0].split()
    r = int(lines[1])

    # product() varies the last position fastest, which is exactly
    # lexicographic order under the given symbol order for every position.
    lex_list = [''.join(letters) for letters in product(sym, repeat=r)]

    f.ExportToFile('rosalind_lexf_output.txt', '\n'.join(lex_list))
    return
def UnrootedTree():
    """Write the number of internal nodes of an unrooted binary tree.

    Reads the leaf count n from the rosalind_inod input file and writes
    ``n - 2`` to ``rosalind_inod_output.txt``.
    """
    leaves = int(f.LoadFile('\\rosalind_inod.txt'))
    internal_nodes = leaves - 2
    f.ExportToFile('rosalind_inod_output.txt', str(internal_nodes))
    return
def Founder():
    """Write a matrix of log10 values of ``DriftToNone`` per generation.

    Input (rosalind_foun): first line ``N m``, second line the array A.
    Output row i (1 <= i <= m) holds ``log10(DriftToNone(N, i, a))`` for each
    entry a of A, space separated; rows are newline separated.
    # NOTE(review): DriftToNone presumably returns the probability that no
    # recessive copies remain after i generations -- confirm in its definition.
    """
    lines = f.LoadFile('\\rosalind_foun.txt').splitlines()
    [N, m] = [int(token) for token in lines[0].split()]
    A = [int(token) for token in lines[1].split()]

    rows = []
    for generation in range(1, m + 1):
        cells = [str(log10(DriftToNone(N, generation, copies))) for copies in A]
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_foun_output.txt', '\n'.join(rows))
    return
def DeBruijnRC():
    """Write the adjacency list built from reads and their reverse complements.

    Every line of the rosalind_dbru input file is a read.  The reads and
    their ``ReverseComplement`` are pooled and passed through
    ``RemoveDuplicates``; each pooled string contributes one edge from the
    string without its last character to the string without its first
    character.  Edges are written as ``(prefix, suffix)``, one per line, to
    ``rosalind_dbru_output.txt``.
    """
    S = f.LoadFile('\\rosalind_dbru.txt').splitlines()

    # Pool: reverse complements first, then the reads themselves.
    SuRC = [ReverseComplement(read) for read in S]
    SuRC.extend(S)
    SuRC = RemoveDuplicates(SuRC)

    # Group suffixes under their prefix in a single pass; dict insertion
    # order keeps prefixes in order of first appearance.
    adj_dict = {}
    for kmer in SuRC:
        adj_dict.setdefault(kmer[:-1], []).append(kmer[1:])

    output = [
        '(%s, %s)' % (prefix, suffix)
        for prefix, suffixes in adj_dict.items()
        for suffix in suffixes
    ]
    f.ExportToFile('rosalind_dbru_output.txt', '\n'.join(output))
    return
def OverlapGraph():
    """Write the overlap graph (k = 3) of FASTA records as label pairs.

    For every ordered pair of records whose sequences differ, an edge
    ``label_s label_t`` is emitted when the last three characters of s equal
    the first three characters of t.  Edges are written one per line to
    ``rosalind_grph_output.txt``, ordered by source record and then by
    target record.

    Records are tracked by position rather than by sequence text, so two
    records sharing the same sequence each keep their own label.
    """
    raw = f.LoadFile('\\rosalind_grph.txt')
    [Labels, DNA] = f.FASTA(raw)

    output = []
    for src, tail in enumerate(DNA):
        suffix = tail[-3:]
        for dst, head in enumerate(DNA):
            # Identical sequences are never linked (this also rules out
            # self-loops).
            if suffix == head[:3] and tail != head:
                output.append(' '.join([Labels[src], Labels[dst]]))

    f.ExportToFile('rosalind_grph_output.txt', '\n'.join(output))
    return
def CompareSpectra():
    """Write the largest multiplicity of the spectral difference and its shift.

    The rosalind_conv input file holds two spectra, one per line.  All
    pairwise differences ``a - b`` (a from the first line, b from the second)
    are rounded to 5 decimals and tallied.  The output file
    ``rosalind_conv_output.txt`` receives:
      1. the count of the most frequent difference
      2. the absolute value of that difference

    Raises:
        ValueError: if either spectrum is empty.
    """
    from collections import Counter

    lines = f.LoadFile('\\rosalind_conv.txt').splitlines()
    temp_spec = [float(x) for x in lines[0].split()]
    test_spec = [float(x) for x in lines[1].split()]

    # Tally every pairwise difference in one pass.
    tally = Counter(round(a - b, 5) for a in temp_spec for b in test_spec)
    if not tally:
        raise ValueError('both spectra must contain at least one mass')

    mode, count = tally.most_common(1)[0]
    f.ExportToFile('rosalind_conv_output.txt',
                   '\n'.join([str(count), str(abs(mode))]))
    return
def Drift():
    """Write the probability of seeing at least k recessive copies after g generations.

    Input (rosalind_wfmd, whitespace separated): population size, initial
    number m of dominant allele copies, number of generations g, and k.
    The population size is doubled to get the number N of chromosomes.

    ``curr_gen[c]`` is the probability of having c dominant copies; each
    generation is advanced with the binomial transition
    ``nCr(N, i) * (j/N)**i * (1 - j/N)**(N - i)`` from j copies to i copies.
    "At least k recessive copies" means at most ``N - k`` dominant copies, so
    the result is the sum of ``curr_gen[0 .. N-k]``; it is written to
    ``rosalind_wfmd_output.txt``.
    """
    fields = f.LoadFile('\\rosalind_wfmd.txt').split()
    N = int(fields[0]) * 2
    m = int(fields[1])  # initial number of dominant copies
    g = int(fields[2])  # number of generations
    k = int(fields[3])  # minimum number of recessive copies

    # Generation 0: exactly m dominant copies with probability 1.
    curr_gen = [0] * (N + 1)
    curr_gen[m] = 1

    for _ in range(g):
        next_gen = [0] * (N + 1)
        for i in range(N + 1):  # copies in the next generation
            ways = nCr(N, i)  # independent of j, so computed once per i
            for j in range(N + 1):  # copies in the current generation
                temp_term = ways * (j / N)**i * (1 - (j / N))**(N - i)
                next_gen[i] += temp_term * curr_gen[j]
        curr_gen = next_gen

    # An explicit upper bound avoids the empty slice that [:-k] gives for
    # k == 0; the clamp keeps k > N + 1 an empty sum.
    upper = max(N - k + 1, 0)
    prob = str(sum(curr_gen[:upper]))
    f.ExportToFile('rosalind_wfmd_output.txt', prob)
    return
def Subsets():
    """Write the number of subsets of {1..n} modulo 1000000.

    Reads the positive integer n from the rosalind_sset input file and
    writes ``2**n mod 1000000`` to ``rosalind_sset_output.txt``.
    """
    n = int(f.LoadFile('\\rosalind_sset.txt'))
    # Three-argument pow reduces while exponentiating instead of first
    # building the full 2**n integer.
    P = pow(2, n, 1000000)
    f.ExportToFile('rosalind_sset_output.txt', str(P))
    return
def MatchSpectrum():
    """Write the best spectrum match among n protein strings.

    Input (rosalind_prsm):
      line 0        : positive integer n
      lines 1..n    : n protein strings
      lines n+1..   : the multiset of masses (one per line) of an unknown
                      protein's complete spectrum
    Each protein is converted with ``GetMasses`` and scored against the
    spectrum with ``CompareSpectra``; the highest score and the protein that
    achieved it are written to ``rosalind_prsm_output.txt``.
    # NOTE(review): assumes the two-argument CompareSpectra in scope returns
    # the multiplicity as a comparable number -- confirm in its definition.
    """
    lines = f.LoadFile('\\rosalind_prsm.txt').splitlines()
    n = int(lines[0])
    proteins = lines[1:n + 1]
    # The spectrum starts right after the last protein (index n + 1);
    # blank lines are skipped.
    spectrum = [float(x) for x in lines[n + 1:] if x.strip()]

    masses = [GetMasses(protein) for protein in proteins]
    modes = [CompareSpectra(m, spectrum) for m in masses]

    max_mode = max(modes)
    max_protein = proteins[modes.index(max_mode)]
    f.ExportToFile('rosalind_prsm_output.txt',
                   '\n'.join([str(max_mode), max_protein]))
    return
def DistanceMatrix():
    """Write the pairwise distance matrix of FASTA sequences.

    Entry (i, j) is ``HammingDistance(DNA[i], DNA[j])`` divided by the length
    of the first sequence.  Rows are space separated, one per line, in
    ``rosalind_pdst_output.txt``.
    # NOTE(review): relies on a two-argument HammingDistance that returns the
    # distance -- confirm which helper is in scope.
    """
    raw = f.LoadFile('\\rosalind_pdst.txt')
    [Label, DNA] = f.FASTA(raw)

    rows = []
    for seq_a in DNA:
        cells = [
            str(HammingDistance(seq_a, seq_b) / len(DNA[0]))
            for seq_b in DNA
        ]
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_pdst_output.txt', '\n'.join(rows))
    return
def NewickDistanceWeights():
    """Write the distance between a pair of nodes for each Newick tree.

    Lines of the rosalind_nkew input file containing ``;`` are trees; every
    other non-empty line is a whitespace-separated pair of node names.  The
    i-th tree is paired with the i-th pair, and the distances are written
    space separated to ``rosalind_nkew_output.txt``.
    """
    lines = f.LoadFile('\\rosalind_nkew.txt').splitlines()

    Trees = [line for line in lines if ';' in line]
    Pairs = [line.split() for line in lines if ';' not in line and line != '']

    distances = []
    for idx, newick in enumerate(Trees):
        tree = Phylo.read(io.StringIO(newick), 'newick')
        # If the trees carry no edge weights, set branch_length = 1 on every
        # clade from tree.find_clades() before measuring.
        start, end = Pairs[idx][0], Pairs[idx][1]
        distances.append(str(tree.distance(start, end)))

    f.ExportToFile('rosalind_nkew_output.txt', ' '.join(distances))
    return
def Sets():
    """Write six set-operation results for two subsets A and B of {1..n}.

    Input (rosalind_seto): line 0 is n, lines 1 and 2 are the sets written
    as ``{a, b, ...}``.  Output lines, each formatted ``{a, b, ...}``:
      1. A U B
      2. A intersection B
      3. A - B
      4. B - A
      5. complement of A in {1..n}
      6. complement of B in {1..n}
    An empty set ``{}`` is parsed as having no elements.
    """
    def _parse(line):
        # Strip the braces, split on commas and drop empty pieces so that
        # '{}' yields [] rather than [''].
        body = line.replace('{', '').replace('}', '')
        return [item.strip() for item in body.split(',') if item.strip()]

    lines = f.LoadFile('\\rosalind_seto.txt').splitlines()
    n = int(lines[0])
    A = _parse(lines[1])
    B = _parse(lines[2])

    # Hash sets give O(1) membership tests; the lists keep the output order.
    in_A = set(A)
    in_B = set(B)

    AB_union = RemoveDuplicates(A + B)
    AB_intersect = [i for i in A if i in in_B]
    AB_diff = [i for i in A if i not in in_B]
    BA_diff = [i for i in B if i not in in_A]
    U = [str(i) for i in range(1, n + 1)]
    A_comp = [i for i in U if i not in in_A]
    B_comp = [i for i in U if i not in in_B]

    results = [AB_union, AB_intersect, AB_diff, BA_diff, A_comp, B_comp]
    output = ['{%s}\n' % ', '.join(members) for members in results]
    f.ExportToFile('rosalind_seto_output.txt', ''.join(output))
    return
def ErrorCorrection():
    """Write ``wrong->corrected`` lines for the erroneous reads of a FASTA file.

    A read counts as correct when ``Freq(read, DNA)`` exceeds 1.  The correct
    reads plus their ``ReverseComplement`` (deduplicated via
    ``RemoveDuplicates``) form the reference set; every read not in that set
    is paired with the reference chosen by ``MinimumDistance`` and written to
    ``rosalind_corr_output.txt``.
    """
    raw = f.LoadFile('\\rosalind_corr.txt')
    [Labels, DNA] = f.FASTA(raw)

    # Reads that occur more than once according to Freq.
    repeated = [read for read in DNA if Freq(read, DNA) > 1]

    # Reference set: each repeated read followed by its reverse complement.
    with_complements = []
    for read in repeated:
        with_complements.append(read)
        with_complements.append(ReverseComplement(read))
    correct_DNA = RemoveDuplicates(with_complements)

    output = [
        '%s->%s' % (read, MinimumDistance(read, correct_DNA))
        for read in DNA
        if read not in correct_DNA
    ]
    f.ExportToFile('rosalind_corr_output.txt', '\n'.join(output))
    return
def InterleavingMotifs():
    """Write the string built by MakeMatrixSCS / InterpretMatrixSCS for two inputs.

    The rosalind_scsp input file must contain exactly two lines.  Both
    strings and their lengths are passed to ``MakeMatrixSCS`` and then to
    ``InterpretMatrixSCS``; the result goes to ``rosalind_scsp_output.txt``.
    # NOTE(review): presumably a shortest common supersequence -- confirm
    # against the helper definitions.
    """
    [first, second] = f.LoadFile('\\rosalind_scsp.txt').splitlines()
    rows = len(first)
    cols = len(second)
    table = MakeMatrixSCS([], rows, cols, first, second)
    scs = InterpretMatrixSCS(table, rows, cols, first, second)
    f.ExportToFile('rosalind_scsp_output.txt', scs)
    return
def CompletingaTree():
    """Write how many edges must be added to an acyclic graph to make a tree.

    The first line of the rosalind_tree input file is the node count n;
    every following line is counted as one edge.  The value
    ``n - edges - 1`` is written to ``rosalind_tree_output.txt``.
    """
    lines = f.LoadFile('\\rosalind_tree.txt').splitlines()
    node_count = int(lines[0])
    edge_count = len(lines) - 1  # every line after the first
    missing = node_count - edge_count - 1
    f.ExportToFile('rosalind_tree_output.txt', str(missing))
    return
def HammingDistance():
    """Write the Hamming distance between the two strings of the input file.

    The first two lines of the rosalind_hamm input file are compared over
    the length of the first one; the number of differing positions is
    written to ``rosalind_hamm_output.txt``.
    """
    lines = f.LoadFile('\\rosalind_hamm.txt').splitlines()
    first = lines[0]
    second = lines[1]
    mismatches = sum(
        1 for idx, symbol in enumerate(first) if symbol != second[idx]
    )
    f.ExportToFile('rosalind_hamm_output.txt', str(mismatches))
    return
def ProteinTomRNA():
    """Write the number of RNA strings that translate to a protein, mod 1000000.

    The protein is read from the rosalind_mrna input file and ``'X'`` is
    appended to stand for the stop codon.  For every character found in
    ``mRNA_dict`` the running product is multiplied by that character's
    entry; characters not in the dictionary are skipped.  The result modulo
    1000000 is written to ``rosalind_mrna_output.txt``.
    """
    protein = f.LoadFile('\\rosalind_mrna.txt')
    protein += 'X'  # add stop codon to end

    combo = 1
    for aa in protein:
        if aa in mRNA_dict:
            # Reduce at every step so the product never grows beyond the
            # modulus; the final value is the same as reducing once at the end.
            combo = (combo * mRNA_dict[aa]) % 1000000

    f.ExportToFile('rosalind_mrna_output.txt', str(combo))
    return
def ExpectedVal():
    """Write each probability of the input array multiplied by n.

    Input (rosalind_ebin): line 0 is the integer n, line 1 the array P of
    probabilities.  Each ``p * n`` is rounded to 4 decimals and the values
    are written space separated to ``rosalind_ebin_output.txt``.
    """
    lines = f.LoadFile('\\rosalind_ebin.txt').splitlines()
    n = int(lines[0])
    probabilities = [float(token) for token in lines[1].split()]

    B = []
    for probability in probabilities:
        B.append(str(round(probability * n, 4)))

    f.ExportToFile('rosalind_ebin_output.txt', ' '.join(B))
    return
def Spectrum():
    """Write the protein string inferred from a prefix spectrum.

    The masses in the rosalind_spec input file (one per line) are sorted;
    the difference between each pair of neighbouring masses, rounded to 4
    decimals, is looked up in ``inv_massdict``.  The resulting letters,
    ordered from the lightest pair to the heaviest, are written to
    ``rosalind_spec_output.txt``.
    """
    lines = f.LoadFile('\\rosalind_spec.txt').splitlines()
    weights = sorted([float(x) for x in lines])
    weights.reverse()  # heaviest first

    residues = []
    for heavier, lighter in zip(weights, weights[1:]):
        residues.append(inv_massdict[round(heavier - lighter, 4)])
    residues.reverse()  # report from the light end of the spectrum

    f.ExportToFile('rosalind_spec_output.txt', ''.join(residues))
    return
def IndependentAlleles():
    """Write a binomial tail probability with success chance 0.25.

    Input (rosalind_lia, whitespace separated): k and N.  With ``P = 2**k``
    trials, the sum over i = N..P of ``nCr(P, i) * 0.25**i * 0.75**(P - i)``
    is written to ``rosalind_lia_output.txt``.
    """
    fields = f.LoadFile('\\rosalind_lia.txt').split()
    k = int(fields[0])
    N = int(fields[1])
    trials = 2**k

    # Accumulate term by term, lowest i first.
    prob = 0
    for successes in range(N, trials + 1):
        failures = trials - successes
        prob += nCr(trials, successes) * (0.25**successes) * (0.75**failures)

    f.ExportToFile('rosalind_lia_output.txt', str(prob))
    return
def GlobalAlignment():
    """Write the value returned by MakeMatrixGlobal for two FASTA sequences.

    The rosalind_glob input file must parse to exactly two sequences.  They
    are handed, with their lengths and an empty matrix, to
    ``MakeMatrixGlobal``; its result is written to
    ``rosalind_glob_output.txt``.
    """
    raw = f.LoadFile('\\rosalind_glob.txt')
    [Labels, [first, second]] = f.FASTA(raw)
    rows, cols = len(first), len(second)
    maxalign = MakeMatrixGlobal([], rows, cols, first, second)
    f.ExportToFile('rosalind_glob_output.txt', str(maxalign))
    return
def Splicing():
    """Write the sum of C(n, k) for m <= k <= n, modulo 1000000.

    n and m are read (whitespace separated) from the rosalind_aspc input
    file; the result goes to ``rosalind_aspc_output.txt``.
    # NOTE(review): a later definition in this file reuses the name Splicing
    # and replaces this one at import time.
    """
    [n_text, m_text] = f.LoadFile('\\rosalind_aspc.txt').split()
    n = int(n_text)
    m = int(m_text)

    total = 0
    for chosen in range(m, n + 1):
        total += nCr(n, chosen)

    f.ExportToFile('rosalind_aspc_output.txt', str(total % 1000000))
    return
def Splicing():
    """Write the protein obtained after removing introns from a DNA string.

    The first FASTA sequence in the rosalind_splc input file is the original
    string; every occurrence of each remaining sequence is removed from it,
    in file order.  The remainder is passed through ``DNAtoRNA`` and
    ``RNAtoProtein`` and written to ``rosalind_splc_output.txt``.
    # NOTE(review): relies on a one-argument RNAtoProtein that returns the
    # protein -- confirm which helper is in scope.
    """
    raw = f.LoadFile('\\rosalind_splc.txt')
    [Label, DNA] = f.FASTA(raw)

    exons = DNA[0]
    introns = DNA[1:]
    for intron in introns:
        exons = exons.replace(intron, '')

    transcript = DNAtoRNA(exons)
    f.ExportToFile('rosalind_splc_output.txt', RNAtoProtein(transcript))
    return
def EditDistance():
    """Write the value returned by MakeMatrixDist for two FASTA sequences.

    The rosalind_edit input file must parse to exactly two sequences.  They
    are handed, with their lengths and an empty matrix, to
    ``MakeMatrixDist``; its result is written to
    ``rosalind_edit_output.txt``.
    """
    raw = f.LoadFile('\\rosalind_edit.txt')
    [Labels, [first, second]] = f.FASTA(raw)
    rows, cols = len(first), len(second)
    result = MakeMatrixDist([], rows, cols, first, second)
    f.ExportToFile('rosalind_edit_output.txt', str(result))
    return
def GenotypeFromPedigree(newick):
    """Reduce a pedigree tree to its final values and write them out.

    The tree text is loaded from the rosalind_mend input file and passed
    through ``ReduceTree``; ``SolveProbabilities`` is applied repeatedly
    while ``CountParantheses`` reports a truthy value.  Parentheses and
    semicolons are then stripped, the remainder is split on commas and
    written space separated to ``rosalind_mend_output.txt``.

    Args:
        newick: not read by this function; the tree always comes from the
            input file.
    """
    raw = f.LoadFile('\\rosalind_mend.txt')
    tree = ReduceTree(raw)
    while CountParantheses(tree):
        tree = SolveProbabilities(tree)

    # Drop the Newick punctuation, leaving comma-separated values.
    result = tree
    for symbol in ('(', ')', ';'):
        result = result.replace(symbol, '')
    values = result.split(',')

    f.ExportToFile('rosalind_mend_output.txt', ' '.join(values))
    return
def RNAtoProtein():
    """Translate the RNA string of the input file and write the protein.

    The content of the rosalind_prot input file is cut into consecutive
    three-character pieces.  Pieces are looked up in ``RNA_dict`` until
    ``UAA``, ``UAG`` or ``UGA`` is met; the collected letters are written to
    ``rosalind_prot_output.txt``.
    """
    s = f.LoadFile('\\rosalind_prot.txt')
    stop_codons = ('UAA', 'UAG', 'UGA')

    residues = []
    for start in range(0, len(s), 3):
        triplet = s[start:start + 3]
        if triplet in stop_codons:
            break
        residues.append(RNA_dict[triplet])

    f.ExportToFile('rosalind_prot_output.txt', ''.join(residues))
    return
def SortingByReversals():
    """Write the reversal distance and the reversals found for two permutations.

    The first line of the rosalind_sort input file is bound to ``q`` and the
    second to ``p``; ``ReversalDistanceWithPairs(p, q)`` supplies a distance
    and a sequence of index groups.  The distance is written on the first
    line of ``rosalind_sort_output.txt`` and each index group, space
    separated, on a line of its own.
    """
    lines = f.LoadFile('\\rosalind_sort.txt').splitlines()
    q = [int(token) for token in lines[0].split()]
    p = [int(token) for token in lines[1].split()]

    [distance, indices] = ReversalDistanceWithPairs(p, q)

    output = [str(distance)]
    output.extend(' '.join(str(x) for x in group) for group in indices)

    f.ExportToFile('rosalind_sort_output.txt', '\n'.join(output))
    return
def Trie():
    """Build a trie from the input strings and write its edges.

    Each line of the rosalind_trie input file is inserted in turn: the
    attachment point is located with ``FindParent`` and the new branch is
    added with ``BranchOff``.  The trie is a list of tuples starting with
    the root entry ``(0, 1, '')``; every entry except the root is written
    space separated on its own line to ``rosalind_trie_output.txt``.
    """
    strings = f.LoadFile('\\rosalind_trie.txt').splitlines()

    trie = [(0, 1, '')]  # root
    for string in strings:
        [parent, x] = FindParent(trie, string)
        trie = BranchOff(trie, string, parent, x)

    # Format for printing; the root entry is not reported.
    trief = [' '.join([str(x) for x in tup]) for tup in trie[1:]]
    f.ExportToFile('rosalind_trie_output.txt', '\n'.join(trief))
    return