def DistanceMatrix():
    """Write the pairwise distance matrix of a FASTA collection to a file.

    Loads the rosalind_pdst input through ``f.LoadFile`` / ``f.FASTA``. For
    every ordered pair of sequences it divides ``HammingDistance`` by the
    length of the first sequence. The result is exported as one
    space-separated row per sequence to ``rosalind_pdst_output.txt``.
    Returns None.
    """
    raw = f.LoadFile('\\rosalind_pdst.txt')
    _labels, seqs = f.FASTA(raw)

    rows = []
    for left in seqs:
        cells = []
        for right in seqs:
            mismatches = HammingDistance(left, right)
            # Normalised by the first sequence's length (assumes all
            # sequences share that length -- TODO confirm).
            cells.append(str(mismatches / len(seqs[0])))
        rows.append(' '.join(cells))

    f.ExportToFile('rosalind_pdst_output.txt', '\n'.join(rows))
def OverlapGraph(overlap=3):
    """Write the overlap-graph adjacency list of a FASTA collection.

    An edge ``tail -> head`` is reported when the last ``overlap`` characters
    of ``tail`` equal the first ``overlap`` characters of ``head`` and the two
    sequences are not the same string. Each edge is written as
    ``"<tail label> <head label>"`` on its own line of
    ``rosalind_grph_output.txt``.

    Labels are paired with sequences by position rather than looked up with
    ``list.index``, so records with identical sequences keep their own labels.

    Args:
        overlap: Number of characters that must match (default 3).

    Returns None.
    """
    raw = f.LoadFile('\\rosalind_grph.txt')
    labels, seqs = f.FASTA(raw)
    records = list(zip(labels, seqs))

    edges = []
    for tail_label, tail in records:
        suffix = tail[-overlap:]
        for head_label, head in records:
            # A sequence never links to an identical sequence.
            if tail != head and suffix == head[:overlap]:
                edges.append(' '.join([tail_label, head_label]))

    f.ExportToFile('rosalind_grph_output.txt', '\n'.join(edges))
def ErrorCorrection():
    """Write ``read->correction`` lines for reads judged to be erroneous.

    A read is treated as correct when ``Freq(read, reads)`` exceeds 1
    (presumably counting reverse complements too, per the original comment --
    verify against ``Freq``). The trusted set is those reads plus their
    reverse complements, de-duplicated by ``RemoveDuplicates``. Every read
    outside the trusted set is paired with ``MinimumDistance(read, trusted)``.
    The pairs are exported to ``rosalind_corr_output.txt``. Returns None.
    """
    raw = f.LoadFile('\\rosalind_corr.txt')
    _labels, reads = f.FASTA(raw)

    # Reads seen more than once are taken as error-free.
    repeated = [read for read in reads if Freq(read, reads) > 1]

    # Each trusted read is followed by its reverse complement.
    with_revcomp = []
    for read in repeated:
        with_revcomp.extend((read, ReverseComplement(read)))
    trusted = RemoveDuplicates(with_revcomp)

    corrections = [
        '%s->%s' % (read, MinimumDistance(read, trusted))
        for read in reads
        if read not in trusted
    ]
    f.ExportToFile('rosalind_corr_output.txt', '\n'.join(corrections))
def TT():
    """Write the transition/transversion ratio of two FASTA sequences.

    Transitions are substitutions within a base class: A<->G or C<->T.
    Every other mismatch counts as a transversion. Positions are compared up
    to the length of the first sequence. The ratio is exported as text to
    ``rosalind_tran_output.txt``.

    Raises:
        ZeroDivisionError: if the sequences contain no transversions.
        IndexError: if the second sequence is shorter than the first.
    """
    raw = f.LoadFile('\\rosalind_tran.txt')
    _labels, seqs = f.FASTA(raw)
    first, second = seqs[0], seqs[1]

    purines, pyrimidines = 'AG', 'CT'
    transitions = 0
    transversions = 0
    for pos, base in enumerate(first):
        other = second[pos]
        if base == other:
            continue
        same_class = ((base in purines and other in purines)
                      or (base in pyrimidines and other in pyrimidines))
        if same_class:
            transitions += 1
        else:
            transversions += 1

    f.ExportToFile('rosalind_tran_output.txt',
                   str(transitions / transversions))
def Splicing():
    """Write the protein encoded by a DNA string after removing its introns.

    The first FASTA sequence is the full DNA string and every following
    sequence is an intron. Each intron is deleted wherever it occurs, in file
    order. The remainder is passed through ``DNAtoRNA`` and ``RNAtoProtein``
    and exported to ``rosalind_splc_output.txt``. Returns None.
    """
    raw = f.LoadFile('\\rosalind_splc.txt')
    _label, seqs = f.FASTA(raw)

    exons = seqs[0]
    for intron in seqs[1:]:
        exons = exons.replace(intron, '')

    protein = RNAtoProtein(DNAtoRNA(exons))
    f.ExportToFile('rosalind_splc_output.txt', protein)
def EditDistance():
    """Write the edit distance between two FASTA strings.

    ``MakeMatrixDist`` is unpacked as ``(distance, matrix)``, the same way
    the other callers in this file (``CountOptimalAlignments`` and
    ``EditDistanceAlignment``) use it. Only the distance is exported to
    ``rosalind_edit_output.txt``, not the string form of the whole pair.
    Returns None.
    """
    raw = f.LoadFile('\\rosalind_edit.txt')
    _labels, (p, q) = f.FASTA(raw)

    distance, _matrix = MakeMatrixDist([], len(p), len(q), p, q)
    f.ExportToFile('rosalind_edit_output.txt', str(distance))
def GlobalAlignment():
    """Write the value returned by ``MakeMatrixGlobal`` for two FASTA strings.

    The value (the maximum alignment score, per the original docstring) is
    exported as text to ``rosalind_glob_output.txt``. Returns None.
    """
    raw = f.LoadFile('\\rosalind_glob.txt')
    _labels, (p, q) = f.FASTA(raw)

    # The helper receives a fresh, empty matrix to fill.
    score = MakeMatrixGlobal([], len(p), len(q), p, q)
    f.ExportToFile('rosalind_glob_output.txt', str(score))
def CountOptimalAlignments():
    """Return the number of optimal alignments of two strings, mod 134217727.

    Builds the edit-distance matrix with ``MakeMatrixDist`` and hands it to
    ``ConstructPath``. The result of ``ConstructPath`` is reduced modulo
    134217727 and returned; nothing is written to disk.
    """
    raw = f.LoadFile('\\rosalind_ctea.txt')
    _labels, (p, q) = f.FASTA(raw)
    rows, cols = len(p), len(q)

    _distance, table = MakeMatrixDist([], rows, cols, p, q)
    path_count = ConstructPath(table, rows, cols, p, q)
    return path_count % 134217727
def EditDistanceAlignment():
    """Write the edit distance and one alignment of two FASTA strings.

    ``MakeMatrixDist`` supplies the distance and matrix, and
    ``InterpretMatrix`` turns the matrix into the two aligned sequences. The
    output file ``rosalind_edta_output.txt`` holds three lines: the distance,
    the first aligned string, and the second aligned string. Returns None.
    """
    raw = f.LoadFile('\\rosalind_edta.txt')
    _labels, (p, q) = f.FASTA(raw)
    rows, cols = len(p), len(q)

    distance, table = MakeMatrixDist([], rows, cols, p, q)
    top, bottom = InterpretMatrix(table, rows, cols, p, q)

    report = '\n'.join((str(distance), ''.join(top), ''.join(bottom)))
    f.ExportToFile('rosalind_edta_output.txt', report)
def SharedMotif():
    """Write a longest substring shared by every sequence in a FASTA file.

    Candidate substrings are taken from the shortest sequence, longest
    first. The first candidate accepted by ``InAll(candidate, sequences)`` is
    exported to ``rosalind_lcsm_output.txt`` and the function returns. If no
    candidate is accepted, nothing is written. Returns None.

    The template is chosen with ``min(..., key=len)``. A plain ``min`` picks
    the lexicographically smallest string, which need not be the shortest and
    only enlarges the search.
    """
    raw = f.LoadFile('\\rosalind_lcsm.txt')
    _labels, seqs = f.FASTA(raw)

    shortest = min(seqs, key=len)
    limit = len(shortest)

    # Try every window size from the longest possible down to 1.
    for size in range(limit, 0, -1):
        for start in range(limit - size + 1):
            candidate = shortest[start:start + size]
            if InAll(candidate, seqs):
                f.ExportToFile('rosalind_lcsm_output.txt', candidate)
                return
def MultipleAlignment():
    """Write the best multiple-alignment score and its four aligned strings.

    ``GappedStrings`` yields four collections of gapped variants, one per
    input string. Every combination is scored with ``MultiAlignScore`` and
    the first combination reaching the highest score is kept. The output
    file ``rosalind_mult_output.txt`` holds the score followed by the four
    aligned strings, one per line.

    This is a brute-force search intended for very short inputs. Candidates
    are streamed from the product rather than stored in a list, and each is
    scored once.

    Raises:
        ValueError: if no candidate combination is generated.
    """
    raw = f.LoadFile('\\rosalind_mult.txt')
    _labels, strings = f.FASTA(raw)
    a, b, c, d = GappedStrings(strings)

    best_score = None
    best_combo = None
    for combo in k.product(a, b, c, d):
        score = MultiAlignScore(combo)
        # Strict '>' keeps the earliest combination among ties.
        if best_score is None or score > best_score:
            best_score, best_combo = score, combo

    if best_combo is None:
        raise ValueError('no candidate alignments were generated')

    report = '\n'.join([str(best_score), '\n'.join(best_combo)])
    f.ExportToFile('rosalind_mult_output.txt', report)
def MaxMatching():
    """Write the number of maximum matchings for an RNA string.

    Counts A, U, C and G. The result is
    ``nPr(max(A, U), min(A, U)) * nPr(max(C, G), min(C, G))``, exported as
    text to ``rosalind_mmch_output.txt``. Returns None.

    Raises:
        KeyError: if the string contains a symbol other than A, U, C or G.
    """
    raw = f.LoadFile('\\rosalind_mmch.txt')
    _label, s = f.FASTA(raw)

    tally = dict.fromkeys('AUCG', 0)
    for base in s:
        tally[base] += 1

    # sorted() on a pair gives (smaller, larger).
    au_small, au_large = sorted((tally['A'], tally['U']))
    cg_small, cg_large = sorted((tally['C'], tally['G']))

    total = nPr(au_large, au_small) * nPr(cg_large, cg_small)
    f.ExportToFile('rosalind_mmch_output.txt', str(total))
def kmerComp():
    """Write the 4-mer composition of a DNA string.

    For each 4-mer returned by ``Lex('ACGT', 4)``, in that order, the number
    of (overlapping) occurrences in the string is counted. The counts are
    exported space-separated to ``rosalind_kmer_output.txt``. Returns None.
    """
    raw = f.LoadFile('\\rosalind_kmer.txt')
    _label, s = f.FASTA(raw)

    kmers = Lex('ACGT', 4)
    # Every length-4 window of s, left to right.
    windows = [s[start:start + 4] for start in range(len(s) - 3)]

    counts = [str(windows.count(kmer)) for kmer in kmers]
    f.ExportToFile('rosalind_kmer_output.txt', ' '.join(counts))
def ReversePalindromes(min_length=4, max_length=12):
    """Write the position and length of every reverse palindrome in a string.

    A window is a reverse palindrome when it equals
    ``ReverseComplement(window)``. Each hit is written to
    ``rosalind_revp_output.txt`` as ``"<1-based start> <length>"``, ordered
    by start and then by length.

    The second column is the window length. The earlier code emitted the
    0-based end index there. Only windows within the length bounds are
    examined, instead of scanning every end position and filtering.

    Args:
        min_length: Shortest window considered (default 4).
        max_length: Longest window considered (default 12).

    Returns None.
    """
    raw = f.LoadFile('\\rosalind_revp.txt')
    _label, s = f.FASTA(raw)
    total = len(s)

    hits = []
    for start in range(total):
        # The window may not run past the end of the string.
        longest = min(max_length, total - start)
        for length in range(min_length, longest + 1):
            window = s[start:start + length]
            if window == ReverseComplement(window):
                hits.append(' '.join((str(start + 1), str(length))))

    f.ExportToFile('rosalind_revp_output.txt', '\n'.join(hits))
def ConsensusandProfile():
    """Write a consensus string and profile matrix for a FASTA collection.

    The profile counts, per column, how often each of A, C, G and T occurs.
    The matrix width is the length of the first sequence. The consensus
    takes the most frequent base in each column, and ties go to the earliest
    base in A, C, G, T order. The output file ``rosalind_cons_output.txt``
    holds the consensus followed by one ``"<base>: <counts>"`` line per base.
    Returns None.

    Raises:
        KeyError: if a sequence contains a symbol other than A, C, G or T.
        IndexError: if there are no sequences or one is longer than the first.
    """
    raw = f.LoadFile('\\rosalind_cons.txt')
    _labels, seqs = f.FASTA(raw)
    width = len(seqs[0])
    alphabet = 'ACGT'

    profile = {base: [0] * width for base in alphabet}
    for seq in seqs:
        for column, base in enumerate(seq):
            profile[base][column] += 1

    # max() returns the first maximal item, which gives the A < C < G < T
    # tie-break.
    consensus = ''.join(
        max(alphabet, key=lambda base: profile[base][column])
        for column in range(width)
    )

    report = [consensus]
    for base in alphabet:
        report.append('%s: %s' % (base, ' '.join(map(str, profile[base]))))
    f.ExportToFile('rosalind_cons_output.txt', '\n'.join(report))
def GC():
    """Write the label and GC percentage of the most GC-rich sequence.

    GC content is the share of G and C symbols in a sequence, as a
    percentage. If several sequences tie, the earliest one in the file is
    reported. The output file ``rosalind_gc_output.txt`` holds the label and
    then the percentage. Returns None.

    Raises:
        ZeroDivisionError: if a sequence is empty.
        ValueError: if the file contains no sequences.
    """
    raw = f.LoadFile('\\rosalind_gc.txt')
    labels, seqs = f.FASTA(raw)

    percentages = [
        sum(1 for base in seq if base in 'GC') / len(seq) * 100
        for seq in seqs
    ]

    best = max(percentages)
    winner = labels[percentages.index(best)]
    f.ExportToFile('rosalind_gc_output.txt', '\n'.join([winner, str(best)]))
def FailureArray():
    """Write the failure array of a DNA string.

    Entry ``i`` is the length of the longest proper prefix of the string
    that is also a suffix of its first ``i + 1`` characters. Entry 0 is
    always 0. The values are exported space-separated to
    ``rosalind_kmp_output.txt``. Returns None.
    """
    raw = f.LoadFile('\\rosalind_kmp.txt')
    _label, s = f.FASTA(raw)

    size = len(s)
    failure = [0] * size
    matched = 0  # length of the prefix currently matched
    for pos in range(1, size):
        current = s[pos]
        # Fall back to shorter borders until one can be extended.
        while matched > 0 and s[matched] != current:
            matched = failure[matched - 1]
        if s[matched] == current:
            matched += 1
        failure[pos] = matched

    f.ExportToFile('rosalind_kmp_output.txt', ' '.join(map(str, failure)))
def ORF():
    """Write every distinct protein translatable from ORFs of a DNA string.

    Both the given strand and its reverse complement are scanned. The
    earlier code scanned only the reverse complement, so ORFs on the given
    strand were never reported. On each strand, every ``AUG`` is paired with
    the first in-frame stop codon (``UAA``, ``UAG`` or ``UGA``) after it. The
    RNA between them, stop excluded, is translated with ``RNAtoProtein``.
    Results are de-duplicated with ``RemoveDuplicates`` and exported, one per
    line, to ``rosalind_orf_output.txt``. Returns None.
    """
    raw = f.LoadFile('\\rosalind_orf.txt')
    _labels, DNA = f.FASTA(raw)
    # NOTE(review): DNA is passed to ReverseComplement / DNAtoRNA exactly as
    # f.FASTA returned it -- confirm it is a single string here.
    strands = [DNA, ReverseComplement(DNA)]
    stop_codons = ('UAA', 'UAG', 'UGA')

    orfs = []
    for strand in strands:
        rna = DNAtoRNA(strand)

        starts = []
        stops = []
        for pos in range(len(rna)):
            codon = rna[pos:pos + 3]
            if codon == 'AUG':
                starts.append(pos)
            elif codon in stop_codons:
                stops.append(pos)

        for begin in starts:
            for end in stops:
                # stops is ascending, so the first in-frame hit is the
                # nearest stop.
                if end > begin and (end - begin) % 3 == 0:
                    orfs.append(rna[begin:end])
                    break

    proteins = RemoveDuplicates([RNAtoProtein(orf) for orf in orfs])
    f.ExportToFile('rosalind_orf_output.txt', '\n'.join(proteins))
def SplicedMotif():
    """Write positions in s at which t appears as a subsequence.

    Walks the first sequence once, greedily matching the symbols of the
    second sequence in order. The 1-based position of each match is
    recorded. The positions are exported space-separated to
    ``rosalind_sseq_output.txt``. Returns None.

    Raises:
        IndexError: if the second sequence is empty and the first is not.
    """
    raw = f.LoadFile('\\rosalind_sseq.txt')
    _labels, seqs = f.FASTA(raw)
    s, remaining = seqs[0], seqs[1]

    positions = []
    for index, symbol in enumerate(s, start=1):
        if symbol != remaining[0]:
            continue
        positions.append(str(index))
        remaining = remaining[1:]  # drop the symbol just matched
        if remaining == '':
            break

    f.ExportToFile('rosalind_sseq_output.txt', ' '.join(positions))
def SuperString():
    """Write a superstring assembled by greedily merging overlapping reads.

    While more than two strings remain, an overlap matrix is filled with
    ``Overlap(a, b)`` for every ordered pair. ``MaxMatrix`` selects a pair
    of indices. The two strings are replaced by ``Combine`` of them. The
    last two strings are then combined in list order and exported to
    ``rosalind_long_output.txt``. Returns None.

    Raises:
        IndexError: if fewer than two strings are supplied.
    """
    raw = f.LoadFile('\\rosalind_long.txt')
    _labels, DNA = f.FASTA(raw)

    while len(DNA) > 2:
        count = len(DNA)
        overlaps = [[0] * count for _ in range(count)]

        # Cells are addressed through list.index, so a repeated string
        # always maps to its first position.
        for left in DNA:
            row = DNA.index(left)
            for right in DNA:
                overlaps[row][DNA.index(right)] = Overlap(left, right)

        best = MaxMatrix(overlaps)
        first, second = DNA[best[0]], DNA[best[1]]
        merged = Combine(first, second)

        # Swap the two source strings for their merge.
        DNA.remove(first)
        DNA.remove(second)
        DNA.append(merged)

    f.ExportToFile('rosalind_long_output.txt', Combine(DNA[0], DNA[1]))
def LCS():
    """Write a longest common subsequence of two FASTA strings.

    ``MakeMatrix`` builds a comparison table for the two strings. Its
    largest entry is used as the target subsequence length. ``LastElement``
    seeds the traceback and ``ExpandLCS`` is applied until that many symbols
    are collected. The symbols are gathered back to front, so they are
    reversed before being exported to ``rosalind_lcsq_output.txt``.
    Returns None.
    """
    raw = f.LoadFile('\\rosalind_lcsq.txt')
    _labels, seqs = f.FASTA(raw)
    p, q = seqs[0], seqs[1]

    table = MakeMatrix([], len(p), len(q), p, q)
    target = max(max(line) for line in table)

    col, row, picked, query = LastElement(table, p, target)
    while len(picked) < target:
        col, row, picked, query = ExpandLCS(table, p, col, row, query, picked)

    f.ExportToFile('rosalind_lcsq_output.txt', ''.join(reversed(picked)))
def NglyMotif():
    """Write N-glycosylation motif locations for a list of UniProt IDs.

    The motif is four residues: ``N``, anything but ``P``, ``S`` or ``T``,
    anything but ``P``. The FASTA record for each ID in the input file is
    downloaded from UniProt. For every protein with at least one hit, the ID
    and then the space-separated 1-based start positions are written to
    ``rosalind_mprt_output.txt``. Returns None.

    The scan covers every 4-residue window including the last one. The
    earlier ``len - 4`` bound skipped a motif at the very end. The HTTP
    response is closed after reading, and blank ID lines are skipped.
    """
    ids = f.LoadFile('\\rosalind_mprt.txt').splitlines()
    motif_length = 4

    report = []
    for protein_id in ids:
        if not protein_id.strip():
            continue  # a blank line would build a URL with no accession

        url = 'http://www.uniprot.org/uniprot/' + str(protein_id) + '.fasta'
        with urllib.request.urlopen(url) as response:
            fasta = response.read().decode("utf-8")
        _labels, protein = f.FASTA(fasta)

        hits = []
        for start in range(len(protein) - motif_length + 1):
            window = protein[start:start + motif_length]
            # With N first and S/T third, "no P anywhere" is exactly the
            # non-P constraint on residues two and four.
            if window[0] == 'N' and window[2] in 'ST' and 'P' not in window:
                hits.append(str(start + 1))

        if hits:
            report.append(protein_id)
            report.append(' '.join(hits))

    f.ExportToFile('rosalind_mprt_output.txt', '\n'.join(report))