def main():
    '''Main call. Parses, runs, and saves problem specific data.

    Reads 'data/rosalind_gc.txt' with ReadFASTA, hands the records to
    max_gc_content, then prints the stringified result (one item per line)
    and writes the same text to 'output/005_GC.txt'.
    '''
    # Parse the input data.
    seq_list = ReadFASTA('data/rosalind_gc.txt')

    # Materialise the strings in a real list and join them exactly once: a
    # lazy map object would be exhausted by the first join and leave the
    # output file empty.
    highest_gc = [str(item) for item in max_gc_content(seq_list)]
    answer = '\n'.join(highest_gc)

    # Print and save the answer.
    print(answer)
    with open('output/005_GC.txt', 'w') as output_data:
        output_data.write(answer)
def main():
    '''Main call. Reads, runs, and saves problem specific data.

    Loads two protein strings from 'data/rosalind_gaff.txt', scores them
    with global_alignment_affine_gap_penalty, and emits the result to stdout
    and to 'output/096_GAFF.txt', one item per line.
    '''
    # Parse the two input protein strings; exactly two records are expected.
    records = ReadFASTA('data/rosalind_gaff.txt')
    s, t = [record[1] for record in records]

    # Get the alignment score. 11 and 1 are forwarded as-is (presumably the
    # gap opening / extension penalties -- confirm against the callee).
    score = global_alignment_affine_gap_penalty(s, t, BLOSUM62(), 11, 1)

    # Print and save the answer.
    answer = '\n'.join(score)
    print(answer)
    with open('output/096_GAFF.txt', 'w') as output_data:
        output_data.write(answer)
def main():
    '''Main call. Reads, runs, and saves problem specific data.

    Loads two sequences from 'data/rosalind_smgb.txt', aligns them with
    semiglobal_alignment, and emits the result to stdout and to
    'output/101_SMGB.txt', one item per line.
    '''
    # Read and parse the input data; exactly two records are expected.
    records = ReadFASTA('data/rosalind_smgb.txt')
    word1, word2 = [record[1] for record in records]

    # Get the alignment. The trailing 1 is forwarded untouched (presumably
    # an indel penalty -- confirm against semiglobal_alignment).
    alignment = semiglobal_alignment(word1, word2, 1)

    # Print and save the answer.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/101_SMGB.txt', 'w') as output_data:
        output_data.write(answer)
def main():
    '''Main call. Reads, runs, and saves problem specific data.

    Loads two sequences from 'data/rosalind_mgap.txt', passes them to
    maximum_gap_symbols, and reports the stringified result on stdout and in
    'output/083_MGAP.txt'.
    '''
    # Parse the input data; exactly two records are expected.
    records = ReadFASTA('data/rosalind_mgap.txt')
    v, w = [record[1] for record in records]

    # Get the maximum number of gaps, already converted to text for output.
    max_gaps = str(maximum_gap_symbols(v, w))

    # Print and save the answer.
    print(max_gaps)
    with open('output/083_MGAP.txt', 'w') as output_data:
        output_data.write(max_gaps)
def main():
    '''Main call. Reads, runs, and saves problem specific data.

    Takes the sequence of the first record in 'data/rosalind_cat.txt',
    passes it to noncrossing_perfect_bondings, and reports the stringified
    result on stdout and in 'output/033_CAT.txt'.
    '''
    # Parse the input data: only the first record's sequence is used.
    first_record = ReadFASTA('data/rosalind_cat.txt')[0]
    rna = first_record[1]

    # Get the number of noncrossing perfect bondings, as text.
    noncrossing = str(noncrossing_perfect_bondings(rna))

    # Print and save the answer.
    print(noncrossing)
    with open('output/033_CAT.txt', 'w') as output_file:
        output_file.write(noncrossing)
def main():
    '''Main call. Reads, runs, and saves problem specific data.

    Loads two sequences from 'data/rosalind_sims.txt', aligns them with
    fitting_alignment, and emits the result to stdout and to
    'output/100_SIMS.txt', one item per line.
    '''
    # Read and parse the input data; exactly two records are expected.
    records = ReadFASTA('data/rosalind_sims.txt')
    word1, word2 = [record[1] for record in records]

    # Get the fitting alignment.
    alignment = fitting_alignment(word1, word2)

    # Print and save the answer.
    answer = '\n'.join(alignment)
    print(answer)
    with open('output/100_SIMS.txt', 'w') as output_data:
        output_data.write(answer)
from scripts import ReadFASTA
from protein_map import ProteinDictDNA

# Record 0 holds the full string; every later record is a substring that
# gets deleted from it.
dlist = ReadFASTA('data/rosalind_splc.txt')
RNA = dlist[0][1]
for record in dlist[1:]:
    RNA = RNA.replace(record[1], '')

prot_dct = ProteinDictDNA()

# Walk the spliced string three characters at a time, looking each codon up
# in the table and keeping every entry that is not the 'Stop' marker.
residues = []
for start in range(0, len(RNA), 3):
    residue = prot_dct[RNA[start:start + 3]]
    if residue != 'Stop':
        residues.append(residue)
protein = ''.join(residues)

print(protein)
# Quick lambda function to insert indels. insert_indel = lambda word, i: word[:i] + '-' + word[i:] # Insert indels to get the alignment. while reduce(mul, current_index) != 0: for i, perm_value in enumerate( perm_list[backtrack[tuple(current_index)]]): if perm_value == 0: alignment[i] = insert_indel(alignment[i], current_index[i]) else: current_index[i] -= 1 # Note: We don't need to prepend any indels because we forced a match at the start of all words. # Remove the forced match from all alignments to recover the correct alignment. return [str(max_score)] + [aligned[1:] for aligned in alignment] if __name__ == '__main__': from scripts import ReadFASTA # Parse the input data. words = [fasta[1] for fasta in ReadFASTA('data/rosalind_mult.txt')] # Get the alignment. words_aligned = multiple_alignment(words) # Print and save the answer. print '\n'.join(words_aligned) with open('output/085_MULT.txt', 'w') as output_data: output_data.write('\n'.join(words_aligned))
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Computing GC Content
Rosalind ID: GC
Rosalind #: 005
URL: http://rosalind.info/problems/gc/
'''

from scripts import ReadFASTA

# Our data is in FASTA form: each record is indexed as [0] = name,
# [1] = sequence.
dna_list = ReadFASTA('data/rosalind_gc.txt')

# Track the record with the largest GC percentage seen so far. Starting at
# -1 guarantees that the first record always replaces the placeholder.
highest_GC = -1
highest_GC_name = ''
for dna_seq in dna_list:
    sequence = dna_seq[1]
    GC_count = sequence.count('G') + sequence.count('C')
    GC_amount = (GC_count * 100.0) / len(sequence)
    # Strict comparison: on a tie the earliest record wins.
    if GC_amount > highest_GC:
        highest_GC = GC_amount
        highest_GC_name = dna_seq[0]

# Print the solution as two clean lines (name, then percentage); a
# comma-separated print would leave a stray space after the name.
print(highest_GC_name + '\n' + str(highest_GC))
#!/usr/bin/env python ''' A solution to a ROSALIND bioinformatics problem. Problem Title: Open Reading Frames Rosalind ID: ORF Rosalind #: 018 URL: http://rosalind.info/problems/orf/ ''' from scripts import ReadFASTA, ReverseComplementDNA, ProteinDictDNA dna_list = [ReadFASTA('data/rosalind_orf.txt')[0][1]] dna_list.append(ReverseComplementDNA(dna_list[0])) dna_dict = ProteinDictDNA() # Use a set since we want to return distinct protein. # Sets keep track of distinct elements without us needing to worry about adding duplicates. protein_orf = set() for dna in dna_list: for i in range(len(dna) - 2): # Check for the Start codon. if dna[i:i + 3] == 'ATG': # Use a new index since we'll want to return to the ith position of the strand in case there are multiple start codons in a row. j = i current_protein = '' # Continue, if necessary, until we hit the end of the DNA sequence. while j + 3 < len(dna) - 1: # Add the protein and break if we hit a Stop codon. if dna_dict[dna[j:j + 3]] == 'Stop': protein_orf.add(current_protein)
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Locating Restriction Sites
Rosalind ID: REVP
Rosalind #: 021
URL: http://rosalind.info/problems/revp/
'''

from scripts import ReadFASTA, ReverseComplementDNA

dna = ReadFASTA('data/rosalind_revp.txt')[0][1]

# Slide a window of every length from 4 to 12 over the strand and report the
# windows that equal their own reverse complement.
locations = []
for length in range(4, 13):
    window_count = len(dna) - length + 1
    for index in range(window_count):
        window = dna[index:index + length]
        if window != ReverseComplementDNA(window):
            continue
        # Positions are reported 1-based.
        location = str(index + 1) + ' ' + str(length)
        print(location)
        locations.append(location)

# One "position length" pair per line.
with open('output/021_REVP.txt', 'w') as output_data:
    output_data.write(''.join(location + '\n' for location in locations))
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Maximum Matchings and RNA Secondary Structures
Rosalind ID: MMCH
Rosalind #: 040
URL: http://rosalind.info/problems/mmch/
'''

from math import factorial
from scripts import ReadFASTA


def nPr(n, k):
    '''Returns the number of k-permutations of n, i.e. n! / (n - k)!.

    Floor division keeps the result an exact integer: the quotient of the
    two factorials is always whole, and true division would turn large
    values into rounded floats.
    '''
    return factorial(n) // factorial(n - k)


rna = ReadFASTA('data/rosalind_mmch.txt')[0][1]

# Count how many times each nucleotide appears in the RNA string.
AU_num = [rna.count(nucleotide) for nucleotide in 'AU']
GC_num = [rna.count(nucleotide) for nucleotide in 'GC']

# Each pairing family (A-U and G-C) contributes nPr(max, min) choices; the
# total is the product of the two.
max_matchings = nPr(max(AU_num), min(AU_num)) * nPr(max(GC_num), min(GC_num))

answer = str(max_matchings)
print(answer)
with open('output/040_MMCH.txt', 'w') as output_data:
    output_data.write(answer)
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Transitions and Transversions
Rosalind ID: TRAN
Rosalind #: 031
URL: http://rosalind.info/problems/tran/
'''

from scripts import ReadFASTA

dna1, dna2 = [fasta[1] for fasta in ReadFASTA('data/rosalind_tran.txt')]

# Float counters so the final ratio is a true quotient.
transitions = transversions = 0.0
for i, base1 in enumerate(dna1):
    base2 = dna2[i]
    if base1 == base2:
        continue
    # The reference group is picked from the second strand's nucleotide:
    # the C/T pair when it is C or T, the A/G pair otherwise.
    if base2 in ['C', 'T']:
        same_group = ['C', 'T']
    else:
        same_group = ['A', 'G']
    # A substitution inside one group is a transition, across groups a
    # transversion.
    if base1 in same_group:
        transitions += 1
    else:
        transversions += 1

ratio = transitions / transversions
print(ratio)
with open('output/031_TRAN.txt', 'w') as output_data:
    output_data.write(str(ratio))
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Perfect Matchings and RNA Secondary Structures
Rosalind ID: PMCH
Rosalind #: 026
URL: http://rosalind.info/problems/pmch/
'''

from math import factorial
from scripts import ReadFASTA

rna = ReadFASTA('data/rosalind_pmch.txt')[0][1]

# The answer is (#A)! * (#C)!, built from the counts of 'A' and 'C' only.
a_count = rna.count('A')
c_count = rna.count('C')
pmch = factorial(a_count) * factorial(c_count)

answer = str(pmch)
print(answer)
with open('output/026_PMCH.txt', 'w') as output_data:
    output_data.write(answer)
subintervals.append([rna[1:i], rna[i + 1:]]) if subintervals == []: # If we didn't find any subintervals, there are no possible noncrossing matchings. noncross_dict[rna] = 0 else: # Reduce the problem to noncrossing matchings over the substrings. noncross_dict[rna] = sum([ Noncrossing(subint[0]) * Noncrossing(subint[1]) for subint in subintervals ]) % 1000000 return noncross_dict[rna] def check_subinterval(subint): '''Checks if a given subinterval has the same number of matching nucleotides.''' N = [subint.count(nucleotide) for nucleotide in 'AUCG'] if N[0] == N[1] and N[2] == N[3]: return True return False rna = ReadFASTA('data/rosalind_cat.txt')[0][1] noncross_dict = {} matchings = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'} noncross = Noncrossing(rna) print noncross with open('output/033_CAT.txt', 'w') as output_file: output_file.write(str(noncross))
from scripts import ReadFASTA


def reverseComplementDNA(acid):
    '''Returns the reverse complement of the DNA string acid.

    Each of A/T/G/C is swapped for its partner, the result is reversed, and
    any leading whitespace of the reversed string is stripped.
    '''
    out = str.maketrans("TAGC", "ATCG")
    return acid.translate(out)[::-1].lstrip()


dna_list = ReadFASTA("data/corr_data.txt")

# Group together identical reads, up to reverse complement.
dna_groups = []
for dna_tuple in dna_list:
    dna = dna_tuple[1]
    rev_comp = reverseComplementDNA(str(dna))
    for group in dna_groups:
        if dna in group or rev_comp in group:
            group.append(dna)
            break
    else:
        # No existing group matched: this read starts a new one.
        dna_groups.append([dna])

# Split the groups into two buckets, walking from the last group to the
# first: groups with more than one read are kept whole in bucket 0, and
# reads that were seen only once are flattened into bucket 1.
# The buckets must start out empty; seeding them from dna_groups itself
# would leave real groups sitting in the bucket slots.
repeated_groups, single_reads = [], []
for group in reversed(dna_groups):
    if len(group) > 1:
        repeated_groups.append(group)
    else:
        single_reads += group
dna_groups = [repeated_groups, single_reads]
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Creating a Distance Matrix
Rosalind ID: PDST
Rosalind #: 041
URL: http://rosalind.info/problems/PDST/
'''

from numpy import zeros
from scripts import ReadFASTA

dna_list = [fasta[1] for fasta in ReadFASTA('data/rosalind_pdst.txt')]

# All sequences have the same length.
dna_len = len(dna_list[0])
seq_count = len(dna_list)

# M[i][j] is the fraction of positions at which sequences i and j differ.
M = zeros((seq_count, seq_count))
for i in range(seq_count):
    for j in range(i + 1, seq_count):
        mismatches = sum(
            1 for k in range(dna_len) if dna_list[i][k] != dna_list[j][k]
        )
        # Divide once instead of adding 1.0 / dna_len per mismatch, so no
        # rounding error accumulates. Zero-length input keeps distance 0.
        distance = float(mismatches) / dna_len if dna_len else 0.0
        # The matrix is symmetric; the diagonal stays at zero.
        M[i][j] = M[j][i] = distance

print(M)
return 1 else: # If we've already computed the value, return it! if rna in noncross_dict: return noncross_dict[rna] # Otherwise, calculate the value, add it to the dictionary, and return it. else: subintervals = [] for i in xrange(1, len(rna)): if rna[0] == matchings[rna[i]]: subintervals.append([rna[1:i], rna[i + 1:]]) # Reduce the problem to noncrossing matchings over the matching substrings, and the matchings for the next starting point. noncross_dict[rna] = (sum([ Noncrossing(subint[0]) * Noncrossing(subint[1]) for subint in subintervals ]) + Noncrossing(rna[1:])) % 1000000 return noncross_dict[rna] rna = ReadFASTA('data/rosalind_motz.txt')[0][1] matchings = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'} noncross_dict = {} noncross = Noncrossing(rna) print noncross with open('output/048_MOTZ.txt', 'w') as output_file: output_file.write(str(noncross))
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Maximum Matchings and RNA Secondary Structures
Rosalind ID: MMCH
Rosalind #: 040
URL: http://rosalind.info/problems/mmch/
'''

from math import factorial
from scripts import ReadFASTA


def nPr(n, k):
    '''Returns the number of k-permutations of n, i.e. n! / (n - k)!.

    Uses floor division so the result stays an exact integer; the quotient
    of the two factorials is always whole, whereas true division would
    produce a rounded float for large inputs.
    '''
    return factorial(n) // factorial(n - k)


rna = ReadFASTA('data/rosalind_mmch.txt')[0][1]

# Count how many times each nucleotide appears in the RNA string.
AU_num = [rna.count(nucleotide) for nucleotide in 'AU']
GC_num = [rna.count(nucleotide) for nucleotide in 'GC']

# Each pairing family (A-U and G-C) contributes nPr(max, min) choices; the
# total is the product of the two.
max_matchings = nPr(max(AU_num), min(AU_num)) * nPr(max(GC_num), min(GC_num))

answer = str(max_matchings)
print(answer)
with open('output/040_MMCH.txt', 'w') as output_data:
    output_data.write(answer)
from scripts import ReadFASTA
from protein_map import ProteinDictDNA

# Record 0 holds the full string; every later record is an intron that is
# deleted from it.
dna_list = ReadFASTA('data/rna_splicing.txt')
exon = dna_list[0][1]
for intron in dna_list[1:]:
    exon = exon.replace(intron[1], '')

proteinDict = ProteinDictDNA()

# Translate codon by codon starting at position 0, so the first codon is
# part of the protein too. Entries equal to 'Stop' are left out.
residues = []
index = 0
while index < len(exon):
    codon = exon[index:index + 3]
    p = proteinDict[codon]
    if p != 'Stop':
        residues.append(p)
    index = index + 3
exon_protein = ''.join(residues)

print(exon_protein)
from scripts import ReadFASTA

DNA1, DNA2 = [fasta[1] for fasta in ReadFASTA('data/rosalind_tran.txt')]

# Float counters so the final ratio is a true quotient.
transitions = transversions = 0.0
for i, first in enumerate(DNA1):
    second = DNA2[i]
    if first == second:
        continue
    # The reference group comes from the second strand's nucleotide: the C/T
    # pair when it is C or T, the A/G pair otherwise.
    if second in ['C', 'T']:
        reference_group = ['C', 'T']
    else:
        reference_group = ['A', 'G']
    # Same group -> transition, different group -> transversion.
    if first in reference_group:
        transitions += 1
    else:
        transversions += 1

ratio = transitions / transversions
print(ratio)
with open('output/031_TRAN.txt', 'w') as output_data:
    output_data.write(str(ratio))
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: k-Mer Composition
Rosalind ID: KMER
Rosalind #: 036
URL: http://rosalind.info/problems/kmer/
'''

from itertools import product
from scripts import ReadFASTA

dna = ReadFASTA('data/rosalind_kmer.txt')[0][1]

# Get a list of all 4-mers in lexicographic order.
kmer_list = [''.join(kmer) for kmer in product('ACGT', repeat=4)]

# Map each 4-mer to its position once, so every window below is a constant
# time lookup instead of a linear scan of kmer_list. A window containing a
# character outside 'ACGT' raises KeyError.
kmer_position = {kmer: position for position, kmer in enumerate(kmer_list)}

# Initialize the count of each 4-mer at zero.
kmer_count = [0] * (4 ** 4)

# Count each 4-mer over every length-4 window of the strand.
for i in range(len(dna) - 3):
    kmer_count[kmer_position[dna[i:i + 4]]] += 1

answer = ' '.join(map(str, kmer_count))
print(answer)
with open('output/036_KMER.txt', 'w') as output_data:
    output_data.write(answer)
# Backtrack to start of the local alignment starting at the highest scoring cell. while backtrack[i][j] != 3 and i * j != 0: if backtrack[i][j] == 0: i -= 1 elif backtrack[i][j] == 1: j -= 1 elif backtrack[i][j] == 2: i -= 1 j -= 1 # Cut the strings at the ending point of the backtrack. v_aligned = v_aligned[i:] w_aligned = w_aligned[j:] return max_score, v_aligned, w_aligned if __name__ == '__main__': # Parse the two input protein strings. s, t = [fasta[1] for fasta in ReadFASTA('input/rosalind_loca.txt')] # Get the local alignment (given sigma = 5 in problem statement). alignment = local_alignment(s, t, PAM250(), 5) # Print and save the answer. print '\n'.join(alignment) with open('output/local_alignment.txt', 'w') as output_data: output_data.write('\n'.join(alignment))
#!/usr/bin/env python ''' A solution to a ROSALIND bioinformatics problem. Problem Title: Error Correction in Reads Rosalind ID: CORR Rosalind #: 034 URL: http://rosalind.info/problems/corr/ ''' from scripts import ReadFASTA, ReverseComplementDNA as RevComp, HammingDistance as Hamm # Group together identical DNA sequences, up to reverse complement. dna_groups = [] for dna in [fasta[1] for fasta in ReadFASTA('data/rosalind_corr.txt')]: in_group = False for index, group in enumerate(dna_groups): if dna in group or RevComp(dna) in group: dna_groups[index].append(dna) in_group = True break if not in_group: dna_groups.append([dna]) # Sort the DNA groups as either being a correct read in index 0, or incorrect read in index 1. dna_groups = [[], []] + dna_groups while len(dna_groups) > 2: if len(dna_groups[len(dna_groups) - 1]) > 1: # Convert to set to eliminate repeats. dna_groups[0].append(dna_groups.pop(len(dna_groups) - 1))
# Backtrack to start of the local alignment starting at the highest scoring cell. while backtrack[i][j] != 3 and i * j != 0: if backtrack[i][j] == 0: i -= 1 elif backtrack[i][j] == 1: j -= 1 elif backtrack[i][j] == 2: i -= 1 j -= 1 # Cut the strings at the ending point of the backtrack. v_aligned = v_aligned[i:] w_aligned = w_aligned[j:] return max_score, v_aligned, w_aligned if __name__ == '__main__': # Parse the two input protein strings. s, t = [fasta[1] for fasta in ReadFASTA('data/rosalind_loca.txt')] # Get the local alignment (given sigma = 5 in problem statement). alignment = local_alignment(s, t, PAM250(), 5) # Print and save the answer. print '\n'.join(alignment) with open('output/081_LOCA.txt', 'w') as output_data: output_data.write('\n'.join(alignment))
from scripts import ReadFASTA, ProteinDictDNA

# Record 0 holds the full string; every later record is an intron.
dna_list = ReadFASTA('../data/rosalind_splc.txt')
exon = dna_list[0][1]

# Remove the introns. Nothing is printed here so that stdout carries only
# the final answer.
for intron in dna_list[1:]:
    exon = exon.replace(intron[1], '')

# Translate the exons three characters at a time, leaving out every codon
# whose table entry is 'Stop'.
dna_dict = ProteinDictDNA()
residues = []
for index in range(0, len(exon), 3):
    residue = dna_dict[exon[index:index + 3]]
    if residue != 'Stop':
        residues.append(residue)
exon_protein = ''.join(residues)

print(exon_protein)
with open('../data/SPLC.txt', 'w') as output_data:
    output_data.write(exon_protein)
'''Extracts all substrings from the first string in a list, and sends longest substring candidates to be checked.''' longest = '' for start_index in xrange(len(string_list[0])): for end_index in xrange(len(string_list[0]), start_index, -1): # Break if the length becomes too small, as it will only get smaller. if end_index - start_index <= len(longest): break elif CheckSubstring(string_list[0][start_index:end_index], string_list): longest = string_list[0][start_index:end_index] return longest def CheckSubstring(find_string, string_list): 'Checks if a given substring appears in all members of a given collection of strings and returns True/False.' for string in string_list: if (len(string) < len(find_string)) or (find_string not in string): return False return True if __name__ == '__main__': fasta_list = ReadFASTA('data/rosalind_lcsm.txt') dna = [] for fasta in fasta_list: dna.append(fasta[1]) lcsm = LongestSubstring(dna) print lcsm with open('output/014_LCSM.txt', 'w') as output_data: output_data.write(lcsm)
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Finding a Spliced Motif
Rosalind ID: SSEQ
Rosalind #: 030
URL: http://rosalind.info/problems/sseq/
'''

from scripts import ReadFASTA

dna, sub_seq = [fasta[1] for fasta in ReadFASTA('data/rosalind_sseq.txt')]

# Greedily match each symbol of sub_seq to its first occurrence in dna at or
# after the current search position i.
sseq_indices, i = [], 0
for nucleotide in sub_seq:
    i = dna.find(nucleotide, i)
    if i == -1:
        # Fail with a clear message rather than running past the end of dna.
        raise ValueError('sub_seq does not occur as a subsequence of dna')
    # Use i+1 as the index because Rosalind positions start at 1, not 0.
    sseq_indices.append(str(i + 1))
    # Continue the search strictly after the position just used.
    i += 1

answer = ' '.join(sseq_indices)
print(answer)
with open('output/030_SSEQ.txt', 'w') as output_data:
    output_data.write(answer)
from scripts import ReadFASTA # from string import maketrans def reverseComplementDNA(dna): intable = "ATGC" outtable = "TACG" out = "".maketrans(intable, outtable) return dna.translate(out) dna_list = ReadFASTA('data/corr.txt') dna_dict = {} for dna in dna_list[1:]: dna_dict[dna[1]] = 0 dna_groups = [] for dna_tuple in dna_list: in_group = False dna = dna_tuple[1] complementDNA = reverseComplementDNA(dna[1]) for index, group in enumerate(dna_groups): if in_group print(dna_dict) print(reverseComplementDNA("AGGGGGA"))
#!/usr/bin/env python ''' A solution to a ROSALIND bioinformatics problem. Problem Title: Consensus and Profile Rosalind ID: SUBS Rosalind #: 009 URL: http://rosalind.info/problems/subs/ ''' from numpy import zeros from scripts import ReadFASTA # Data is in FASTA form dna_list = ReadFASTA('data/rosalind_cons.txt') # Setup an array and count into the array M = zeros((4, len(dna_list[0][1])), dtype=int) snp_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3} for dna in dna_list: for index, snp in enumerate(dna[1]): M[snp_dict[snp]][index] += 1 # Determine the consensus string consensus = '' to_snp = {0: 'A', 1: 'C', 2: 'G', 3: 'T'} for i in range(0, len(dna_list[0][1])): maxval = [-1, -1] for j in range(0, 4): if maxval[1] < M[j][i]: maxval = [j, M[j][i]]