for mass in c_counter: if mass not in t_counter: return False if c_counter[mass] > t_counter[mass]: return False return True def output_format(pep): masses = [] for amino_acid in pep: masses.append(peptide.mass_table[amino_acid]) return '-'.join(map(str, masses)) candidates = peptide.amino_acids winners = [] while candidates: candidates = branch(candidates) new_candidates = [] for candidate in candidates: c_spectrum = peptide.cyclic_spectrum(candidate) l_spectrum = peptide.linear_spectrum(candidate) if c_spectrum == spectrum: winners.append(candidate) elif consistent(l_spectrum, spectrum): new_candidates.append(candidate) candidates = new_candidates inout.output(' '.join(set(map(output_format, winners))))
# (sample dataset)
#     ATTCTGGA
#     CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC
#     3
# Sample Output
#
#     6 7 26 27 78

import inout  # my module for handling Rosalind's file I/O

# Input layout: line 0 = pattern, line 1 = text to scan, line 2 = the
# largest number of mismatching positions still accepted as a match.
pattern = inout.infilelines[0].strip()
sequence = inout.infilelines[1].strip()
d = int(inout.infilelines[2].strip())
patlen = len(pattern)


def mismatches(s1, s2):
    """Count the positions of s1 at which s2 holds a different character.

    Only the first len(s1) positions of s2 are inspected; s2 must be at
    least as long as s1, otherwise indexing raises IndexError.
    """
    return sum(1 for loc, char in enumerate(s1) if char != s2[loc])


# Slide a pattern-sized window over the text and keep every start offset
# whose window differs from the pattern in at most d positions.
matches = [
    start
    for start in range(len(sequence) - patlen + 1)
    if mismatches(pattern, sequence[start:start + patlen]) <= d
]

inout.output(" ".join(str(start) for start in matches))
# Edit Distance Problem: Find the edit distance between two strings.
#     Input: Two strings.
#     Output: The edit distance between these strings.
# Sample Input:
#     PLEASANTLY
#     MEANLY
# Sample Output:
#     5

import string

import inout
import common

first = inout.infilelines[0].strip()
second = inout.infilelines[1].strip()

# Plan: use the maximum alignment score algorithm from 76-3, setting matches
# as 0 and mismatches and indels at -1.  The edit distance is then the
# negation of that score.
scoring_matrix = common.mismatch_scoring_matrix(string.ascii_uppercase)
indel_penalty = -1

result = common.scored_longest_common_subsequence(
    scoring_matrix, indel_penalty, first, second)
# Only the score is needed here; the backtrack matrix is ignored.
longest, backtrack_matrix = result

inout.output(str(-longest))
#     3 2 4 0
#     3 2 4 2
#     0 7 3 3
#     3 3 0 2
#     1 3 2 2
# Sample Output:
#     34

import inout
import common

# Input layout: n, m, then n rows of the "down" matrix, a line holding a
# single '-', then n + 1 rows of the "right" matrix.
n = int(inout.infilelines[0].strip())
m = int(inout.infilelines[1].strip())

expected_lines = 4 + 2 * n
if len(inout.infilelines) != expected_lines:
    # Report the number of lines seen, not the whole input.
    raise Exception('Expected {} input lines based on n={}, saw {}'.format(
        expected_lines, n, len(inout.infilelines)))

# Validate the separator before parsing so a misplaced separator is reported
# as such rather than as a matrix parse failure.
separator_line = inout.infilelines[2 + n]
if separator_line.strip() != '-':
    # repr() is safe for any line length; ord() only accepts one character.
    raise Exception(
        'Expected - ({}) separating downmatrix from rightmatrix, saw {!r}'.
        format(ord('-'), separator_line))

down_rows = [line.strip() for line in inout.infilelines[2:2 + n]]
right_rows = [line.strip()
              for line in inout.infilelines[2 + n + 1:expected_lines]]

downmatrix = common.parse_matrix(down_rows, n, m + 1)
rightmatrix = common.parse_matrix(right_rows, n + 1, m)

longest = common.longest_path(n, m, downmatrix, rightmatrix)
inout.output(str(longest))
# Solve the Middle Edge in Linear Space Problem (for protein strings). Use the
# BLOSUM62 scoring matrix and a linear indel penalty equal to 5.
#     Input: Two amino acid strings.
#     Output: A middle edge in the alignment graph in the form (i, j) (k, l),
#             where (i, j) connects to (k, l).
# To compute scores, use the BLOSUM62 scoring matrix and a (linear) indel
# penalty equal to 5.
# Sample Input:
#     PLEASANTLY
#     MEASNLY
# Sample Output:
#     (4, 3) (5, 4)

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()

blosum_lines = inout.readlines('BLOSUM62.txt')
scoring_matrix = common.parse_scoring_matrix(blosum_lines)
indel_penalty = -5  # the penalty is passed as a negative score

# The helper returns four coordinates: (from_row, from_col) -> (to_row, to_col).
edge = common.alignment_middle_edge(scoring_matrix, indel_penalty, str1, str2)
from_row, from_col, to_row, to_col = edge

inout.output('({}, {}) ({}, {})'.format(from_row, from_col, to_row, to_col))
#     Input: The adjacency list of a directed graph that has an Eulerian path.
#     Output: An Eulerian path in this graph.
# Sample Input:
#     0 -> 2
#     1 -> 3
#     2 -> 1
#     3 -> 0,4
#     6 -> 3,7
#     7 -> 8
#     8 -> 9
#     9 -> 6
# Sample Output:
#     6->7->8->9->6->3->0->2->1->3->4

import inout
import common

# One adjacency-list entry per input line.
edge_strs = [line.strip() for line in inout.infilelines]

graph = common.parse_graph_edges(edge_strs)
path = common.find_eulerian_path(graph)

# path is joined directly, so its items must already be strings.
inout.output('->'.join(path))
# Spectral Convolution Problem: Compute the convolution of a spectrum.
#     Input: A collection of integers Spectrum.
#     Output: The list of elements in the convolution of Spectrum. If an
#             element has multiplicity k, it should appear exactly k times;
#             you may return the elements in any order.
# Sample Input:
#     0 137 186 323
# Sample Output:
#     137 137 186 186 323 49

import inout

spectrum = [int(token) for token in inout.infilelines[0].strip().split(' ')]

# Take every unordered pair of masses once and record the absolute
# difference, skipping pairs of equal mass (difference zero).
convolution = [
    abs(mass - later_mass)
    for position, mass in enumerate(spectrum)
    for later_mass in spectrum[position + 1:]
    if mass != later_mass
]

convolution.sort()
inout.output(' '.join(str(value) for value in convolution))
#     ACGTTGCATGTCGCATGATGCATGAGAGCT
#     4
# Sample Output
#
#     CATG GCAT

import inout  # my module for handling Rosalind's file I/O

sequence = inout.infilelines[0].strip()
k = int(inout.infilelines[1].strip())

kmer_counts = {}
# Starts empty so that a sequence shorter than k yields an empty output line
# instead of failing on an undefined name.
max_kmers = []
max_kmer_count = 0

for idx in range(len(sequence) - k + 1):
    kmer = sequence[idx:idx + k]
    count = kmer_counts.get(kmer, 0) + 1
    kmer_counts[kmer] = count
    if count > max_kmer_count:
        # New record: this k-mer alone holds the highest count seen so far.
        max_kmer_count = count
        max_kmers = [kmer]
    elif count == max_kmer_count:
        # Tie with the current record.  A k-mer reaches each count exactly
        # once, so it can never be appended twice.
        max_kmers.append(kmer)

inout.output(' '.join(max_kmers))
# Sample Input:
#     LEQN
# Sample Output:
#     0 113 114 128 129 227 242 242 257 355 356 370 371 484

import inout

peptide = inout.infilelines[0].strip()

# Each table line is "<amino acid> <integer mass>", separated by one space.
mass_table = {}
for entry in inout.readlines('integer_mass_table.txt'):
    symbol, weight = entry.strip().split(' ')
    mass_table[symbol] = int(weight)


def total_mass(peptide):
    """Return the sum of the integer masses of the residues in peptide."""
    return sum(mass_table[amino_acid] for amino_acid in peptide)


size = len(peptide)
doubled = peptide + peptide  # lets wrap-around subpeptides be plain slices

# Start with the empty peptide and the whole peptide, then add every
# subpeptide of length 1 .. size - 1 starting at each of the size positions.
spectrum = [0, total_mass(peptide)]
spectrum.extend(
    total_mass(doubled[start:start + length])
    for length in range(1, size)
    for start in range(size)
)

inout.output(' '.join(str(mass) for mass in sorted(spectrum)))
#     Input: A string Text and a collection of strings Patterns.
#     Output: All starting positions in Text where a string from Patterns
#             appears as a substring.
# Sample Input:
#     AATCGGGTTCAATCGGGGT
#     ATCG
#     GGGT
# Sample Output:
#     1 4 11 15

import inout
import common

# First line is the text; every remaining line is one pattern.
text = inout.infilelines[0].strip()
strings = [line.strip() for line in inout.infilelines[1:]]

trie = common.create_trie(strings)
matches = common.match_trie(trie, text)

inout.output(' '.join(str(position) for position in matches))
# n nodes, first label the root with 1 and then label the remaining nodes with
# the integers 2 through n in any order you like. Each edge of the adjacency
# list of Trie(Patterns) will be encoded by a triple: the first two members of
# the triple must be the integers labeling the initial and terminal nodes of
# the edge, respectively; the third member of the triple must be the symbol
# labeling the edge.
# Sample Input:
#     GGTA
#     CG
#     GGC
# Sample Output:
#     1 2 G
#     2 3 G
#     3 4 T
#     4 5 A
#     3 6 C
#     1 7 C
#     7 8 G

import inout
import common

# Every input line is one pattern.
strings = [line.strip() for line in inout.infilelines]

trie = common.create_trie(strings)
inout.output(common.output_trie(trie))
#     Input: An integer k and a string Text.
#     Output: DeBruijnk(Text).
# Sample Input:
#     4
#     AAGATTCTCTAC
# Sample Output:
#     AAG -> AGA
#     AGA -> GAT
#     ATT -> TTC
#     CTA -> TAC
#     CTC -> TCT
#     GAT -> ATT
#     TCT -> CTA,CTC
#     TTC -> TCT

import inout
import common

k = int(inout.infilelines[0].strip())
sequence = inout.infilelines[1].strip()

graph = common.debruijn_graph(common.all_kmers(sequence, k))

# The loop names are kept distinct from `k` so the parsed k-mer length is not
# overwritten, and items() is used because it exists on both Python 2 and 3.
# Entries are emitted in the graph's own iteration order.
graph_strs = [
    common.debruijn_to_str(node, successors)
    for node, successors in graph.items()
]

inout.output('\n'.join(graph_strs))
#     Input: Three DNA strings.
#     Output: The length of a longest common subsequence of these three
#             strings, followed by a multiple alignment of the three strings
#             corresponding to such an alignment.
# Sample Input:
#     ATATCCG
#     TCCGA
#     ATGTACTG
# Sample Output:
#     3
#     ATATCC-G-
#     ---TCC-GA
#     ATGTACTG-

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()
str3 = inout.infilelines[2].strip()

# NOTE(review): these two values are computed but never passed to align_3 or
# output_align_3 below; kept so the script's calls stay exactly the same.
scoring_matrix = common.mismatch_scoring_matrix_fitted("ACGT")
indel_penalty = -1

alignment = common.align_3(str1, str2, str3)
score, backtrack_matrix, best_x, best_y, best_z = alignment

aligned = common.output_align_3(
    backtrack_matrix, str1, str2, str3, best_x, best_y, best_z)
aligned1, aligned2, aligned3 = aligned

inout.output('{}\n{}\n{}\n{}'.format(score, aligned1, aligned2, aligned3))
# GREEDYMOTIFSEARCH with pseudocounts
# Sample Input:
#     3 5
#     GGCGTTCAGGCA
#     AAGAATCAGTCA
#     CAAGGAGTTCGC
#     CACGTCAATCAC
#     CAATAATATTCG
# Sample Output:
#     TTC
#     ATC
#     TTC
#     ATC
#     TTC

import inout
import common

# Header line holds "k t"; every following line is one DNA string.
header_fields = inout.infilelines[0].strip().split(' ')
k, t = [int(field) for field in header_fields]
sequences = [line.strip() for line in inout.infilelines[1:]]

best_motifs = common.greedy_motif_search_with_pseudocounts(sequences, k, t)
inout.output('\n'.join(best_motifs))
#     Input: A collection Patterns of k-mers.
#     Output: The overlap graph Overlap(Patterns), in the form of an
#             adjacency list.
# Sample Input:
#     ATGCG
#     GCATG
#     CATGC
#     AGGCA
#     GGCAT
# Sample Output:
#     AGGCA -> GGCAT
#     CATGC -> ATGCG
#     GCATG -> CATGC
#     GGCAT -> GCATG

import inout
import common

# Every input line is one k-mer.
sequences = [line.strip() for line in inout.infilelines]

overlaps = common.overlap_graph(sequences)
# Each item yielded by overlap_graph is rendered to one output line.
overlap_lines = [common.overlap_to_str(overlap) for overlap in overlaps]

inout.output('\n'.join(overlap_lines))
# an edge connects node 0 to node 1 with weight 7.
#     Output: The length of a longest path in the graph, followed by a
#             longest path.
# Sample Input:
#     0
#     4
#     0->1:7
#     0->2:4
#     2->3:2
#     1->4:1
#     3->4:3
# Sample Output:
#     9
#     0->2->3->4

import inout
import common

# Lines 0 and 1 name the source and sink (kept as strings); the rest are
# weighted edges.
source = inout.infilelines[0].strip()
sink = inout.infilelines[1].strip()
edges = [line.strip() for line in inout.infilelines[2:]]

dag = common.parse_dag_edges(edges)
ordering = common.wikipedia_depth_first_topological_sort(dag, sink)

weight, backtrack = common.longest_dag_weight(dag, ordering, source, sink)
path = common.output_longest_dag_path(backtrack, source, sink)

inout.output('{}\n{}'.format(weight, path))
# alignment of the three strings corresponding to such an alignment.
# Sample Input:
#     ATATCCG
#     TCCGA
#     ATGTACTG
# Sample Output:
#     3
#     ATATCC-G-
#     ---TCC-GA
#     ATGTACTG-

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()
str3 = inout.infilelines[2].strip()

# NOTE(review): these two values are computed but never passed to align_3 or
# output_align_3 below; kept so the script's calls stay exactly the same.
scoring_matrix = common.mismatch_scoring_matrix_fitted("ACGT")
indel_penalty = -1

alignment = common.align_3(str1, str2, str3)
score, backtrack_matrix, best_x, best_y, best_z = alignment

aligned = common.output_align_3(
    backtrack_matrix, str1, str2, str3, best_x, best_y, best_z)
aligned1, aligned2, aligned3 = aligned

inout.output('{}\n{}\n{}\n{}'.format(score, aligned1, aligned2, aligned3))
# Implement LINEARSPACEALIGNMENT to solve the Global Alignment Problem for a
# large dataset.
#     Input: Two long (10000 amino acid) protein strings written in the
#            single-letter amino acid alphabet.
#     Output: The maximum alignment score of these strings, followed by an
#             alignment achieving this maximum score. Use the BLOSUM62
#             scoring matrix and indel penalty sigma = 5.
# Sample Input:
#     PLEASANTLY
#     MEANLY
# Sample Output:
#     8
#     PLEASANTLY
#     -MEA--N-LY

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()

blosum_lines = inout.readlines('BLOSUM62.txt')
scoring_matrix = common.parse_scoring_matrix(blosum_lines)
indel_penalty = -5  # the penalty is passed as a negative score

result = common.linear_space_alignment(
    scoring_matrix, indel_penalty, str1, str2)
score, alignment1, alignment2 = result

inout.output('{}\n{}\n{}'.format(str(score), alignment1, alignment2))
#     Input: An integer money and an array coins = (coin1, ..., coind).
#     Output: The minimum number of coins with denominations coins that
#             changes money.
# Sample Input:
#     40
#     50,25,20,10,5,1
# Sample Output:
#     2

import inout
import common

change = int(inout.infilelines[0].strip())

# Denominations arrive comma-separated on the second line.
coin_fields = inout.infilelines[1].strip().split(',')
coins = [int(field) for field in coin_fields]

numcoins = common.make_change(change, coins)
inout.output(str(numcoins))
#     Input: A string BWT(Text), followed by a collection of Patterns.
#     Output: A list of integers, where the i-th integer corresponds to the
#             number of substring matches of the i-th member of Patterns in
#             Text.
# Sample Input:
#     TCCTCTATGAGATCCTATTCTATGAAACCTTCA$GACCAAAATTCTCCGGC
#     CCT CAC GAG CAG ATC
# Sample Output:
#     2 1 1 0 1

import inout
import common

bwt_text = inout.infilelines[0].strip()
patterns = inout.infilelines[1].strip().split(' ')

# One result per pattern, in input order, separated by single spaces.
counts = ' '.join(
    str(common.bwt_matching(bwt_text, pattern)) for pattern in patterns
)

inout.output(counts.strip())
# Protein Translation Problem: Translate an RNA string into an amino acid
# string.
# Sample Input:
#     AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA
# Sample Output:
#     MAMAPRTEINSTRING

import inout
import codon

rna = inout.infilelines[0].strip()

# NOTE(review): the helper is named `transcribe` but is used here to produce
# the amino acid string -- confirm against the codon module.
protein = codon.transcribe(rna)

inout.output(protein)
r = [] for m_kmer in enumerate_mismatches(kmer, maxdist - 1): for loc in range(k): for base in ['A', 'C', 'G', 'T']: new_kmer = m_kmer[:loc] + base + m_kmer[loc + 1:] r.append(new_kmer) return set(r) kmer_counts = {} max_kmers = [] max_kmer_count = 0 for idx in range(len(sequence) - k + 1): kmer = sequence[idx:idx + k] m_kmers = list(enumerate_mismatches(kmer, d)) m_kmers.extend(list(enumerate_mismatches(reverse_complement(kmer), d))) for m_kmer in m_kmers: if m_kmer in kmer_counts: count = kmer_counts[m_kmer] + 1 else: count = 1 kmer_counts[m_kmer] = count if count > max_kmer_count: max_kmer_count = count max_kmers = [m_kmer] elif count == max_kmer_count: max_kmers.append(m_kmer) inout.output(' '.join(max_kmers))
# Inverse Burrows-Wheeler Transform Problem: Reconstruct a string from its
# Burrows-Wheeler transform.
#     Input: A string Transform (with a single "$" symbol).
#     Output: The string Text such that BWT(Text) = Transform.
# Sample Input:
#     TTCCTAACG$A
# Sample Output:
#     TACATCACGT$

import inout
import common

transform = inout.infilelines[0].strip()
inout.output(common.inv_bwt(transform))
return survivors # I'm sure there's a better way to do this but I don't know enough Python yet def mklist(item): return [item] candidates = map(mklist,amino_acids) winner = '' winner_score = 0 while candidates: candidates = branch(candidates) new_candidates = [] for candidate in candidates: c_mass = sum(candidate) t_mass = max(spectrum) # if the mass of the candidate peptide equals the mass of the target peptide if c_mass == t_mass: new_candidates.append(candidate) c_score = score(candidate, spectrum) if c_score > winner_score: winner = candidate winner_score = c_score elif c_mass < t_mass: new_candidates.append(candidate) # else: the candidate mass is too large, so it does not go on to the next round candidates = cut(new_candidates, spectrum, N) inout.output('-'.join(map(str,winner)))
#     Input: Two strings s and t.
#     Output: A longest common subsequence of s and t.
#
# Note: If more than one LCS exists, you may return any one.
# Sample Input:
#     AACCTTGG
#     ACACTGTGA
# Sample Output:
#     AACTGG

import sys

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()

# output_longest_common_subsequence was hitting the default limit of 1000 for
# the test dataset
# https://class.coursera.org/bioinformatics-001/forum/thread?thread_id=742
sys.setrecursionlimit(2000)

longest, backtrack_matrix = common.longest_common_subsequence(str1, str2)

# The backtrack is started from the cell (len(str1), len(str2)).
rows, cols = len(str1), len(str2)
lcs = common.output_longest_common_subsequence(
    backtrack_matrix, str1, rows, cols)

inout.output(lcs)
#     Input: The adjacency list of a directed graph that has an Eulerian path.
#     Output: An Eulerian path in this graph.
# Sample Input:
#     CTT -> TTA
#     ACC -> CCA
#     TAC -> ACC
#     GGC -> GCT
#     GCT -> CTT
#     TTA -> TAC
# Sample Output:
#     GGCTTACCA

import inout
import common

# One adjacency-list entry per input line.
edge_strs = [line.strip() for line in inout.infilelines]

graph = common.parse_graph_edges(edge_strs)
path = common.find_eulerian_path(graph)

# The path is handed to assemble_path, whose result is written out as-is.
inout.output(common.assemble_path(path))
# Constructing Suffix Array Problem: Construct the suffix array of a string.
#     Input: A string Text.
#     Output: SuffixArray(Text).
# Sample Input:
#     AACGATAGCGGTAGA$
# Sample Output:
#     15, 14, 0, 1, 12, 6, 4, 2, 8, 13, 3, 7, 9, 10, 11, 5

import inout
import common

text = inout.infilelines[0].strip()

# create_suffix_array returns a mapping; only its values are written out.
# presumably the mapping iterates in sorted-suffix order -- verify in common.
array = common.create_suffix_array(text)
positions = array.values()

inout.output(', '.join(str(position) for position in positions))
#     Input: A collection Patterns of k-mers.
#     Output: The overlap graph Overlap(Patterns), in the form of an
#             adjacency list.
# Sample Input:
#     ATGCG
#     GCATG
#     CATGC
#     AGGCA
#     GGCAT
# Sample Output:
#     AGGCA -> GGCAT
#     CATGC -> ATGCG
#     GCATG -> CATGC
#     GGCAT -> GCATG

import inout
import common

# Every input line is one k-mer.
sequences = [line.strip() for line in inout.infilelines]

overlaps = common.overlap_graph(sequences)
# Each item yielded by overlap_graph is rendered to one output line.
overlap_lines = [common.overlap_to_str(overlap) for overlap in overlaps]

inout.output('\n'.join(overlap_lines))
# output BestMotifs
#     Input: Integers k and t, followed by a collection of strings Dna.
#     Output: A collection of strings BestMotifs resulting from applying
#             GREEDYMOTIFSEARCH(Dna,k,t). If at any step you find more than
#             one Profile-most probable k-mer in a given string, use the one
#             occurring first.
# Sample Input:
#     3 5
#     GGCGTTCAGGCA
#     AAGAATCAGTCA
#     CAAGGAGTTCGC
#     CACGTCAATCAC
#     CAATAATATTCG
# Sample Output:
#     CAG
#     CAG
#     CAA
#     CAA
#     CAA

import inout
import common

# Header line holds "k t"; every following line is one DNA string.
header_fields = inout.infilelines[0].strip().split(' ')
k, t = [int(field) for field in header_fields]
sequences = [line.strip() for line in inout.infilelines[1:]]

best_motifs = common.greedy_motif_search(sequences, k, t)
inout.output('\n'.join(best_motifs))
# Given two strings, find all their shared k-mers.
#     Input: An integer k and two strings.
#     Output: All k-mers shared by these strings, in the form of ordered
#             pairs (x, y).
# Sample Input:
#     3
#     AAACTCATC
#     TTTCAAATC
# Sample Output:
#     (0, 4)
#     (0, 0)
#     (4, 2)
#     (6, 6)

import inout
import common

k = int(inout.infilelines[0].strip())
str1, str2 = [line.strip() for line in inout.infilelines[1:3]]

result = common.shared_kmers(k, str1, str2)


def output_one_pair(pair):
    """Format the first two items of pair as "(x, y)"."""
    x, y = pair[0], pair[1]
    return '({}, {})'.format(x, y)


inout.output('\n'.join(output_one_pair(pair) for pair in result))
return [kmer] else: r = [] for m_kmer in enumerate_mismatches(kmer, maxdist - 1): for loc in range(k): for base in ['A', 'C', 'G', 'T']: new_kmer = '{}{}{}'.format(m_kmer[:loc], base, m_kmer[loc + 1:]) r.append(new_kmer) return set(r) kmer_counts = {} max_kmers = [] max_kmer_count = 0 for idx in range(len(sequence) - k + 1): kmer = sequence[idx:idx+k] for m_kmer in enumerate_mismatches(kmer, d): if m_kmer in kmer_counts: count = kmer_counts[m_kmer] + 1 else: count = 1 kmer_counts[m_kmer] = count if count > max_kmer_count: max_kmer_count = count max_kmers = [m_kmer] elif count == max_kmer_count: max_kmers.append(m_kmer) inout.output(' '.join(max_kmers))
# Longest Repeat Problem: Find the longest repeat in a string.
#     Input: A string Text.
#     Output: A longest repeat in Text, i.e., a longest substring of Text
#             that appears in Text more than once.
# Sample Input:
#     ATATCGTTTTATCGTT
# Sample Output:
#     TATCGTT

import inout
import common

text = inout.infilelines[0].strip()

# NOTE(review): the meaning of the literal arguments (100 here, 1 and ''
# below) is defined by the helpers in common -- confirm there.
trie = common.create_suffix_trie(text, 100)
longest_repeat = common.find_longest_substring_in_suffix_trie(trie, 1, '')

inout.output(longest_repeat)
# an edge connects node 0 to node 1 with weight 7.
#     Output: The length of a longest path in the graph, followed by a
#             longest path.
# Sample Input:
#     0
#     4
#     0->1:7
#     0->2:4
#     2->3:2
#     1->4:1
#     3->4:3
# Sample Output:
#     9
#     0->2->3->4

import inout
import common

# Lines 0 and 1 name the source and sink (kept as strings); the rest are
# weighted edges.
source = inout.infilelines[0].strip()
sink = inout.infilelines[1].strip()
edges = [line.strip() for line in inout.infilelines[2:]]

dag = common.parse_dag_edges(edges)
ordering = common.wikipedia_depth_first_topological_sort(dag, sink)

weight, backtrack = common.longest_dag_weight(dag, ordering, source, sink)
path = common.output_longest_dag_path(backtrack, source, sink)

inout.output('{}\n{}'.format(weight, path))
import inout

# Header line holds "k d": the motif length and the number of mismatches
# allowed.  Every following non-blank line is one DNA string.
k, d = map(int, inout.infilelines[0].strip().split(' '))
# Blank lines (e.g. a trailing newline in the input file) are dropped so they
# cannot empty the intersection computed below.
dna_lines = [line.strip() for line in inout.infilelines[1:] if line.strip()]


def enumerate_mismatches(kmer, maxdist):
    """Return the set of strings obtained from kmer by at most maxdist
    single-position substitutions over the alphabet A, C, G, T.

    kmer itself is always included.  A set is returned for every maxdist so
    callers see one consistent type.
    """
    if maxdist == 0:
        return {kmer}
    neighbours = set()
    for m_kmer in enumerate_mismatches(kmer, maxdist - 1):
        # Use the k-mer's own length rather than the module-level k so the
        # function is correct for any input string.
        for loc in range(len(kmer)):
            for base in 'ACGT':
                neighbours.add(m_kmer[:loc] + base + m_kmer[loc + 1:])
    return neighbours


def motifs(sequence, k, d):
    """Return the set of every string within d substitutions of some
    length-k window of sequence."""
    found = set()
    for idx in range(len(sequence) - k + 1):
        found.update(enumerate_mismatches(sequence[idx:idx + k], d))
    return found


# A candidate is kept only if it occurs, with at most d mismatches, in every
# DNA string.  Working with sets from the start keeps the output free of
# duplicates even when there is just one DNA string.
common_motifs = motifs(dna_lines[0], k, d) if dna_lines else set()
for line in dna_lines[1:]:
    common_motifs &= motifs(line, k, d)

inout.output(' '.join(common_motifs))
# Reverse Complement Problem
#
# Reverse complement a nucleotide pattern.
#
# Given: A DNA string Pattern.
#
# Return: Pattern, the reverse complement of Pattern.
# Sample Dataset
#
#     AAAACCCGGT
# Sample Output
#
#     ACCGGGTTTT

import inout  # my module for handling Rosalind's file I/O

# Base pairing used for complementing; any other character raises KeyError.
complement = {
    'A': 'T',
    'T': 'A',
    'G': 'C',
    'C': 'G',
}

sequence = inout.infilelines[0].strip()

# Walk the strand back to front, swapping each base for its partner.
output = ''.join(complement[base] for base in reversed(sequence))

inout.output(output)
# Spectral Convolution Problem: Compute the convolution of a spectrum.
#     Input: A collection of integers Spectrum.
#     Output: The list of elements in the convolution of Spectrum. If an
#             element has multiplicity k, it should appear exactly k times;
#             you may return the elements in any order.
# Sample Input:
#     0 137 186 323
# Sample Output:
#     137 137 186 186 323 49

import inout

spectrum = [int(token) for token in inout.infilelines[0].strip().split(' ')]

# Take every unordered pair of masses once and record the absolute
# difference, skipping pairs of equal mass (difference zero).
convolution = [
    abs(mass - later_mass)
    for position, mass in enumerate(spectrum)
    for later_mass in spectrum[position + 1:]
    if mass != later_mass
]

convolution.sort()
inout.output(' '.join(str(value) for value in convolution))
#     Input: Two strings v and w, each of length at most 1000.
#     Output: The score of an optimal overlap alignment of v and w, followed
#             by an alignment of a suffix v' of v and a prefix w' of w
#             achieving this maximum score. Use an alignment score in which
#             matches count +1 and both the mismatch and indel penalties
#             are 2.
# Sample Input:
#     PAWHEAE
#     HEAGAWGHEE
# Sample Output:
#     1
#     HEAE
#     HEAG

import string

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()

scoring_matrix = common.mismatch_scoring_matrix_overlap(string.ascii_uppercase)
indel_penalty = -2  # the penalty is passed as a negative score

result = common.scored_longest_common_subsequence_overlap(
    scoring_matrix, indel_penalty, str1, str2)
longest, backtrack_matrix, best_row, best_col = result

# The backtrack starts from the (best_row, best_col) cell reported above.
aligned1, aligned2 = common.output_longest_common_subsequence_local(
    backtrack_matrix, str1, str2, best_row, best_col)

inout.output('{}\n{}\n{}'.format(longest, aligned1, aligned2))
# Return: All integer(s) i minimizing Skew(Prefixi (Text)) over all values of
# i (from 0 to |Genome|).
# Sample Dataset
#
#     CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG
# Sample Output
#
#     53 97

import inout  # my module for handling Rosalind's file I/O

sequence = inout.infilelines[0].strip()

# The empty prefix (i = 0) has skew 0 and is itself a candidate, so the search
# starts from it instead of from an arbitrary large sentinel.
skew = 0
min_skew = 0
all_min_skew_loc = [0]

# skew_loc is the prefix length after consuming `base`.
for skew_loc, base in enumerate(sequence, 1):
    if base == 'C':
        skew -= 1
    elif base == 'G':
        skew += 1

    if skew < min_skew:
        # Strictly lower skew: discard every earlier candidate.
        min_skew = skew
        all_min_skew_loc = [skew_loc]
    elif skew == min_skew:
        all_min_skew_loc.append(skew_loc)

inout.output(" ".join(map(str, all_min_skew_loc)))
def output_format(pep):
    """Render pep as the '-'-joined masses of its residues, each mass looked
    up in peptide.mass_table."""
    return '-'.join(str(peptide.mass_table[amino_acid]) for amino_acid in pep)


# Search loop: grow every surviving candidate with branch(), keep those whose
# total mass does not exceed the last entry of the spectrum, remember the
# best-scoring candidate of exactly that mass, then prune with cut().
candidates = peptide.amino_acids
winner = ''
winner_score = 0

while candidates:
    candidates = branch(candidates)
    new_candidates = []
    for candidate in candidates:
        c_mass = peptide.total_mass(candidate)
        t_mass = spectrum[-1]  # target mass: the last entry of the spectrum
        if c_mass == t_mass:
            # Right total mass: stays in play and competes for best score.
            # Only a strictly higher score replaces the current winner.
            new_candidates.append(candidate)
            c_score = score(candidate, spectrum)
            if c_score > winner_score:
                winner, winner_score = candidate, c_score
        elif c_mass < t_mass:
            # Still too light: keep it so it can be extended next round.
            new_candidates.append(candidate)
        # Otherwise the candidate is too heavy and is dropped.
    candidates = cut(new_candidates, spectrum, N)

inout.output(output_format(winner))
#     Input: An integer k and a string Text.
#     Output: DeBruijnk(Text).
# Sample Input:
#     4
#     AAGATTCTCTAC
# Sample Output:
#     AAG -> AGA
#     AGA -> GAT
#     ATT -> TTC
#     CTA -> TAC
#     CTC -> TCT
#     GAT -> ATT
#     TCT -> CTA,CTC
#     TTC -> TCT

import inout
import common

k = int(inout.infilelines[0].strip())
sequence = inout.infilelines[1].strip()

graph = common.debruijn_graph(common.all_kmers(sequence, k))

# The loop names are kept distinct from `k` so the parsed k-mer length is not
# overwritten, and items() is used because it exists on both Python 2 and 3.
# Entries are emitted in the graph's own iteration order.
graph_strs = [
    common.debruijn_to_str(node, successors)
    for node, successors in graph.items()
]

inout.output('\n'.join(graph_strs))
# Number of Breakpoints Problem: Find the number of breakpoints in a
# permutation.
#     Input: A permutation P.
#     Output: The number of breakpoints in P.
# Sample Input:
#     (+3 +4 +5 -12 -8 -7 -6 +1 +2 +10 +9 -11 +13 +14)
# Sample Output:
#     8

import inout
import common

# The permutation is on the first input line in the "(+a -b ...)" notation
# shown above; parsing is delegated to the shared helper.
permutation_text = inout.infilelines[0].strip()
permutation = common.greedysorting_parse(permutation_text)

breakpoints = common.count_breakpoints(permutation)
inout.output(str(breakpoints))
#     Input: A permutation P.
#     Output: The sequence of permutations corresponding to applying
#             GREEDYSORTING to P, ending with the identity permutation.
# Sample Input:
#     (-3 +4 +1 +5 -2)
# Sample Output:
#     (-1 -4 +3 +5 -2)
#     (+1 -4 +3 +5 -2)
#     (+1 +2 -5 -3 +4)
#     (+1 +2 +3 +5 +4)
#     (+1 +2 +3 -4 -5)
#     (+1 +2 +3 +4 -5)
#     (+1 +2 +3 +4 +5)

import inout
import common

permutation_text = inout.infilelines[0].strip()
permutation = common.greedysorting_parse(permutation_text)

# greedysorting's result is passed through greedysorting_out to obtain the
# text that is written.
steps = common.greedysorting(permutation)
inout.output(common.greedysorting_out(steps))
import inout  # module for handling Rosalind's file I/O

sequence = inout.infilelines[0].strip()

# Count each nucleotide directly so that a base missing from the input is
# reported as 0 instead of raising KeyError.  Output order is A, C, G, T.
counts = [sequence.count(base) for base in 'ACGT']

inout.output(' '.join(map(str, counts)))
#     Output: The score of an optimal overlap alignment of v and w, followed
#             by an alignment of a suffix v' of v and a prefix w' of w
#             achieving this maximum score. Use an alignment score in which
#             matches count +1 and both the mismatch and indel penalties
#             are 2.
# Sample Input:
#     PAWHEAE
#     HEAGAWGHEE
# Sample Output:
#     1
#     HEAE
#     HEAG

import string

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()

scoring_matrix = common.mismatch_scoring_matrix_overlap(string.ascii_uppercase)
indel_penalty = -2  # the penalty is passed as a negative score

result = common.scored_longest_common_subsequence_overlap(
    scoring_matrix, indel_penalty, str1, str2)
longest, backtrack_matrix, best_row, best_col = result

# The backtrack starts from the (best_row, best_col) cell reported above.
aligned1, aligned2 = common.output_longest_common_subsequence_local(
    backtrack_matrix, str1, str2, best_row, best_col)

inout.output('{}\n{}\n{}'.format(longest, aligned1, aligned2))
#     Input: An integer k and a string Text.
#     Output: Compositionk(Text), where the k-mers are written in
#             lexicographic order.
# Sample Input:
#     5
#     CAATCCAAC
# Sample Output:
#     AATCC
#     ATCCA
#     CAATC
#     CCAAC
#     TCCAA

import inout
import common

k = int(inout.infilelines[0].strip())
sequence = inout.infilelines[1].strip()

# Sorting the k-mers gives the lexicographic order the output requires.
kmers = list(common.all_kmers(sequence, k))
kmers.sort()

inout.output('\n'.join(kmers))
# Protein Translation Problem: Translate an RNA string into an amino acid
# string.
# Sample Input:
#     AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA
# Sample Output:
#     MAMAPRTEINSTRING

import inout
import codon

rna = inout.infilelines[0].strip()

# NOTE(review): the helper is named `transcribe` but is used here to produce
# the amino acid string -- confirm against the codon module.
protein = codon.transcribe(rna)

inout.output(protein)
# Implement LINEARSPACEALIGNMENT to solve the Global Alignment Problem for a
# large dataset.
#     Input: Two long (10000 amino acid) protein strings written in the
#            single-letter amino acid alphabet.
#     Output: The maximum alignment score of these strings, followed by an
#             alignment achieving this maximum score. Use the BLOSUM62
#             scoring matrix and indel penalty sigma = 5.
# Sample Input:
#     PLEASANTLY
#     MEANLY
# Sample Output:
#     8
#     PLEASANTLY
#     -MEA--N-LY

import inout
import common

str1 = inout.infilelines[0].strip()
str2 = inout.infilelines[1].strip()

blosum_lines = inout.readlines('BLOSUM62.txt')
scoring_matrix = common.parse_scoring_matrix(blosum_lines)
indel_penalty = -5  # the penalty is passed as a negative score

result = common.linear_space_alignment(
    scoring_matrix, indel_penalty, str1, str2)
score, alignment1, alignment2 = result

inout.output('{}\n{}\n{}'.format(str(score), alignment1, alignment2))
import inout  # module for handling Rosalind's file I/O

# Base pairing used for complementing; any other character raises KeyError.
complements = {
    "A": "T",
    "T": "A",
    "C": "G",
    "G": "C",
}

sequence = inout.infilelines[0].strip()

# Read the strand back to front and swap each base for its partner.
rc = ''.join(complements[base] for base in reversed(sequence))

inout.output(rc)