for mass in c_counter:
        if mass not in t_counter:
            return False
        if c_counter[mass] > t_counter[mass]:
            return False
    return True


def output_format(pep):
    """Render a peptide as its residue masses joined by '-'.

    Each element of ``pep`` is looked up in ``peptide.mass_table`` and the
    resulting masses are converted to text.
    """
    return '-'.join(str(peptide.mass_table[residue]) for residue in pep)


# Branch-and-bound search: every round expands the candidates with branch(),
# records those whose cyclic spectrum equals the target spectrum, and keeps
# only candidates whose linear spectrum is still consistent with it.
winners = []
candidates = peptide.amino_acids
while candidates:
    survivors = []
    for candidate in branch(candidates):
        cyclic = peptide.cyclic_spectrum(candidate)
        linear = peptide.linear_spectrum(candidate)
        if cyclic == spectrum:
            winners.append(candidate)
            continue
        if consistent(linear, spectrum):
            survivors.append(candidate)
    candidates = survivors

# A set removes winners that format to the same mass string.
formatted = set(map(output_format, winners))
inout.output(' '.join(formatted))
Ejemplo n.º 2
0
# ATTCTGGA
# CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC
# 3

# Sample Output
#
# 6 7 26 27 78

import inout  # my module for handling Rosalind's file I/O
# Input lines: the pattern, the text to scan, and the mismatch budget d.
pattern, sequence = (inout.infilelines[idx].strip() for idx in (0, 1))
d = int(inout.infilelines[2].strip())

patlen = len(pattern)


def mismatches(s1, s2):
    """Count the positions of ``s1`` at which ``s2`` holds a different item.

    Only the first ``len(s1)`` positions are compared. ``s2`` must be at
    least as long as ``s1``; otherwise indexing ``s2`` raises IndexError.
    """
    return sum(1 for idx, symbol in enumerate(s1) if symbol != s2[idx])


# Keep every window start whose mismatch count against the pattern is <= d.
last_start = len(sequence) - patlen
matches = [
    start for start in range(last_start + 1)
    if mismatches(pattern, sequence[start:start + patlen]) <= d
]

inout.output(" ".join(str(start) for start in matches))
Ejemplo n.º 3
0
# Edit Distance Problem: Find the edit distance between two strings.
# Input: Two strings.
# Output: The edit distance between these strings.

# Sample Input:
# PLEASANTLY
# MEANLY

# Sample Output:
# 5

import inout
import common

str1, str2 = (inout.infilelines[idx].strip() for idx in (0, 1))

# Plan: reuse the maximum alignment score algorithm from 76-3 with matches
# scoring 0 and mismatches/indels scoring -1; the edit distance is then the
# negated score.
import string
indel_penalty = -1
scoring_matrix = common.mismatch_scoring_matrix(string.ascii_uppercase)

alignment = common.scored_longest_common_subsequence(
    scoring_matrix, indel_penalty, str1, str2)
longest, backtrack_matrix = alignment

inout.output(str(-longest))
Ejemplo n.º 4
0
# 3 2 4 0
# 3 2 4 2
# 0 7 3 3
# 3 3 0 2
# 1 3 2 2

# Sample Output:
# 34

import inout
import common

n = int(inout.infilelines[0].strip())
m = int(inout.infilelines[1].strip())

# Layout: n, m, n rows for the down matrix, one '-' separator line, then
# n + 1 rows for the right matrix, i.e. 4 + 2 * n lines in total.
expected_line_count = 4 + 2 * n
if len(inout.infilelines) != expected_line_count:
    # Report the number of lines seen rather than dumping the lines.
    raise Exception('Expected {} input lines based on n={}, saw {}'.format(
        expected_line_count, n, len(inout.infilelines)))
downmatrix = common.parse_matrix(map(str.strip, inout.infilelines[2:2 + n]), n,
                                 m + 1)
rightmatrix = common.parse_matrix(
    map(str.strip, inout.infilelines[2 + n + 1:4 + 2 * n]), n + 1, m)
separator_line = inout.infilelines[2 + n]
if separator_line.strip() != '-':
    # ord() accepts exactly one character, so calling it on the whole line
    # raised TypeError and hid this message; list the code points instead.
    raise Exception(
        'Expected - ({}) separating downmatrix from rightmatrix, saw {!r} ({})'.
        format(ord('-'), separator_line,
               [ord(char) for char in separator_line]))

longest = common.longest_path(n, m, downmatrix, rightmatrix)

inout.output(str(longest))
# Solve the Middle Edge in Linear Space Problem (for protein strings). Use the BLOSUM62 scoring matrix and a linear indel penalty equal to 5.
# Input: Two amino acid strings.
# Output: A middle edge in the alignment graph in the form (i, j) (k, l), where (i, j) connects to (k, l).
# To compute scores, use the BLOSUM62 scoring matrix and a (linear) indel penalty equal to 5.

# Sample Input:
# PLEASANTLY
# MEASNLY

# Sample Output:
# (4, 3) (5, 4)

import inout
import common

str1, str2 = (inout.infilelines[idx].strip() for idx in (0, 1))

# Scores come from the BLOSUM62 file; every indel costs 5.
indel_penalty = -5
scoring_matrix = common.parse_scoring_matrix(inout.readlines('BLOSUM62.txt'))

middle_edge = common.alignment_middle_edge(
    scoring_matrix, indel_penalty, str1, str2)
from_row, from_col, to_row, to_col = middle_edge
inout.output('({}, {}) ({}, {})'.format(from_row, from_col, to_row, to_col))
Ejemplo n.º 6
0
# Input: The adjacency list of a directed graph that has an Eulerian path.
# Output: An Eulerian path in this graph.

# Sample Input:
#      0 -> 2
#      1 -> 3
#      2 -> 1
#      3 -> 0,4
#      6 -> 3,7
#      7 -> 8
#      8 -> 9
#      9 -> 6

# Sample Output:
#      6->7->8->9->6->3->0->2->1->3->4

import inout
import common

# Parse the adjacency list, find an Eulerian path and print it with arrows.
edge_strs = map(str.strip, inout.infilelines)
path = common.find_eulerian_path(common.parse_graph_edges(edge_strs))

inout.output('->'.join(path))
# Spectral Convolution Problem: Compute the convolution of a spectrum.
#     Input: A collection of integers Spectrum.
#     Output: The list of elements in the convolution of Spectrum. If an element has multiplicity k, it should
#     appear exactly k times; you may return the elements in any order.

# Sample Input:
#     0 137 186 323

# Sample Output:
#     137 137 186 186 323 49

import inout

spectrum = [int(token) for token in inout.infilelines[0].strip().split(' ')]

# For every unordered pair of peaks with different masses, record the
# absolute difference (zero differences are dropped).
convolution = []
for idx, mass in enumerate(spectrum):
    for other in spectrum[idx + 1:]:
        if mass != other:
            convolution.append(abs(mass - other))

inout.output(' '.join(str(value) for value in sorted(convolution)))
Ejemplo n.º 8
0
# ACGTTGCATGTCGCATGATGCATGAGAGCT
# 4

# Sample Output
#
# CATG GCAT

import inout 	# my module for handling Rosalind's file I/O

sequence = inout.infilelines[0].strip()
k = int(inout.infilelines[1].strip())

kmer_counts = {}
max_kmer_count = 0
# Start with an empty list so that a text shorter than k produces empty
# output instead of a NameError at the final output call.
max_kmers = []
for idx in range(len(sequence) - k + 1):
    kmer = sequence[idx:idx + k]

    count = kmer_counts.get(kmer, 0) + 1
    kmer_counts[kmer] = count

    if count > max_kmer_count:
        # New best count: this k-mer is the only leader so far.
        max_kmer_count = count
        max_kmers = [kmer]
    elif count == max_kmer_count:
        # Ties with the current best are appended in order of occurrence.
        max_kmers.append(kmer)

inout.output(" ".join(max_kmers))
Ejemplo n.º 9
0
# Sample Input:
#     LEQN

# Sample Output:
#     0 113 114 128 129 227 242 242 257 355 356 370 371 484

import inout

peptide = inout.infilelines[0].strip()

# Build the lookup from 'integer_mass_table.txt'; each line must split on a
# single space into exactly two fields: the key and its integer mass.
mass_table = {}
for entry in inout.readlines('integer_mass_table.txt'):
    fields = entry.strip().split(' ')
    symbol, mass_text = fields
    mass_table[symbol] = int(mass_text)

def total_mass(peptide):
    """Return the sum of ``mass_table`` entries for each item of ``peptide``.

    An empty ``peptide`` yields 0.
    """
    return sum(mass_table[residue] for residue in peptide)
	
# Seed the spectrum with 0 and the mass of the whole peptide.
spectrum = [0, total_mass(peptide)]

# Doubling the string lets a plain slice wrap around the end of the cycle.
peptide_2 = peptide + peptide
peptide_length = len(peptide)
for length in range(1, peptide_length):
    spectrum.extend(
        total_mass(peptide_2[start:start + length])
        for start in range(peptide_length))

inout.output(' '.join(str(mass) for mass in sorted(spectrum)))
Ejemplo n.º 10
0
# Input: A string Text and a collection of strings Patterns.
# Output: All starting positions in Text where a string from Patterns appears as a substring.

# Sample Input:
# AATCGGGTTCAATCGGGGT
# ATCG
# GGGT

# Sample Output:
# 1 4 11 15

import inout
import common

# The first line is the text; every following line is one pattern.
text = inout.infilelines[0].strip()
strings = map(str.strip, inout.infilelines[1:])

matches = common.match_trie(common.create_trie(strings), text)

inout.output(' '.join(str(position) for position in matches))
Ejemplo n.º 11
0
# n nodes, first label the root with 1 and then label the remaining nodes with the integers 2 through n in
# any order you like. Each edge of the adjacency list of Trie(Patterns) will be encoded by a triple: the first
# two members of the triple must be the integers labeling the initial and terminal nodes of the edge,
# respectively; the third member of the triple must be the symbol labeling the edge.

# Sample Input:
# GGTA
# CG
# GGC

# Sample Output:
# 1 2 G
# 2 3 G
# 3 4 T
# 4 5 A

# 3 6 C

# 1 7 C
# 7 8 G

import inout
import common

# Build a trie from all input lines and print its rendered form.
strings = map(str.strip, inout.infilelines)
trie = common.create_trie(strings)

inout.output(common.output_trie(trie))
Ejemplo n.º 12
0
# Input: An integer k and a string Text.
# Output: DeBruijn_k(Text).

# Sample Input:
#      4
#      AAGATTCTCTAC

# Sample Output:
#      AAG -> AGA
#      AGA -> GAT
#      ATT -> TTC
#      CTA -> TAC
#      CTC -> TCT
#      GAT -> ATT
#      TCT -> CTA,CTC
#      TTC -> TCT
     
import inout
import common

k = int(inout.infilelines[0].strip())
sequence = inout.infilelines[1].strip()

graph = common.debruijn_graph(common.all_kmers(sequence, k))

# Distinct loop names keep the k-mer length `k` from being overwritten by a
# graph key; .items() exists on both Python 2 and Python 3 dictionaries.
graph_strs = [
    common.debruijn_to_str(node, successors)
    for node, successors in graph.items()
]

inout.output('\n'.join(graph_strs))
Ejemplo n.º 13
0
# Input: Three DNA strings.
# Output: The length of a longest common subsequence of these three strings, followed by a multiple
# alignment of the three strings corresponding to such an alignment.

# Sample Input:
# ATATCCG
# TCCGA
# ATGTACTG

# Sample Output:
# 3
# ATATCC-G-
# ---TCC-GA
# ATGTACTG-

import inout
import common

str1, str2, str3 = (inout.infilelines[idx].strip() for idx in range(3))

# NOTE(review): these two values are built but never passed to align_3.
scoring_matrix = common.mismatch_scoring_matrix_fitted("ACGT")
indel_penalty = -1

alignment = common.align_3(str1, str2, str3)
score, backtrack_matrix, best_x, best_y, best_z = alignment
aligned = common.output_align_3(
    backtrack_matrix, str1, str2, str3, best_x, best_y, best_z)
aligned1, aligned2, aligned3 = aligned

inout.output('{}\n{}\n{}\n{}'.format(score, aligned1, aligned2, aligned3))
# GREEDYMOTIFSEARCH with pseudocounts

# Sample Input:
#     3 5
#     GGCGTTCAGGCA
#     AAGAATCAGTCA
#     CAAGGAGTTCGC
#     CACGTCAATCAC
#     CAATAATATTCG

# Sample Output:
#      TTC
#      ATC
#      TTC
#      ATC
#      TTC

import inout
import common

# The header line holds "k t"; the remaining lines are the DNA strings.
header = inout.infilelines[0].strip()
k, t = [int(field) for field in header.split(' ')]
sequences = map(str.strip, inout.infilelines[1:])

best_motifs = common.greedy_motif_search_with_pseudocounts(sequences, k, t)
inout.output('\n'.join(best_motifs))
Ejemplo n.º 15
0
# Input: A collection Patterns of k-mers.
# Output: The overlap graph Overlap(Patterns), in the form of an adjacency list.

# Sample Input:
#      ATGCG
#      GCATG
#      CATGC
#      AGGCA
#      GGCAT

# Sample Output:
#      AGGCA -> GGCAT
#      CATGC -> ATGCG
#      GCATG -> CATGC
#      GGCAT -> GCATG

import inout
import common

# Build the overlap graph of the input k-mers and print one line per edge.
sequences = map(str.strip, inout.infilelines)
overlap_edges = common.overlap_graph(sequences)
edge_lines = map(common.overlap_to_str, overlap_edges)

inout.output('\n'.join(edge_lines))
Ejemplo n.º 16
0
# an edge connects node 0 to node 1 with weight 7.
# Output: The length of a longest path in the graph, followed by a longest path.

# Sample Input:
# 0
# 4
# 0->1:7
# 0->2:4
# 2->3:2
# 1->4:1
# 3->4:3

# Sample Output:
# 9
# 0->2->3->4

import inout
import common

# Lines: source node, sink node, then one weighted edge per line.
source, sink = (inout.infilelines[idx].strip() for idx in (0, 1))
edges = map(str.strip, inout.infilelines[2:])

dag = common.parse_dag_edges(edges)
ordering = common.wikipedia_depth_first_topological_sort(dag, sink)

longest = common.longest_dag_weight(dag, ordering, source, sink)
weight, backtrack = longest
path = common.output_longest_dag_path(backtrack, source, sink)

inout.output('{}\n{}'.format(weight, path))
Ejemplo n.º 17
0
# alignment of the three strings corresponding to such an alignment.

# Sample Input:
# ATATCCG
# TCCGA
# ATGTACTG

# Sample Output:
# 3
# ATATCC-G-
# ---TCC-GA
# ATGTACTG-

import inout
import common

input_lines = inout.infilelines
str1 = input_lines[0].strip()
str2 = input_lines[1].strip()
str3 = input_lines[2].strip()

# NOTE(review): neither value below is handed to align_3 - confirm whether
# they are still needed.
indel_penalty = -1
scoring_matrix = common.mismatch_scoring_matrix_fitted("ACGT")

(score, backtrack_matrix,
 best_x, best_y, best_z) = common.align_3(str1, str2, str3)
aligned1, aligned2, aligned3 = common.output_align_3(
    backtrack_matrix, str1, str2, str3, best_x, best_y, best_z)

report = '{}\n{}\n{}\n{}'.format(score, aligned1, aligned2, aligned3)
inout.output(report)
Ejemplo n.º 18
0
# Implement LINEARSPACEALIGNMENT to solve the Global Alignment Problem for a large dataset.
# Input: Two long (10000 amino acid) protein strings written in the single-letter amino acid alphabet.
# Output: The maximum alignment score of these strings, followed by an alignment achieving this
# maximum score. Use the BLOSUM62 scoring matrix and indel penalty sigma = 5.

# Sample Input:
# PLEASANTLY
# MEANLY

# Sample Output:
# 8
# PLEASANTLY
# -MEA--N-LY

import inout
import common

str1, str2 = (inout.infilelines[idx].strip() for idx in (0, 1))

# Scores come from the BLOSUM62 file; every indel costs 5.
indel_penalty = -5
scoring_matrix = common.parse_scoring_matrix(inout.readlines('BLOSUM62.txt'))

alignment = common.linear_space_alignment(
    scoring_matrix, indel_penalty, str1, str2)
score, alignment1, alignment2 = alignment
inout.output('{}\n{}\n{}'.format(str(score), alignment1, alignment2))
# Input: An integer money and an array coins = (coin1, ..., coind).
# Output: The minimum number of coins with denominations coins that changes money.

# Sample Input:
# 40
# 50,25,20,10,5,1

# Sample Output:
# 2

import inout
import common

# Line 1: the amount to change; line 2: comma-separated denominations.
change = int(inout.infilelines[0].strip())
coins = [int(coin) for coin in inout.infilelines[1].strip().split(',')]

numcoins = common.make_change(change, coins)
inout.output(str(numcoins))
# Input: A string BWT(Text), followed by a collection of Patterns.
# Output: A list of integers, where the i-th integer corresponds to the number of substring matches of the
# i-th member of Patterns in Text.

# Sample Input:
# TCCTCTATGAGATCCTATTCTATGAAACCTTCA$GACCAAAATTCTCCGGC
# CCT CAC GAG CAG ATC

# Sample Output:
# 2 1 1 0 1

import inout
import common

bwt_text = inout.infilelines[0].strip()
patterns = inout.infilelines[1].strip().split(' ')

# One result per pattern, in input order, separated by single spaces.
counts = ' '.join(
    str(common.bwt_matching(bwt_text, pattern)) for pattern in patterns)

inout.output(counts.strip())
Ejemplo n.º 21
0
# Protein Translation Problem: Translate an RNA string into an amino acid string.

# Sample Input:
#     AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA

# Sample Output:
#     MAMAPRTEINSTRING

import inout
import codon

# Read the RNA string from the first input line and print what
# codon.transcribe returns for it.
# NOTE(review): the task is translation, yet the helper is called
# `transcribe` - presumably it yields the amino acid string; confirm.
rna_line = inout.infilelines[0]
sequence = rna_line.strip()

inout.output(codon.transcribe(sequence))
Ejemplo n.º 22
0
        r = []
        for m_kmer in enumerate_mismatches(kmer, maxdist - 1):
            for loc in range(k):
                for base in ['A', 'C', 'G', 'T']:
                    new_kmer = m_kmer[:loc] + base + m_kmer[loc + 1:]
                    r.append(new_kmer)
        return set(r)


kmer_counts = {}
max_kmer_count = 0
max_kmers = []
for start in range(len(sequence) - k + 1):
    kmer = sequence[start:start + k]
    # Neighbours of the window first, then those of its reverse complement.
    m_kmers = list(enumerate_mismatches(kmer, d))
    m_kmers += enumerate_mismatches(reverse_complement(kmer), d)
    for m_kmer in m_kmers:
        count = kmer_counts.get(m_kmer, 0) + 1
        kmer_counts[m_kmer] = count

        if count == max_kmer_count:
            # Ties with the current best count are accumulated.
            max_kmers.append(m_kmer)
        elif count > max_kmer_count:
            # A strictly better count restarts the list of leaders.
            max_kmer_count = count
            max_kmers = [m_kmer]

inout.output(' '.join(max_kmers))
Ejemplo n.º 23
0
# Inverse Burrows-Wheeler Transform Problem: Reconstruct a string from its Burrows-Wheeler transform.
# Input: A string Transform (with a single "$" symbol).
# Output: The string Text such that BWT(Text) = Transform.

# Sample Input:
# TTCCTAACG$A

# Sample Output:
# TACATCACGT$

import inout
import common

# Hand the transform on the first input line to common.inv_bwt and print
# whatever it returns.
text = inout.infilelines[0].strip()
inout.output(common.inv_bwt(text))
Ejemplo n.º 24
0
			
	return survivors
	
# I'm sure there's a better way to do this but I don't know enough Python yet
def mklist(item):
    """Return a new one-element list containing ``item``."""
    wrapped = []
    wrapped.append(item)
    return wrapped
		
# Every candidate is a list of masses; start from the one-element lists.
candidates = [mklist(mass) for mass in amino_acids]
winner = ''
winner_score = 0
while candidates:
    new_candidates = []
    for candidate in branch(candidates):
        c_mass = sum(candidate)
        t_mass = max(spectrum)

        if c_mass > t_mass:
            # Too heavy: it does not go on to the next round.
            continue
        new_candidates.append(candidate)
        if c_mass == t_mass:
            # Only candidates matching the target mass compete for winner.
            c_score = score(candidate, spectrum)
            if c_score > winner_score:
                winner, winner_score = candidate, c_score
    candidates = cut(new_candidates, spectrum, N)

inout.output('-'.join(str(mass) for mass in winner))
Ejemplo n.º 25
0
# Inverse Burrows-Wheeler Transform Problem: Reconstruct a string from its Burrows-Wheeler transform.
# Input: A string Transform (with a single "$" symbol).
# Output: The string Text such that BWT(Text) = Transform.

# Sample Input:
# TTCCTAACG$A

# Sample Output:
# TACATCACGT$

import inout
import common

# First input line is the transform; print the result of common.inv_bwt.
transform_line = inout.infilelines[0]
text = transform_line.strip()

original = common.inv_bwt(text)
inout.output(original)
Ejemplo n.º 26
0
# Input: Two strings s and t.
# Output: A longest common subsequence of s and t.
#
# Note: If more than one LCS exists, you may return any one.

# Sample Input:
# AACCTTGG
# ACACTGTGA

# Sample Output:
# AACTGG

import inout
import common

str1, str2 = (inout.infilelines[idx].strip() for idx in (0, 1))

# output_longest_common_subsequence hit the default recursion limit of 1000
# on the test dataset, so the limit is raised before it runs.
# https://class.coursera.org/bioinformatics-001/forum/thread?thread_id=742
import sys
sys.setrecursionlimit(2000)

lcs_result = common.longest_common_subsequence(str1, str2)
longest, backtrack_matrix = lcs_result
lcs = common.output_longest_common_subsequence(
    backtrack_matrix, str1, len(str1), len(str2))

inout.output(lcs)
# Input: The adjacency list of a directed graph that has an Eulerian path.
# Output: An Eulerian path in this graph.

# Sample Input:
#      CTT -> TTA
#      ACC -> CCA
#      TAC -> ACC
#      GGC -> GCT
#      GCT -> CTT
#      TTA -> TAC

# Sample Output:
#      GGCTTACCA

import inout
import common

# Parse the adjacency list, find an Eulerian path, then let
# common.assemble_path turn that path into the output text.
edge_strs = map(str.strip, inout.infilelines)
path = common.find_eulerian_path(common.parse_graph_edges(edge_strs))

inout.output(common.assemble_path(path))
# Constructing Suffix Array Problem: Construct the suffix array of a string.
# Input: A string Text.
# Output: SuffixArray(Text).

# Sample Input:
# AACGATAGCGGTAGA$

# Sample Output:
# 15, 14, 0, 1, 12, 6, 4, 2, 8, 13, 3, 7, 9, 10, 11, 5

import inout
import common

text = inout.infilelines[0].strip()

# create_suffix_array returns a mapping; only its values are printed, in the
# mapping's own iteration order.
array = common.create_suffix_array(text)
suffix_positions = array.values()

inout.output(', '.join(str(position) for position in suffix_positions))
Ejemplo n.º 29
0
# Input: A collection Patterns of k-mers.
# Output: The overlap graph Overlap(Patterns), in the form of an adjacency list.

# Sample Input:
#      ATGCG
#      GCATG
#      CATGC
#      AGGCA
#      GGCAT

# Sample Output:
#      AGGCA -> GGCAT
#      CATGC -> ATGCG
#      GCATG -> CATGC
#      GGCAT -> GCATG
     
import inout
import common

# One output line per edge of the overlap graph over the input k-mers.
sequences = map(str.strip, inout.infilelines)
edge_lines = map(common.overlap_to_str, common.overlap_graph(sequences))

inout.output('\n'.join(edge_lines))
Ejemplo n.º 30
0
#        output BestMotifs
     
# Input: Integers k and t, followed by a collection of strings Dna.

# Output: A collection of strings BestMotifs resulting from applying GREEDYMOTIFSEARCH(Dna,k,t). If at any step you find more than one Profile-most probable k-mer in a given string, use the one occurring first.

# Sample Input:
#     3 5
#     GGCGTTCAGGCA
#     AAGAATCAGTCA
#     CAAGGAGTTCGC
#     CACGTCAATCAC
#     CAATAATATTCG

# Sample Output:
#     CAG
#     CAG
#     CAA
#     CAA
#     CAA

import inout
import common

# The header line holds "k t"; the remaining lines are the DNA strings.
header_fields = inout.infilelines[0].strip().split(' ')
k, t = [int(field) for field in header_fields]
sequences = map(str.strip, inout.infilelines[1:])

best_motifs = common.greedy_motif_search(sequences, k, t)
inout.output('\n'.join(best_motifs))
Ejemplo n.º 31
0
# Given two strings, find all their shared k-mers.
# Input: An integer k and two strings.
# Output: All k-mers shared by these strings, in the form of ordered pairs (x, y).

# Sample Input:
# 3
# AAACTCATC
# TTTCAAATC

# Sample Output:
# (0, 4)
# (0, 0)
# (4, 2)
# (6, 6)

import inout
import common

# Line 1 is k; lines 2 and 3 are the two strings to compare.
k = int(inout.infilelines[0].strip())
str1, str2 = [line.strip() for line in inout.infilelines[1:3]]

result = common.shared_kmers(k, str1, str2)


def output_one_pair(pair):
    """Format the first two items of ``pair`` as the text '(x, y)'."""
    x, y = pair[0], pair[1]
    return '({}, {})'.format(x, y)


# One formatted "(x, y)" line per entry of the result.
pair_lines = [output_one_pair(pair) for pair in result]
inout.output('\n'.join(pair_lines))
Ejemplo n.º 32
0
		return [kmer]
	else:
		r = []
		for m_kmer in enumerate_mismatches(kmer, maxdist - 1):
			for loc in range(k):
				for base in ['A', 'C', 'G', 'T']:
					new_kmer = '{}{}{}'.format(m_kmer[:loc], base, m_kmer[loc + 1:])
					r.append(new_kmer)
		return set(r)

kmer_counts = {}
max_kmers = []
max_kmer_count = 0
window_count = len(sequence) - k + 1
for idx in range(window_count):
    kmer = sequence[idx:idx + k]

    # Every neighbour of the window within d mismatches gets one vote.
    for m_kmer in enumerate_mismatches(kmer, d):
        count = kmer_counts.get(m_kmer, 0) + 1
        kmer_counts[m_kmer] = count

        if count < max_kmer_count:
            continue
        if count == max_kmer_count:
            max_kmers.append(m_kmer)
        else:
            # Strictly better than the best so far: restart the leaders.
            max_kmer_count = count
            max_kmers = [m_kmer]

inout.output(' '.join(max_kmers))
# Longest Repeat Problem: Find the longest repeat in a string.
# Input: A string Text.
# Output: A longest repeat in Text, i.e., a longest substring of Text that appears in Text more than once.

# Sample Input:
# ATATCGTTTTATCGTT

# Sample Output:
# TATCGTT

import inout
import common

text = inout.infilelines[0].strip()

# NOTE(review): the meaning of the constants 100, 1 and '' is defined by the
# helpers in `common` and is not visible here - confirm before changing them.
trie = common.create_suffix_trie(text, 100)
substring = common.find_longest_substring_in_suffix_trie(trie, 1, '')

inout.output(substring)
Ejemplo n.º 34
0
# an edge connects node 0 to node 1 with weight 7.
# Output: The length of a longest path in the graph, followed by a longest path.

# Sample Input:
# 0
# 4
# 0->1:7
# 0->2:4
# 2->3:2
# 1->4:1
# 3->4:3

# Sample Output:
# 9
# 0->2->3->4

import inout
import common

# Lines: source node, sink node, then one weighted edge per line.
input_lines = inout.infilelines
source = input_lines[0].strip()
sink = input_lines[1].strip()
edges = map(str.strip, input_lines[2:])

dag = common.parse_dag_edges(edges)
ordering = common.wikipedia_depth_first_topological_sort(dag, sink)

weight, backtrack = common.longest_dag_weight(dag, ordering, source, sink)
path = common.output_longest_dag_path(backtrack, source, sink)

report = '{}\n{}'.format(weight, path)
inout.output(report)
Ejemplo n.º 35
0
     
import inout

# The header line holds "k d"; the remaining lines are the DNA strings.
header_fields = inout.infilelines[0].strip().split(' ')
k, d = [int(field) for field in header_fields]
dna_lines = [line.strip() for line in inout.infilelines[1:]]

def enumerate_mismatches(kmer, maxdist):
    """Enumerate the strings reachable from ``kmer`` by substitutions.

    Args:
        kmer: the starting string.
        maxdist: number of substitution rounds to apply (0 or more).

    Returns:
        ``[kmer]`` (a list) when ``maxdist`` is 0; otherwise a set holding
        every string produced by ``maxdist`` rounds of replacing one
        position with one of A, C, G, T. A base may be replaced by itself,
        so strings needing fewer real changes appear in the set as well.
    """
    if maxdist == 0:
        return [kmer]

    neighbours = set()
    for m_kmer in enumerate_mismatches(kmer, maxdist - 1):
        # Use the string's own length instead of the module-level `k`, so
        # the function works for any input and needs no global.
        for loc in range(len(m_kmer)):
            for base in 'ACGT':
                neighbours.add(m_kmer[:loc] + base + m_kmer[loc + 1:])
    return neighbours

def motifs(sequence, k, d):
    """Collect the neighbours of every length-k window of ``sequence``.

    For each window the output of ``enumerate_mismatches(window, d)`` is
    appended to one flat list, window by window, so the list may contain
    the same string more than once.
    """
    found = []
    last_start = len(sequence) - k
    for start in range(last_start + 1):
        window = sequence[start:start + k]
        found.extend(enumerate_mismatches(window, d))
    return found
		
# Begin with a set so the output has no duplicates even when there is only
# one DNA line (previously the raw list, repeats included, was printed).
common_motifs = set(motifs(dna_lines[0], k, d))
for line in dna_lines[1:]:
    # Keep only the motifs that also occur (within d mismatches) in this line.
    common_motifs = common_motifs & set(motifs(line, k, d))

inout.output(' '.join(common_motifs))
Ejemplo n.º 36
0
# Reverse Complement Problem
#
# Reverse complement a nucleotide pattern.
#
# Given: A DNA string Pattern.
#
# Return: Pattern, the reverse complement of Pattern.

# Sample Dataset
#
# AAAACCCGGT

# Sample Output
#
# ACCGGGTTTT

# Pairing used to complement each base.
complement = {
    'A': 'T',
    'T': 'A',
    'G': 'C',
    'C': 'G',
}

import inout  # my module for handling Rosalind's file I/O
sequence = inout.infilelines[0].strip()

# Walk the strand back to front, swapping each base for its partner.
output = ''.join(complement[base] for base in reversed(sequence))

inout.output(output)
Ejemplo n.º 37
0
# Spectral Convolution Problem: Compute the convolution of a spectrum.
#     Input: A collection of integers Spectrum.
#     Output: The list of elements in the convolution of Spectrum. If an element has multiplicity k, it should
#     appear exactly k times; you may return the elements in any order.

# Sample Input:
#     0 137 186 323

# Sample Output:
#     137 137 186 186 323 49

import inout

spectrum = [int(field) for field in inout.infilelines[0].strip().split(' ')]

# Pair every peak with each later peak; keep the absolute difference of the
# pairs whose masses differ.
convolution = []
peak_count = len(spectrum)
for first in range(peak_count):
    for second in range(first + 1, peak_count):
        gap = spectrum[first] - spectrum[second]
        if gap:
            convolution.append(abs(gap))

convolution_text = [str(value) for value in sorted(convolution)]
inout.output(' '.join(convolution_text))
Ejemplo n.º 38
0
# Input: Two strings v and w, each of length at most 1000.
# Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of
# v and a prefix w' of w achieving this maximum score. Use an alignment score in which matches count
# +1 and both the mismatch and indel penalties are 2.

# Sample Input:
# PAWHEAE
# HEAGAWGHEE

# Sample Output:
# 1
# HEAE
# HEAG

import inout
import common

str1, str2 = (inout.infilelines[idx].strip() for idx in (0, 1))

# Scoring matrix over the uppercase alphabet; every indel costs 2.
import string
indel_penalty = -2
scoring_matrix = common.mismatch_scoring_matrix_overlap(string.ascii_uppercase)

overlap = common.scored_longest_common_subsequence_overlap(
    scoring_matrix, indel_penalty, str1, str2)
longest, backtrack_matrix, best_row, best_col = overlap
aligned1, aligned2 = common.output_longest_common_subsequence_local(
    backtrack_matrix, str1, str2, best_row, best_col)

inout.output('{}\n{}\n{}'.format(longest, aligned1, aligned2))
Ejemplo n.º 39
0
# Return: All integer(s) i minimizing Skew(Prefix_i(Text)) over all values of i (from 0 to |Genome|).

# Sample Dataset
# 
# CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG

# Sample Output
# 
# 53 97

import inout 	# my module for handling Rosalind's file I/O
sequence = inout.infilelines[0].strip()

# The empty prefix (position 0) has skew 0 and is itself a candidate
# minimum, so the scan starts from that state instead of a sentinel value.
skew = 0
min_skew = 0
all_min_skew_loc = [0]
for skew_loc, base in enumerate(sequence, 1):
    # skew_loc is the length of the prefix ending at this base.
    if base == 'C':
        skew -= 1
    elif base == 'G':
        skew += 1

    if skew < min_skew:
        # New minimum: forget all earlier positions.
        min_skew = skew
        all_min_skew_loc = [skew_loc]
    elif skew == min_skew:
        all_min_skew_loc.append(skew_loc)

inout.output(" ".join(map(str, all_min_skew_loc)))
Ejemplo n.º 40
0
def output_format(pep):
    """Return the masses of the items of ``pep`` as a '-'-separated string.

    Masses are read from ``peptide.mass_table``.
    """
    masses = (peptide.mass_table[residue] for residue in pep)
    return '-'.join(map(str, masses))
		
candidates = peptide.amino_acids
winner = ''
winner_score = 0
while candidates:
    new_candidates = []
    for candidate in branch(candidates):
        c_mass = peptide.total_mass(candidate)
        # The target mass is taken from the last entry of the spectrum.
        t_mass = spectrum[-1]

        if c_mass > t_mass:
            # Too heavy: it does not go on to the next round.
            continue
        new_candidates.append(candidate)
        if c_mass == t_mass:
            # Only candidates matching the target mass compete for winner.
            c_score = score(candidate, spectrum)
            if c_score > winner_score:
                winner, winner_score = candidate, c_score
    candidates = cut(new_candidates, spectrum, N)

inout.output(output_format(winner))
Ejemplo n.º 41
0
# Input: Two strings s and t.
# Output: A longest common subsequence of s and t.
#
# Note: If more than one LCS exists, you may return any one.

# Sample Input:
# AACCTTGG
# ACACTGTGA

# Sample Output:
# AACTGG

import inout
import common

input_lines = inout.infilelines
str1 = input_lines[0].strip()
str2 = input_lines[1].strip()

# The default recursion limit of 1000 was too low for
# output_longest_common_subsequence on the test dataset, hence the bump.
# https://class.coursera.org/bioinformatics-001/forum/thread?thread_id=742
import sys
sys.setrecursionlimit(2000)

longest, backtrack_matrix = common.longest_common_subsequence(str1, str2)
rows, cols = len(str1), len(str2)
lcs = common.output_longest_common_subsequence(
    backtrack_matrix, str1, rows, cols)

inout.output(lcs)
Ejemplo n.º 42
0
# Input: An integer k and a string Text.
# Output: DeBruijn_k(Text).

# Sample Input:
#      4
#      AAGATTCTCTAC

# Sample Output:
#      AAG -> AGA
#      AGA -> GAT
#      ATT -> TTC
#      CTA -> TAC
#      CTC -> TCT
#      GAT -> ATT
#      TCT -> CTA,CTC
#      TTC -> TCT

import inout
import common

k = int(inout.infilelines[0].strip())
sequence = inout.infilelines[1].strip()

graph = common.debruijn_graph(common.all_kmers(sequence, k))

# The loop no longer reuses the name `k`, so the k-mer length stays intact;
# .items() exists on both Python 2 and Python 3 dictionaries.
graph_strs = []
for node, successors in graph.items():
    graph_strs.append(common.debruijn_to_str(node, successors))

inout.output('\n'.join(graph_strs))
# Number of Breakpoints Problem: Find the number of breakpoints in a permutation.
# Input: A permutation P.
# Output: The number of breakpoints in P.

# Sample Input:
# (+3 +4 +5 -12 -8 -7 -6 +1 +2 +10 +9 -11 +13 +14)

# Sample Output:
# 8

import inout
import common

# Parse the permutation on the first input line and print the value
# returned by common.count_breakpoints for it.
permutation_text = inout.infilelines[0].strip()
permutation = common.greedysorting_parse(permutation_text)

breakpoint_count = common.count_breakpoints(permutation)
inout.output(str(breakpoint_count))
# Input: A permutation P.
# Output: The sequence of permutations corresponding to applying GREEDYSORTING to P, ending with
# the identity permutation.

# Sample Input:
# (-3 +4 +1 +5 -2)

# Sample Output:
# (-1 -4 +3 +5 -2)
# (+1 -4 +3 +5 -2)
# (+1 +2 -5 -3 +4)
# (+1 +2 +3 +5 +4)
# (+1 +2 +3 -4 -5)
# (+1 +2 +3 +4 -5)
# (+1 +2 +3 +4 +5)

import inout
import common

# Parse the permutation, run greedy sorting on it and print the rendered
# sequence of steps.
permutation = common.greedysorting_parse(inout.infilelines[0].strip())
sequence = common.greedysorting(permutation)

inout.output(common.greedysorting_out(sequence))
Ejemplo n.º 45
0
import inout  # module for handling Rosalind's file I/O
sequence = inout.infilelines[0].strip()

# Tally how often each character occurs in the sequence.
d = {}
for char in sequence:
    d[char] = d.get(char, 0) + 1

# A nucleotide that never occurs is reported as 0 instead of raising
# KeyError.
counts = tuple(d.get(base, 0) for base in 'ACGT')
inout.output(' '.join(map(str, counts)))
Ejemplo n.º 46
0
# Output: The score of an optimal overlap alignment of v and w, followed by an alignment of a suffix v' of
# v and a prefix w' of w achieving this maximum score. Use an alignment score in which matches count
# +1 and both the mismatch and indel penalties are 2.

# Sample Input:
# PAWHEAE
# HEAGAWGHEE

# Sample Output:
# 1
# HEAE
# HEAG

import inout
import common

input_lines = inout.infilelines
str1 = input_lines[0].strip()
str2 = input_lines[1].strip()

import string

# Scoring matrix over the uppercase alphabet; every indel costs 2.
alphabet = string.ascii_uppercase
scoring_matrix = common.mismatch_scoring_matrix_overlap(alphabet)
indel_penalty = -2

(longest, backtrack_matrix,
 best_row, best_col) = common.scored_longest_common_subsequence_overlap(
    scoring_matrix, indel_penalty, str1, str2)
aligned = common.output_longest_common_subsequence_local(
    backtrack_matrix, str1, str2, best_row, best_col)
aligned1, aligned2 = aligned

inout.output('{}\n{}\n{}'.format(longest, aligned1, aligned2))
Ejemplo n.º 47
0
# Input: A permutation P.
# Output: The sequence of permutations corresponding to applying GREEDYSORTING to P, ending with
# the identity permutation.

# Sample Input:
# (-3 +4 +1 +5 -2)

# Sample Output:
# (-1 -4 +3 +5 -2)
# (+1 -4 +3 +5 -2)
# (+1 +2 -5 -3 +4)
# (+1 +2 +3 +5 +4)
# (+1 +2 +3 -4 -5)
# (+1 +2 +3 +4 -5)
# (+1 +2 +3 +4 +5)

import inout
import common

# First input line holds the permutation to sort.
permutation_text = inout.infilelines[0].strip()
permutation = common.greedysorting_parse(permutation_text)

sequence = common.greedysorting(permutation)
sequence_out = common.greedysorting_out(sequence)
inout.output(sequence_out)
# Input: An integer k and a string Text.
# Output: Composition_k(Text), where the k-mers are written in lexicographic order.

# Sample Input:
#      5
#      CAATCCAAC

# Sample Output:
#      AATCC
#      ATCCA
#      CAATC
#      CCAAC
#      TCCAA

import inout
import common

k = int(inout.infilelines[0].strip())
sequence = inout.infilelines[1].strip()

# Sort the k-mers of the sequence before printing one per line.
kmers = list(common.all_kmers(sequence, k))
kmers.sort()

inout.output('\n'.join(kmers))
Ejemplo n.º 49
0
# Protein Translation Problem: Translate an RNA string into an amino acid string.

# Sample Input:
#     AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA

#Sample Output:
#     MAMAPRTEINSTRING

import inout
import codon

# Pass the stripped first input line to codon.transcribe and print the result.
# NOTE(review): the task describes translation while the helper is named
# `transcribe` - verify that it returns the amino acid string.
sequence = inout.infilelines[0].strip()
protein = codon.transcribe(sequence)

inout.output(protein)
Ejemplo n.º 50
0
# Implement LINEARSPACEALIGNMENT to solve the Global Alignment Problem for a large dataset.
# Input: Two long (10000 amino acid) protein strings written in the single-letter amino acid alphabet.
# Output: The maximum alignment score of these strings, followed by an alignment achieving this
# maximum score. Use the BLOSUM62 scoring matrix and indel penalty sigma = 5.

# Sample Input:
# PLEASANTLY
# MEANLY

# Sample Output:
# 8
# PLEASANTLY
# -MEA--N-LY

import inout
import common

input_lines = inout.infilelines
str1 = input_lines[0].strip()
str2 = input_lines[1].strip()

# Scores come from the BLOSUM62 file; every indel costs 5.
blosum_lines = inout.readlines('BLOSUM62.txt')
scoring_matrix = common.parse_scoring_matrix(blosum_lines)
indel_penalty = -5

score, alignment1, alignment2 = common.linear_space_alignment(
    scoring_matrix, indel_penalty, str1, str2)
report = '{}\n{}\n{}'.format(str(score), alignment1, alignment2)
inout.output(report)
Ejemplo n.º 51
0
# Input: The adjacency list of a directed graph that has an Eulerian path.
# Output: An Eulerian path in this graph.

# Sample Input:
#      CTT -> TTA
#      ACC -> CCA
#      TAC -> ACC
#      GGC -> GCT
#      GCT -> CTT
#      TTA -> TAC

# Sample Output:
#      GGCTTACCA           

import inout
import common

# Adjacency list -> graph -> Eulerian path -> text built by assemble_path.
edge_strs = map(str.strip, inout.infilelines)
graph = common.parse_graph_edges(edge_strs)
path = common.find_eulerian_path(graph)
assembled = common.assemble_path(path)

inout.output(assembled)
Ejemplo n.º 52
0
# Given two strings, find all their shared k-mers.
# Input: An integer k and two strings.
# Output: All k-mers shared by these strings, in the form of ordered pairs (x, y).

# Sample Input:
# 3
# AAACTCATC
# TTTCAAATC

# Sample Output:
# (0, 4)
# (0, 0)
# (4, 2)
# (6, 6)

import inout
import common

# Line 1 is k; lines 2 and 3 are the two strings to compare.
k_text = inout.infilelines[0].strip()
k = int(k_text)
str1, str2 = [line.strip() for line in inout.infilelines[1:3]]

result = common.shared_kmers(k, str1, str2)

def output_one_pair(pair):
    """Return '(a, b)' where a and b are items 0 and 1 of ``pair``."""
    first = pair[0]
    second = pair[1]
    return '({}, {})'.format(first, second)

# Print every entry of the result on its own line, formatted as "(x, y)".
formatted_pairs = [output_one_pair(pair) for pair in result]
inout.output('\n'.join(formatted_pairs))
Ejemplo n.º 53
0
import inout 	# module for handling Rosalind's file I/O
sequence = inout.infilelines[0].strip()

# Pairing used to complement each base.
complements = {"A": "T", "T": "A", "C": "G", "G": "C"}

# Reverse the strand first, then swap every base for its partner.
reversed_seq = sequence[::-1]
rc = ''.join(complements[base] for base in reversed_seq)
inout.output(rc)