Example #1
0
def main():
    '''Reads the FASTA input, runs max_gc_content on it, then prints and saves the result.'''
    # Load the FASTA records.
    records = ReadFASTA('data/rosalind_gc.txt')

    # Stringify every element of the result so it can be joined line by line.
    answer = '\n'.join([str(item) for item in max_gc_content(records)])

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/005_GC.txt', 'w') as handle:
        handle.write(answer)
Example #2
0
def main():
    '''Reads two sequences, scores them with global_alignment_affine_gap_penalty, then prints and saves the result.'''
    # Keep only the sequence text (index 1) of the two FASTA records.
    first, second = [record[1] for record in ReadFASTA('data/rosalind_gaff.txt')]

    # Score with BLOSUM62 and the constants 11 and 1
    # (presumably gap opening and gap extension penalties -- confirm against the callee).
    result = global_alignment_affine_gap_penalty(first, second, BLOSUM62(), 11, 1)
    answer = '\n'.join(result)

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/096_GAFF.txt', 'w') as handle:
        handle.write(answer)
Example #3
0
def main():
    '''Reads two sequences, runs semiglobal_alignment on them, then prints and saves the result.'''
    # Keep only the sequence text (index 1) of the two FASTA records.
    first_word, second_word = [entry[1] for entry in ReadFASTA('data/rosalind_smgb.txt')]

    # Align the two words; the trailing 1 is forwarded unchanged to the aligner.
    answer = '\n'.join(semiglobal_alignment(first_word, second_word, 1))

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/101_SMGB.txt', 'w') as handle:
        handle.write(answer)
Example #4
0
def main():
    '''Reads two sequences, runs maximum_gap_symbols on them, then prints and saves the result.'''
    # Keep only the sequence text (index 1) of the two FASTA records.
    sequences = [record[1] for record in ReadFASTA('data/rosalind_mgap.txt')]
    v, w = sequences

    # Convert the result to text once so printing and saving agree.
    answer = str(maximum_gap_symbols(v, w))

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/083_MGAP.txt', 'w') as handle:
        handle.write(answer)
Example #5
0
def main():
    '''Reads one RNA string, runs noncrossing_perfect_bondings on it, then prints and saves the result.'''
    # Only the sequence text of the first FASTA record is used.
    first_record = ReadFASTA('data/rosalind_cat.txt')[0]
    rna = first_record[1]

    # Convert the result to text once so printing and saving agree.
    answer = str(noncrossing_perfect_bondings(rna))

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/033_CAT.txt', 'w') as handle:
        handle.write(answer)
Example #6
0
def main():
    '''Reads two sequences, runs fitting_alignment on them, then prints and saves the result.'''
    # Keep only the sequence text (index 1) of the two FASTA records.
    first_word, second_word = [entry[1] for entry in ReadFASTA('data/rosalind_sims.txt')]

    # Join the pieces returned by the aligner, one per line.
    answer = '\n'.join(fitting_alignment(first_word, second_word))

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/100_SIMS.txt', 'w') as handle:
        handle.write(answer)
Example #7
0
from scripts import ReadFASTA
from protein_map import ProteinDictDNA

records = ReadFASTA('data/rosalind_splc.txt')

# The first record holds the full strand; every later record is a substring to delete.
spliced = records[0][1]
for record in records[1:]:
    spliced = spliced.replace(record[1], '')

# Translate three bases at a time, leaving out anything the table maps to 'Stop'.
codon_table = ProteinDictDNA()
residues = []
for start in range(0, len(spliced), 3):
    amino = codon_table[spliced[start:start + 3]]
    if amino != 'Stop':
        residues.append(amino)
protein = ''.join(residues)

print(protein)
Example #8
0
    # Quick lambda function to insert indels.
    insert_indel = lambda word, i: word[:i] + '-' + word[i:]

    # Insert indels to get the alignment.
    while reduce(mul, current_index) != 0:
        for i, perm_value in enumerate(
                perm_list[backtrack[tuple(current_index)]]):
            if perm_value == 0:
                alignment[i] = insert_indel(alignment[i], current_index[i])
            else:
                current_index[i] -= 1

    # Note: We don't need to prepend any indels because we forced a match at the start of all words.
    # Remove the forced match from all alignments to recover the correct alignment.
    return [str(max_score)] + [aligned[1:] for aligned in alignment]


if __name__ == '__main__':
    from scripts import ReadFASTA

    # Keep only the sequence text (index 1) of every FASTA record.
    sequences = [record[1] for record in ReadFASTA('data/rosalind_mult.txt')]

    # Align all sequences and lay the returned pieces out one per line.
    answer = '\n'.join(multiple_alignment(sequences))

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/085_MULT.txt', 'w') as handle:
        handle.write(answer)
Example #9
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Computing GC Content
Rosalind ID: GC
Rosalind #: 005
URL: http://rosalind.info/problems/gc/
'''

from scripts import ReadFASTA

# Our data is in FASTA form.
dna_list = ReadFASTA('data/rosalind_gc.txt')

highest_GC = -1
highest_GC_name = ''
for index, dna_seq in enumerate(dna_list):
    GC_count = 0
    for nucleotide in dna_seq[1]:
        if nucleotide == 'G' or nucleotide == 'C':
            GC_count += 1
            
    GC_amount = ( (GC_count*100.0)/len(dna_seq[1]) )
    if GC_amount > highest_GC:
        highest_GC = GC_amount
        highest_GC_name = dna_list[index][0]


# Print the solution.
print highest_GC_name, '\n', highest_GC
Example #10
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Open Reading Frames
Rosalind ID: ORF
Rosalind #: 018
URL: http://rosalind.info/problems/orf/
'''

from scripts import ReadFASTA, ReverseComplementDNA, ProteinDictDNA

# Collect candidate proteins from the input strand and from its reverse complement.
dna_list = [ReadFASTA('data/rosalind_orf.txt')[0][1]]
dna_list.append(ReverseComplementDNA(dna_list[0]))
dna_dict = ProteinDictDNA()

# Use a set since we want to return distinct protein.
# Sets keep track of distinct elements without us needing to worry about adding duplicates.
protein_orf = set()
for dna in dna_list:
    # Every position that can start a full 3-base window is tried as a reading-frame start.
    for i in range(len(dna) - 2):
        # Check for the Start codon.
        if dna[i:i + 3] == 'ATG':
            # Use a new index since we'll want to return to the ith position of the strand in case there are multiple start codons in a row.
            j = i
            current_protein = ''
            # Continue, if necessary, until we hit the end of the DNA sequence.
            # NOTE(review): this bound stops once j + 3 reaches len(dna) - 1, so a codon
            # ending on either of the last two bases is never examined -- confirm whether
            # `j + 3 <= len(dna)` was intended.
            while j + 3 < len(dna) - 1:
                # Add the protein and break if we hit a Stop codon.
                # NOTE(review): the rest of this loop body (the break, the non-Stop branch
                # and the advance of j) is not visible in this excerpt.
                if dna_dict[dna[j:j + 3]] == 'Stop':
                    protein_orf.add(current_protein)
Example #11
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Locating Restriction Sites
Rosalind ID: REVP
Rosalind #: 021
URL: http://rosalind.info/problems/revp/
'''

from scripts import ReadFASTA, ReverseComplementDNA

dna = ReadFASTA('data/rosalind_revp.txt')[0][1]

# Scan every window of length 4..12 and keep those equal to their own reverse complement.
locations = []
for length in range(4, 13):
    window_count = len(dna) - length + 1
    for index in range(window_count):
        window = dna[index:index + length]
        if window != ReverseComplementDNA(window):
            continue
        # Positions are reported 1-based, followed by the window length.
        entry = '%d %d' % (index + 1, length)
        print(entry)
        locations.append(entry)

# Save one "position length" pair per line.
with open('output/021_REVP.txt', 'w') as output_data:
    output_data.writelines(entry + '\n' for entry in locations)
Example #12
0
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Maximum Matchings and RNA Secondary Structures 
Rosalind ID: MMCH
Rosalind #: 040
URL: http://rosalind.info/problems/mmch/
'''

from math import factorial
from scripts import ReadFASTA


def nPr(n, k):
    '''Returns the number of k-permutations of n, i.e. n! / (n - k)!.

    Floor division is used so the result stays an exact integer on every
    Python version; true division would yield a rounded float for large
    factorials. The quotient is always whole, so nothing is truncated.

    Raises ValueError (from math.factorial) when n or n - k is negative.
    '''
    return factorial(n) // factorial(n - k)


rna = ReadFASTA('data/rosalind_mmch.txt')[0][1]

# Tally the nucleotides of each base-pair family separately.
a_total, u_total = rna.count('A'), rna.count('U')
g_total, c_total = rna.count('G'), rna.count('C')

# Each family contributes nPr(larger count, smaller count); the answer is the product.
au_matchings = nPr(max(a_total, u_total), min(a_total, u_total))
gc_matchings = nPr(max(g_total, c_total), min(g_total, c_total))
max_matchings = au_matchings * gc_matchings

print(max_matchings)
with open('output/040_MMCH.txt', 'w') as output_data:
    output_data.write(str(max_matchings))
Example #13
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Transitions and Transversions
Rosalind ID: TRAN
Rosalind #: 031
URL: http://rosalind.info/problems/tran/
'''

from scripts import ReadFASTA

dna1, dna2 = [fasta[1] for fasta in ReadFASTA('data/rosalind_tran.txt')]

PURINES = ['A', 'G']
PYRIMIDINES = ['C', 'T']

# Float counters so the final ratio is a true division.
transitions = transversions = 0.0
for i in xrange(len(dna1)):
    base1, base2 = dna1[i], dna2[i]
    if base1 == base2:
        continue
    # Pick the group the second base belongs to, then test whether the first base shares it.
    same_group = PYRIMIDINES if base2 in PYRIMIDINES else PURINES
    if base1 in same_group:
        transitions += 1
    else:
        transversions += 1

ratio = transitions / transversions
print(ratio)
with open('output/031_TRAN.txt', 'w') as output_data:
    output_data.write(str(ratio))
Example #14
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Perfect Matchings and RNA Secondary Structures
Rosalind ID: PMCH
Rosalind #: 026
URL: http://rosalind.info/problems/pmch/
'''

from math import factorial
from scripts import ReadFASTA

rna = ReadFASTA('data/rosalind_pmch.txt')[0][1]

# The answer is (number of A)! times (number of C)!.
a_total = rna.count('A')
c_total = rna.count('C')
pmch = factorial(a_total) * factorial(c_total)
print(pmch)

with open('output/026_PMCH.txt', 'w') as output_data:
    output_data.write(str(pmch))
Example #15
0
                    subintervals.append([rna[1:i], rna[i + 1:]])

            if subintervals == []:
                # If we didn't find any subintervals, there are no possible noncrossing matchings.
                noncross_dict[rna] = 0
            else:
                # Reduce the problem to noncrossing matchings over the substrings.
                noncross_dict[rna] = sum([
                    Noncrossing(subint[0]) * Noncrossing(subint[1])
                    for subint in subintervals
                ]) % 1000000

            return noncross_dict[rna]


def check_subinterval(subint):
    '''Returns True when subint has as many A as U and as many C as G, otherwise False.'''
    au_balanced = subint.count('A') == subint.count('U')
    cg_balanced = subint.count('C') == subint.count('G')
    return au_balanced and cg_balanced


# Module-level state used by Noncrossing: the pairing table and the memo dictionary.
matchings = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'}
noncross_dict = {}

rna = ReadFASTA('data/rosalind_cat.txt')[0][1]
noncross = Noncrossing(rna)

# Echo the answer and persist it to disk.
print(noncross)
with open('output/033_CAT.txt', 'w') as output_file:
    output_file.write(str(noncross))
Example #16
0
from scripts import ReadFASTA


def reverseComplementDNA(acid):
    """Return the reversed, base-complemented form of acid with leading whitespace removed.

    A/T and G/C are swapped; any other character is carried over unchanged.
    Because the strip happens after the reversal, it removes what was
    trailing whitespace in the input.
    """
    partner = {"T": "A", "A": "T", "G": "C", "C": "G"}
    flipped = "".join(partner.get(base, base) for base in reversed(acid))
    return flipped.lstrip()


dna_list = ReadFASTA("data/corr_data.txt")

# Group identical reads together, treating a read and its reverse complement as the same.
dna_groups = []
for dna_tuple in dna_list:
    dna = dna_tuple[1]
    for group in dna_groups:
        if dna in group or reverseComplementDNA(str(dna)) in group:
            group.append(dna)
            break
    else:
        # No existing group matched: this read starts a new group.
        dna_groups.append([dna])

# Prepend two empty buckets: index 0 collects the groups seen more than once,
# index 1 collects the reads seen exactly once.
# (The previous `+=` appended the buckets plus a second copy of every group,
# so indices 0 and 1 were ordinary groups rather than the empty buckets.)
dna_groups = [[], []] + dna_groups

# Drain the groups from the end into the two buckets until only the buckets remain.
while len(dna_groups) > 2:
    group = dna_groups.pop()
    if len(group) > 1:
        dna_groups[0].append(group)
    else:
        dna_groups[1] += group
Example #17
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Creating a Distance Matrix
Rosalind ID: PDST
Rosalind #: 041
URL: http://rosalind.info/problems/PDST/
'''

from numpy import zeros
from scripts import ReadFASTA

dna_list = [fasta[1] for fasta in ReadFASTA('data/rosalind_pdst.txt')]

# All sequences have the same length.
dna_len = len(dna_list[0])
total = len(dna_list)

M = zeros((total, total))
for i in range(total):
    # Fill the upper triangle and mirror each entry; the diagonal stays at zero.
    for j in range(i + 1, total):
        for k in range(dna_len):
            # Every mismatching position adds 1/dna_len to the entry.
            if dna_list[i][k] != dna_list[j][k]:
                M[i][j] += 1.0 / dna_len
        M[j][i] = M[i][j]

print(M)
Example #18
0
        return 1

    else:
        # If we've already computed the value, return it!
        if rna in noncross_dict:
            return noncross_dict[rna]
        # Otherwise, calculate the value, add it to the dictionary, and return it.
        else:
            subintervals = []
            for i in xrange(1, len(rna)):
                if rna[0] == matchings[rna[i]]:
                    subintervals.append([rna[1:i], rna[i + 1:]])

            # Reduce the problem to noncrossing matchings over the matching substrings, and the matchings for the next starting point.
            noncross_dict[rna] = (sum([
                Noncrossing(subint[0]) * Noncrossing(subint[1])
                for subint in subintervals
            ]) + Noncrossing(rna[1:])) % 1000000

            return noncross_dict[rna]


# Module-level state used by Noncrossing: the memo dictionary and the pairing table.
noncross_dict = {}
matchings = {'A': 'U', 'U': 'A', 'C': 'G', 'G': 'C'}

rna = ReadFASTA('data/rosalind_motz.txt')[0][1]
noncross = Noncrossing(rna)

# Echo the answer and persist it to disk.
print(noncross)
with open('output/048_MOTZ.txt', 'w') as output_file:
    output_file.write(str(noncross))
Example #19
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Maximum Matchings and RNA Secondary Structures 
Rosalind ID: MMCH
Rosalind #: 040
URL: http://rosalind.info/problems/mmch/
'''

from math import factorial
from scripts import ReadFASTA

def nPr(n, k):
    '''Returns the number of k-permutations of n, i.e. n! / (n - k)!.

    Floor division is used so the result stays an exact integer on every
    Python version; true division would yield a rounded float for large
    factorials. The quotient is always whole, so nothing is truncated.

    Raises ValueError (from math.factorial) when n or n - k is negative.
    '''
    return factorial(n) // factorial(n - k)

rna = ReadFASTA('data/rosalind_mmch.txt')[0][1]

# For each base-pair family, multiply in nPr(larger count, smaller count).
max_matchings = 1
for first, second in ('AU', 'GC'):
    smaller, larger = sorted((rna.count(first), rna.count(second)))
    max_matchings *= nPr(larger, smaller)

print(max_matchings)
with open('output/040_MMCH.txt', 'w') as output_data:
    output_data.write(str(max_matchings))
Example #20
0
from scripts import ReadFASTA
from protein_map import ProteinDictDNA

dna_list = ReadFASTA('data/rna_splicing.txt')

# The first record holds the full strand; every later record is an intron to delete.
exon = dna_list[0][1]
for intron in dna_list[1:]:
    exon = exon.replace(intron[1], '')

proteinDict = ProteinDictDNA()

# Translate codon by codon from the very first base. The walk previously began
# at index 3, which silently dropped the first codon from the protein.
exon_protein = ''
for index in range(0, len(exon), 3):
    p = proteinDict[exon[index:index + 3]]
    # Entries the table labels 'Stop' contribute nothing to the protein string.
    if p != 'Stop':
        exon_protein += p

print(exon_protein)
Example #21
0
from scripts import ReadFASTA

DNA1, DNA2 = [fasta[1] for fasta in ReadFASTA('data/rosalind_tran.txt')]

# Float counters so the final ratio is a true division.
transitions = transversions = 0.0
for i in xrange(len(DNA1)):
    first, second = DNA1[i], DNA2[i]
    if first != second:
        # The second base selects its group; a substitution within that group is a transition.
        if second in ['C', 'T']:
            within_group = first in ['C', 'T']
        else:
            within_group = first in ['A', 'G']

        if within_group:
            transitions += 1
        else:
            transversions += 1

ratio = transitions / transversions
print(ratio)
with open('output/031_TRAN.txt', 'w') as output_data:
    output_data.write(str(ratio))
Example #22
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: k-Mer Composition
Rosalind ID: KMER
Rosalind #: 036
URL: http://rosalind.info/problems/kmer/
'''

from itertools import product
from scripts import ReadFASTA

dna = ReadFASTA('data/rosalind_kmer.txt')[0][1]

# Every 4-mer over ACGT, in the lexicographic order product() yields them.
kmer_list = [''.join(letters) for letters in product('ACGT', repeat=4)]

# One counter per 4-mer, all starting at zero.
kmer_count = [0] * len(kmer_list)

# Slide a window of width 4 along the strand and bump the matching counter.
for start in range(len(dna) - 3):
    slot = kmer_list.index(dna[start:start + 4])
    kmer_count[slot] += 1

composition = ' '.join(str(total) for total in kmer_count)
print(composition)
with open('output/036_KMER.txt', 'w') as output_data:
    output_data.write(composition)
Example #23
0
    # Backtrack to start of the local alignment starting at the highest scoring cell.
    while backtrack[i][j] != 3 and i * j != 0:
        if backtrack[i][j] == 0:
            i -= 1
        elif backtrack[i][j] == 1:
            j -= 1
        elif backtrack[i][j] == 2:
            i -= 1
            j -= 1

    # Cut the strings at the ending point of the backtrack.
    v_aligned = v_aligned[i:]
    w_aligned = w_aligned[j:]

    return max_score, v_aligned, w_aligned


if __name__ == '__main__':

    # Keep only the sequence text (index 1) of the two FASTA records.
    first, second = [record[1] for record in ReadFASTA('input/rosalind_loca.txt')]

    # Run the local alignment with the PAM250 matrix and the constant 5 (sigma in the problem statement).
    answer = '\n'.join(local_alignment(first, second, PAM250(), 5))

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/local_alignment.txt', 'w') as handle:
        handle.write(answer)
Example #24
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Error Correction in Reads
Rosalind ID: CORR
Rosalind #: 034
URL: http://rosalind.info/problems/corr/
'''

from scripts import ReadFASTA, ReverseComplementDNA as RevComp, HammingDistance as Hamm

# Group together identical DNA sequences, up to reverse complement.
# Group together identical DNA sequences, up to reverse complement.
dna_groups = []
for dna in [fasta[1] for fasta in ReadFASTA('data/rosalind_corr.txt')]:
    in_group = False
    for index, group in enumerate(dna_groups):
        # A read joins the first group that already holds it or its reverse complement.
        if dna in group or RevComp(dna) in group:
            dna_groups[index].append(dna)
            in_group = True
            break

    # No existing group matched: the read starts a new group of its own.
    if not in_group:
        dna_groups.append([dna])

# Sort the DNA groups as either being a correct read in index 0, or incorrect read in index 1.
# Two empty buckets are prepended, then groups are popped from the end into them.
dna_groups = [[], []] + dna_groups
while len(dna_groups) > 2:
    if len(dna_groups[len(dna_groups) - 1]) > 1:
        # NOTE(review): the popped group is appended as a list; no conversion to a set
        # happens on this line. The rest of the loop is not visible in this excerpt.
        dna_groups[0].append(dna_groups.pop(len(dna_groups) - 1))
Example #25
0
    # Backtrack to start of the local alignment starting at the highest scoring cell.
    while backtrack[i][j] != 3 and i * j != 0:
        if backtrack[i][j] == 0:
            i -= 1
        elif backtrack[i][j] == 1:
            j -= 1
        elif backtrack[i][j] == 2:
            i -= 1
            j -= 1

    # Cut the strings at the ending point of the backtrack.
    v_aligned = v_aligned[i:]
    w_aligned = w_aligned[j:]

    return max_score, v_aligned, w_aligned


if __name__ == '__main__':

    # Keep only the sequence text (index 1) of the two FASTA records.
    sequences = [record[1] for record in ReadFASTA('data/rosalind_loca.txt')]
    s, t = sequences

    # Run the local alignment with the PAM250 matrix and the constant 5 (sigma in the problem statement).
    pieces = local_alignment(s, t, PAM250(), 5)
    answer = '\n'.join(pieces)

    # Echo the answer and persist the same text to disk.
    print(answer)
    with open('output/081_LOCA.txt', 'w') as handle:
        handle.write(answer)
from scripts import ReadFASTA, ProteinDictDNA

dna_list = ReadFASTA('../data/rosalind_splc.txt')

# The first record holds the full strand; every later record is an intron.
exon = dna_list[0][1]

# Remove the introns, echoing each intron record as it is processed.
for intron in dna_list[1:]:
    print(intron)
    exon = exon.replace(intron[1], '')

# Translate the exons three bases at a time, leaving out anything the table maps to 'Stop'.
dna_dict = ProteinDictDNA()
exon_protein = ''
for index in range(0, len(exon), 3):
    residue = dna_dict[exon[index:index + 3]]
    if residue == 'Stop':
        continue
    exon_protein += residue

print(exon_protein)
with open('../data/SPLC.txt', 'w') as output_data:
    output_data.write(exon_protein)
Example #27
0
	'''Extracts all substrings from the first string in a list, and sends longest substring candidates to be checked.'''
	longest = ''
	for start_index in xrange(len(string_list[0])):
		for end_index in xrange(len(string_list[0]), start_index, -1):
			# Break if the length becomes too small, as it will only get smaller.
			if end_index - start_index <= len(longest):
				break
			elif CheckSubstring(string_list[0][start_index:end_index], string_list):
				longest =  string_list[0][start_index:end_index]

	return longest

def CheckSubstring(find_string, string_list):
    'Returns True when find_string occurs in every member of string_list, otherwise False.'
    # A candidate that is too short can never contain find_string, so it fails immediately.
    return all(
        len(candidate) >= len(find_string) and find_string in candidate
        for candidate in string_list
    )


if __name__ == '__main__':
    # Keep only the sequence text (index 1) of every FASTA record.
    dna = [record[1] for record in ReadFASTA('data/rosalind_lcsm.txt')]

    # Echo the result of LongestSubstring and persist it to disk.
    lcsm = LongestSubstring(dna)
    print(lcsm)
    with open('output/014_LCSM.txt', 'w') as result_file:
        result_file.write(lcsm)
Example #28
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Finding a Spliced Motif
Rosalind ID: SSEQ
Rosalind #: 030
URL: http://rosalind.info/problems/sseq/
'''

from scripts import ReadFASTA

dna, sub_seq = [fasta[1] for fasta in ReadFASTA('data/rosalind_sseq.txt')]

positions = []
cursor = 0
for symbol in sub_seq:
    # Advance to the next occurrence of the wanted symbol.
    # No bounds check: running off the end of dna raises IndexError when no match remains.
    while dna[cursor] != symbol:
        cursor += 1

    # Record the 1-based position, then step past it so positions strictly increase.
    positions.append(str(cursor + 1))
    cursor += 1

answer = ' '.join(positions)
print(answer)
with open('output/030_SSEQ.txt', 'w') as output_data:
    output_data.write(answer)
from scripts import ReadFASTA
# from string import maketrans

def reverseComplementDNA(dna):
    """Return the reverse complement of the DNA string dna.

    Each base is swapped with its partner (A<->T, G<->C) and the result is
    reversed. Characters outside ATGC are carried over unchanged.

    The previous version only complemented the string and never reversed it,
    so it returned the complement instead of the reverse complement.
    """
    complement_table = str.maketrans("ATGC", "TACG")
    return dna.translate(complement_table)[::-1]

# Load the FASTA records; index 1 of each record is used below as the sequence.
dna_list = ReadFASTA('data/corr.txt')
dna_dict = {}

# Seed a dictionary keyed by sequence with every value starting at zero.
# NOTE(review): the slice [1:] leaves the first record out -- confirm that is intended.
for dna in dna_list[1:]: 
    dna_dict[dna[1]] = 0

# Unfinished attempt at grouping the reads.
dna_groups = []
for dna_tuple in dna_list:
    in_group = False
    dna = dna_tuple[1]
    # NOTE(review): `dna` was just taken from index 1 of the record, so `dna[1]` indexes
    # into it again (presumably yielding one character) -- probably `dna` was meant; confirm.
    complementDNA = reverseComplementDNA(dna[1])
    for index, group in enumerate(dna_groups):
        # NOTE(review): the statement below is incomplete (no colon, no body), so this
        # script does not parse as written.
        if in_group


# Debug output of the seeded dictionary.
print(dna_dict)



# Debug call of the helper on a fixed string.
print(reverseComplementDNA("AGGGGGA"))

Example #30
0
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.

Problem Title: Consensus and Profile
Rosalind ID: CONS
Rosalind #: 010
URL: http://rosalind.info/problems/cons/
'''

from numpy import zeros
from scripts import ReadFASTA

# Data is in FASTA form
dna_list = ReadFASTA('data/rosalind_cons.txt')

# Setup an array and count into the array
M = zeros((4, len(dna_list[0][1])), dtype=int)
snp_dict = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
for dna in dna_list:
    for index, snp in enumerate(dna[1]):
        M[snp_dict[snp]][index] += 1

# Determine the consensus string
consensus = ''
to_snp = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
for i in range(0, len(dna_list[0][1])):
    maxval = [-1, -1]
    for j in range(0, 4):
        if maxval[1] < M[j][i]:
            maxval = [j, M[j][i]]