Ejemplo n.º 1
0
def frmt(f):
    '''Print, in FASTA format, the shortest of the nucleotide records listed in file f.

    The whitespace-separated IDs read from f are fetched from the Entrez
    'nucleotide' database as FASTA and parsed with SeqIO. The record with the
    smallest len() is printed as a '>' + description header line followed by
    its sequence.

    The shortest record is selected with key=len, so records of equal length
    are never compared with each other; the first shortest record wins.
    Raises ValueError (from min) if no records are returned.
    '''
    ids = ' '.join(ro.read_str(f).split())
    handle = Entrez.efetch(db='nucleotide', id=ids, rettype='fasta')
    try:
        # Materialize the records before the handle is closed.
        records = list(SeqIO.parse(handle, 'fasta'))
    finally:
        handle.close()
    shortest = min(records, key=len)
    print('>' + shortest.description)
    print(shortest.seq)
Ejemplo n.º 2
0
def test_suffix_tree_weights(file_name_prefix):
    '''Check rm.suffix_tree_weights against a stored expected-output file.

    Reads <ROSALIND_HOME>/<prefix>.dat, computes the weights, writes the sorted
    result to <ROSALIND_HOME>/<prefix>.mine.out, and asserts that it equals the
    sorted contents of <ROSALIND_HOME>/<prefix>.out.
    '''
    home = ro.ROSALIND_HOME
    text = ro.read_str("%s/%s.dat" % (home, file_name_prefix))
    weights = np.array(list(rm.suffix_tree_weights(text)))
    reference = np.loadtxt("%s/%s.out" % (home, file_name_prefix), dtype=str)
    ordered = sorted(weights)
    # Save our own output next to the reference file to make diffs easy.
    np.savetxt("%s/%s.mine.out" % (home, file_name_prefix), np.array(ordered), fmt="%s")
    assert_equal(ordered, sorted(reference), "Wrong suffix tree weight list")
Ejemplo n.º 3
0
def orfr(f):
    '''Return the longest string yielded by distinct_protein_strings for the text in file f.

    Candidates are ranked as (length, string) pairs, so ties in length are
    broken in favour of the lexicographically largest string.
    '''
    candidates = distinct_protein_strings(ro.read_str(f))
    _, longest = max((len(p), p) for p in candidates)
    return longest
Ejemplo n.º 4
0
def one_e(f):
    '''Read the string in file f, pass it to prefix_skew_argmax, and return
    the result formatted by ro.join_list.'''
    text = ro.read_str(f)
    positions = prefix_skew_argmax(text)
    return ro.join_list(positions)
Ejemplo n.º 5
0
'''
============================================================
http://rosalind.info/problems/prot

The Genetic Code

Problem

The 20 commonly occurring amino acids are abbreviated by using 20 letters from the English alphabet (all letters except for B, J, O, U, X, and Z). Protein strings are constructed from these 20 symbols. Henceforth, the term genetic string will incorporate protein strings along with DNA strings and RNA strings.

The RNA codon table dictates the details regarding the encoding of specific codons into the amino acid alphabet.

Given: An RNA string s corresponding to a strand of mRNA (of length at most 10 kbp).

Return: The protein string encoded by s.
============================================================
'''
from rosalind.rosutil import read_str, RNA_TRANSLATION, STOP_VALUE
from itertools import takewhile

def mrna_to_protein(s):
    '''Convert an mRNA string to the protein string it encodes.

    s is read three bases at a time; each codon is looked up in
    RNA_TRANSLATION and translation stops at the first codon whose value is
    STOP_VALUE (the stop itself is not included in the result).

    A trailing fragment of fewer than three bases is ignored rather than
    looked up, so a string whose length is not a multiple of three no longer
    fails on the incomplete final codon. Raises KeyError for a complete codon
    that is missing from RNA_TRANSLATION.
    '''
    # len(s) - 2 is the last index at which a full three-base codon can start.
    codons = (s[i:i + 3] for i in range(0, len(s) - 2, 3))
    residues = takewhile(lambda v: v != STOP_VALUE,
                         (RNA_TRANSLATION[codon] for codon in codons))
    # A single join avoids rebuilding the growing string for every residue.
    return ''.join(residues)

if __name__ == "__main__":
    # Translate the sample data set first, then the full data set.
    for data_file in ('rosalind_prot_sample.dat', 'rosalind_prot.dat'):
        print(mrna_to_protein(read_str(data_file)))
Ejemplo n.º 6
0
def ling(f):
    '''Return ling_complexity of the string stored in file f.'''
    text = ro.read_str(f)
    return ling_complexity(text)
Ejemplo n.º 7
0
def dbpr(f):
    '''Return the items produced by protein_biological_processes for the
    string in file f, one per line.'''
    processes = protein_biological_processes(ro.read_str(f))
    return "\n".join(processes)
Ejemplo n.º 8
0
def mrep(f):
    '''Print each result of maximal_prefixes(text, 20) on its own line,
    where text is the string read from file f.'''
    text = ro.read_str(f)
    for repeat in maximal_prefixes(text, 20):
        print(repeat)
Ejemplo n.º 9
0
def need(f):
    '''Print the DNA sequences of the two IDs listed in file f, one per line.

    The file must contain exactly two whitespace-separated IDs; each is
    resolved with rd.dna_seq_of_id. Both lookups finish before anything is
    printed.
    '''
    first_id, second_id = ro.read_str(f).split()
    sequences = [rd.dna_seq_of_id(seq_id) for seq_id in (first_id, second_id)]
    for sequence in sequences:
        print(sequence)
Ejemplo n.º 10
0
#!/usr/bin/env python
'''
============================================================
http://rosalind.info/problems/dna/

Given: A DNA string s of length at most 1000 nt.

Return: Four integers (separated by spaces) counting the respective number of times that the symbols 'A', 'C', 'G', and 'T' occur in s.
============================================================
'''
from rosalind.rosutil import read_str

def histogram(s):
    '''Return the counts of 'A', 'C', 'G' and 'T' in s as a space-separated string.

    The counts are reported in the order A, C, G, T. A nucleotide that does
    not occur in s (including when s is empty) is reported as 0 instead of
    raising KeyError. Any other symbols in s are counted but not reported.
    '''
    counts = {}
    for symbol in s:
        counts[symbol] = counts.get(symbol, 0) + 1
    return ' '.join(str(counts.get(base, 0)) for base in 'ACGT')
         
if __name__ == "__main__":
    # Report nucleotide counts for the full data set, then the sample.
    for data_file in ('rosalind_dna.dat', 'rosalind_dna_sample.dat'):
        print(histogram(read_str(data_file)))
Ejemplo n.º 11
0
def swat(f):
    '''Print the two protein strings to be locally aligned, one per line.

    File f must contain exactly two whitespace-separated IDs; each is looked
    up with rd.protein_record and its .sequence attribute is printed. Both
    lookups finish before anything is printed.
    '''
    first_id, second_id = ro.read_str(f).split()
    sequences = [rd.protein_record(pid).sequence for pid in (first_id, second_id)]
    for sequence in sequences:
        print(sequence)
Ejemplo n.º 12
0
def suff(f):
    """Print every item yielded by rm.suffix_tree_weights for the string in
    file f, one per line."""
    text = ro.read_str(f)
    for weight in rm.suffix_tree_weights(text):
        print(weight)
Ejemplo n.º 13
0
'''
============================================================
http://rosalind.info/problems/prtm

Chaining the Amino Acids

Problem

In a weighted alphabet, every symbol is assigned a positive real number called a weight. A string formed from a weighted alphabet is called a weighted string, and its weight is equal to the sum of the weights of its symbols.

The standard weight assigned to each member of the 20-symbol amino acid alphabet is the monoisotopic mass of the corresponding amino acid.

Given: A protein string P of length at most 1000 aa.

Return: The total weight of P. Consult the monoisotopic mass table.
============================================================
'''
from rosalind.rosutil import read_str, aa_mass

def prtm(s):
    '''Return the total mass of the protein string s: the sum of aa_mass[x]
    over every symbol x in s.'''
    return sum(aa_mass[residue] for residue in s)

if __name__ == "__main__":
    # Full data set first (the original author noted the result 821.392),
    # then the sample data set.
    for data_file in ('rosalind_prtm.dat', 'rosalind_prtm_sample.dat'):
        print(prtm(read_str(data_file)))
    
Ejemplo n.º 14
0
Problem

For positive integers a and n, a modulo n (written a mod n in shorthand) is the remainder when a is divided by n. For example, 29 mod 11 = 7 because 29 = 11 x 2 + 7.

Modular arithmetic is the study of addition, subtraction, multiplication, and division with respect to the modulo operation. We say that a and b are congruent modulo n if a mod n = b mod n; in this case, we use the notation a = b (mod n).

Two useful facts in modular arithmetic are that if a = b (mod n) and c = d (mod n), then a + c = b + d (mod n) and a x c = b x d (mod n). To check your understanding of these rules, you may wish to verify these relationships for a=29, b=73, c=10, d=32, and n=11.

As you will see in this exercise, some Rosalind problems will ask for a (very large) integer solution modulo a smaller number to avoid the computational pitfalls that arise with storing such large numbers.

Given: A protein string of length at most 1000 aa.

Return: The total number of different RNA strings from which the protein could have been translated, modulo 1,000,000. (Don't neglect the importance of the stop codon in protein translation.)
============================================================
'''
import rosalind.rosutil as ro
from itertools import chain

# Inverse of ro.RNA_TRANSLATION: maps each translation value to the list of
# codons (keys of RNA_TRANSLATION) that produce it.
INV_CODON = {}
for codon, product in ro.RNA_TRANSLATION.iteritems():
    INV_CODON.setdefault(product, []).append(codon)

def mrna(s, r=1000000):
    '''Return, modulo r, the number of RNA strings that translate to protein s.

    Every symbol of s, followed by one terminating ro.STOP_VALUE, contributes
    a factor equal to the number of codons listed for it in INV_CODON; the
    factors are combined by ro.prod_mod.
    '''
    symbols = chain(s, [ro.STOP_VALUE])
    codon_counts = (len(INV_CODON[symbol]) for symbol in symbols)
    return ro.prod_mod(codon_counts, r)

if __name__ == "__main__":
    # Sample data set first, then the full data set.
    for data_file in ('rosalind_mrna_sample.dat', 'rosalind_mrna.dat'):
        print(mrna(ro.read_str(data_file)))
        
Ejemplo n.º 15
0
'''
============================================================
http://rosalind.info/problems/rnas

Given an RNA string s, we will augment the bonding graph of s by adding basepair edges connecting all occurrences of 'U' to all occurrences of 'G' in order to represent possible wobble base pairs.

We say that a matching in the bonding graph for s is valid if it is noncrossing (to prevent pseudoknots) and has the property that a basepair edge in the matching cannot connect symbols sj and sk unless k>=j+4 (to prevent nearby nucleotides from base pairing).

See Figure 1 for an example of a valid matching if we allow wobble base pairs. In this problem, we will wish to count all possible valid matchings in a given bonding graph; see Figure 2 for all possible valid matchings in a small bonding graph, assuming that we allow wobble base pairing.

Given: An RNA string s (of length at most 200 bp).

Return: The total number of distinct valid matchings of basepair edges in the bonding graph of s. Assume that wobble base pairing is allowed.
============================================================
'''
import rosalind.rosutil as ro

# Which letters can a letter bind to, assuming wobble bonding.
_BONDING = {'A':'U', 'U':'AG', 'C':'G', 'G':'CU'}


def _wobb(s, w):
    '''Return the number of non-crossing wobble matchings in the string s.

    w is the minimum index offset between two paired bases: s[0] may only be
    paired with s[i] for i >= w. The empty string has exactly one matching.
    Recursion goes through the memoized wrapper wobb.
    '''
    if not s:
        return 1
    # Matchings in which the first base stays unpaired.
    total = wobb(s[1:], w)
    # Matchings in which the first base pairs with s[i]: the bases enclosed by
    # the pair and the bases after it are matched independently.
    for i in xrange(w, len(s)):
        if s[i] in _BONDING[s[0]]:
            total += wobb(s[1:i], w) * wobb(s[i + 1:], w)
    return total


wobb = ro.memoize(_wobb)


def rnas(f):
    '''Main driver to solve this problem: count the matchings of the string in
    file f with a minimum pairing offset of 4.'''
    return wobb(ro.read_str(f), 4)

if __name__ == "__main__":
    # Run the two sample inputs, then the full data set.
    for data_file in ('rosalind_rnas_sample1.dat',
                      'rosalind_rnas_sample.dat',
                      'rosalind_rnas.dat'):
        print(rnas(data_file))
Ejemplo n.º 16
0
#!/usr/bin/env python
"""
============================================================
http://rosalind.info/problems/rna/

Given: A DNA string t having length at most 1000 nt.

Return: The transcribed RNA string of t.
============================================================
"""
from rosalind.rosutil import read_str

def transcribe(s):
    """Return s with every "T" replaced by "U"; all other symbols are kept."""
    return "".join("U" if base == "T" else base for base in s)
if __name__ == "__main__":
    # Sample data set first, then the full data set.
    for data_file in ("rosalind_rna_sample.dat", "rosalind_rna.dat"):
        print(transcribe(read_str(data_file)))
Ejemplo n.º 17
0
def mend(f):
    '''Parse the Newick string in file f, run geno_prob on the parsed tree,
    and return the result formatted by ro.join_list.'''
    tree = rt.parse_newick(ro.read_str(f))
    return ro.join_list(geno_prob(tree))
Ejemplo n.º 18
0
'''
============================================================
http://rosalind.info/problems/revc

Given: A DNA string s of length at most 1000 bp.
Return: The reverse complement sc of s.
============================================================
'''
from rosalind.rosutil import read_str, revc

if __name__ == "__main__":
    # Only the '1b' data set is run. Earlier, now-disabled runs used doctest,
    # rosalind_revc_sample.dat and rosalind_revc.dat.
    data_file = 'rosalind_revc_1b.dat'
    print(revc(read_str(data_file)))
Ejemplo n.º 19
0
def eubt(f):
    '''Print, one per line, the to_newick_str form of every tree that
    enumerate_trees yields for the whitespace-separated labels in file f.'''
    labels = ro.read_str(f).split()
    for tree in enumerate_trees(labels):
        print(to_newick_str(tree))