Esempio n. 1
0
def protein_strings(s):
    '''Yield the candidate protein strings encoded by the DNA string s.

    Both s and revc(s) are scanned, each in three reading frames (offsets 0, 1
    and 2). Within a frame every start codon opens a candidate; when a stop
    codon is reached while at least one candidate is open, one string is
    yielded per open start codon (in order of start position) and all
    candidates are closed. Start codons that are never followed by a stop
    codon in their frame yield nothing. May yield duplicates.'''
    last_codon_pos = len(s) - 3
    for strand in (s, revc(s)):
        for frame in xrange(3):
            protein, length, open_starts = '', 0, []
            for pos in xrange(frame, last_codon_pos + 1, 3):
                codon = strand[pos:pos + 3]
                if codon == DNA_START_CODON:
                    # Record where in the running translation this start codon falls.
                    open_starts.append(length)
                if not open_starts:
                    # No start codon seen since the last stop: nothing to translate.
                    continue
                if codon in DNA_STOP_CODONS:
                    for offset in open_starts:
                        yield protein[offset:]
                    # Close all candidates and start over after the stop codon.
                    protein, length, open_starts = '', 0, []
                else:
                    protein += DNA_TRANSLATION[codon]
                    length += 1
Esempio n. 2
0
def classify(s):
    '''Split the reads in the collection s into correct and incorrect ones.

    Each read x is standardized to y = min(x, ro.revc(x)), i.e. the
    lexicographically smaller of the read and its reverse complement. A read
    counts as correct when its standardized form occurs at least twice in s.

    Returns a pair (correct, incorrect): the list of standardized forms that
    occur at least twice, and the list of original strings of s (in input
    order) whose standardized form occurs only once.'''
    d = {}
    for i, x in enumerate(s):
        y = min(x, ro.revc(x))  # y = standardized form of x
        d.setdefault(y, []).append(i)
    # Indices of all reads whose standardized form occurs at least twice.
    repeated = [i for X in d.values() if len(X) >= 2 for i in X]
    correct = np.zeros(len(s), dtype=bool)
    # Explicit integer dtype: an empty list would otherwise become a float
    # array, which NumPy rejects as an index.
    correct[np.array(repeated, dtype=int)] = True
    return ([y for (y, X) in d.items() if len(X) >= 2],
            [s[i] for i in np.where(~correct)[0]])
Esempio n. 3
0
'''
============================================================
http://rosalind.info/problems/revc

Given: A DNA string s of length at most 1000 bp.
Return: The reverse complement s^c of s.
============================================================
'''
from rosalind.rosutil import read_str, revc

if __name__ == "__main__":
    # Print the reverse complement of the DNA string read from the data file.
    # Other inputs used previously: 'rosalind_revc_sample.dat', 'rosalind_revc.dat'.
    # The parenthesized single-argument form is valid on both Python 2 and 3.
    print(revc(read_str('rosalind_revc_1b.dat')))
Esempio n. 4
0
def one_h(f):
    '''Solve the problem for the input file f.

    The first line of f is the string s; the second line holds the two
    integers k and d. The counter returned by ro.possible_kmers_counter(s, k, d)
    is added to a copy of itself keyed by reverse complement, and the
    most frequent entries of the sum are returned via ro.join_list.'''
    lines = ro.read_lines(f)
    s = lines[0]
    k, d = [int(token) for token in lines[1].split()]
    forward = ro.possible_kmers_counter(s, k, d)
    # Same counts, but credited to the reverse complement of each k-mer.
    reverse = Counter()
    for kmer, count in forward.iteritems():
        reverse[ro.revc(kmer)] = count
    combined = forward + reverse
    return ro.join_list(ro.most_frequent(combined))
Esempio n. 5
0
def rvco(f):
    '''Count the values yielded by ro.fafsa_itervalues(f) that are equal to
    their own reverse complement.'''
    matches = 0
    for seq in ro.fafsa_itervalues(f):
        if seq == ro.revc(seq):
            matches += 1
    return matches
Esempio n. 6
0
(so that every edge in the cycle is traversed in the same
direction).

For a set of DNA strings S and a positive integer k, let S_k
denote the collection of all possible k-mers of the strings
in S.

Given: A collection S of (error-free) reads of equal length (not exceeding 50 bp). In this dataset, for some positive integer k, the de Bruijn graph B_k on S_{k+1} U S^rc_{k+1} consists of exactly two directed cycles.

Return: A cyclic superstring of minimal length containing every read or its reverse complement.
============================================================
'''
import rosalind.rosutil as ro, networkx as nx, itertools as it

def revc_set(S):
    '''Return the list of reverse complements ro.revc(u) of the strings u in S,
    in iteration order. Despite the name, the result is a list, not a set.'''
    return [ro.revc(u) for u in S]

def db_graph(S, SC, k):
    '''Build the directed de-Bruijn graph B_k of the reads in S together with
    the reads in SC (the reverse complements of S).

    Every string r produced by ro.kmers(u, k + 1), for u in S followed by SC,
    contributes one edge from r[:-1] to r[1:].'''
    def edges():
        # Lazily produce one (prefix, suffix) edge per (k + 1)-mer.
        for read in it.chain(S, SC):
            for kmer in ro.kmers(read, k + 1):
                yield kmer[:-1], kmer[1:]

    return nx.from_edgelist(edges(), create_using=nx.DiGraph())

def cyclic_strings(g):
    '''Generate all cyclic strings in the de-Bruijn graph g if it consists of a collection of cycles.
    If not, returns nothing.'''
    if not all(g.out_degree(u) == 1 for u in g): return
    g, V = g.copy(), set(g.nodes_iter())
    print 'k', len(g.nodes_iter().next()), 'nodes', g.number_of_nodes(), 'edges', g.number_of_edges()
    while V:  # Loop over all cycles until graph is empty
        #print 'V', V