def lcsq(): recs = rosalind_utils.read_fasta("rosalind_lcsq.txt") seqa, seqb = recs[0][1], recs[1][1] # return the set of all longest common subsesquences C = rosalind_utils.lcsq(seqa, seqb) print rosalind_utils.lcsq_len(C) print rosalind_utils.lcsq_backtrack(C, seqa, seqb, len(seqa), len(seqb))
def kmer(): seq = rosalind_utils.read_fasta("rosalind_kmer.txt")[0][1] rev_comp = rosalind_utils.reverse_complement(seq) for mer in itertools.product("ACGT", repeat=4): s = ''.join(mer) print overlapping_count(seq, s, 0), print ""
def gc(): records = rosalind_utils.read_fasta("rosalind_gc.txt") gc_contents = [(desc, rosalind_utils.gc_content(seq)) for desc, seq in records] max_gc_content = max(gc_contents, key=lambda x: x[1]) print max_gc_content[0] print max_gc_content[1] * 100
def pmch():
    """Return the matching count for the first record of "rosalind_pmch.txt".

    The result is num_possible_matchings(number of 'A') multiplied by
    num_possible_matchings(number of 'G').

    NOTE(review): only 'A' and 'G' are counted; this presumably relies on
    the input having as many 'U' as 'A' and as many 'C' as 'G' -- confirm
    against the problem statement. A sample input for manual testing is
    "AGCUAGUCAU".
    """
    rna = rosalind_utils.read_fasta("rosalind_pmch.txt")[0][1]
    adenine_total = rna.count('A')
    guanine_total = rna.count('G')
    return (num_possible_matchings(adenine_total) *
            num_possible_matchings(guanine_total))
def grph(): k = 3 recs = rosalind_utils.read_fasta("rosalind_grph.txt") for ena in recs: for enb in recs: if ena == enb: continue if ena[1][-k:] == enb[1][:k]: print ena[0], enb[0]
def revp(): # get the sequence of the only entry seq = rosalind_utils.read_fasta("rosalind_revp.txt")[0][1] for l in xrange(4, 13): # find all reverse palindromes of length l for i in range(len(seq) - l + 1): # if reverse palindrome, report the position and length if seq[i:i + l] == rosalind_utils.reverse_complement(seq[i:i + l]): print i + 1, l
def sseq(): recs = rosalind_utils.read_fasta("rosalind_sseq.txt") s = recs[0][1] t = recs[1][1] last_index = 0 # assuming t is a substring of s (not necessarily contiguously) for tlet in t: idx = s[last_index:].find(tlet) print last_index + idx + 1, last_index += idx + 1 print ""
def splc():
    """Return the protein obtained after removing introns from a DNA string.

    Reads "rosalind_splc.txt". The first record's sequence is the DNA
    string; every following record's sequence is an intron. Each intron is
    removed at most once (its first occurrence), longest intron first. The
    remainder is passed through rosalind_utils.transcribe and then
    rosalind_utils.translate.

    Returns:
        The translated string without its final character.
    """
    records = rosalind_utils.read_fasta("rosalind_splc.txt")
    sequences = [record[1] for record in records]

    dna = sequences[0]
    # NOTE(review): longest-first order presumably prevents a short intron
    # from being cut out of the middle of a longer one -- confirm.
    for intron in sorted(sequences[1:], key=len, reverse=True):
        dna = dna.replace(intron, "", 1)

    protein = rosalind_utils.translate(rosalind_utils.transcribe(dna))
    # NOTE(review): the dropped last character is presumably the stop-codon
    # marker emitted by translate -- confirm against rosalind_utils.
    return protein[:-1]
def splc():
    """Splice introns out of a DNA string and return the translated result.

    Input comes from "rosalind_splc.txt": the sequence of the first record
    is the DNA string and the sequences of all remaining records are the
    introns. Introns are processed from longest to shortest, and only the
    first occurrence of each is deleted. The spliced string is transcribed
    and translated with the rosalind_utils helpers of the same names.

    Returns:
        The translation with its last character removed.
    """
    all_seqs = [rec[1] for rec in rosalind_utils.read_fasta("rosalind_splc.txt")]
    spliced = all_seqs[0]

    introns = sorted(all_seqs[1:], key=len, reverse=True)
    for intron in introns:
        # count=1: delete only the first occurrence of this intron.
        spliced = spliced.replace(intron, "", 1)

    rna = rosalind_utils.transcribe(spliced)
    protein = rosalind_utils.translate(rna)
    # NOTE(review): presumably strips a trailing stop symbol produced by
    # translate -- confirm against rosalind_utils.
    return protein[:-1]
def long(): # recs contain the list of tuples (desc, sequence) recs = rosalind_utils.read_fasta("rosalind_long.txt") next = {} for reca in recs: for recb in recs: if reca == recb: continue min_overlap_req = min(len(seq(reca)), len(seq(recb))) if overlap(seq(reca), seq(recb)) > min_overlap_req / 2: next[reca] = recb # find the starting string (the one that has not in next.values()) sub = head([rec for rec in recs if rec not in next.values()]) merged = seq(sub) while sub in next: overlap_len = overlap(seq(sub), seq(next[sub])) merged += seq(next[sub])[overlap_len:] sub = next[sub] print merged
def cons(): # read sequences recs = rosalind_utils.read_fasta("rosalind_cons.txt") seqs = [rec[1] for rec in recs] matrix = [] for i in xrange(len(seqs[0])): d = {'A': 0, 'C': 0, 'G': 0, 'T': 0} for seq in seqs: d[seq[i]] += 1 matrix.append(d) # print consensus consensus = ''.join( max(col.iteritems(), key=operator.itemgetter(1))[0] for col in matrix) print consensus # print matrix print 'A:', ' '.join(str(col['A']) for col in matrix) print 'C:', ' '.join(str(col['C']) for col in matrix) print 'G:', ' '.join(str(col['G']) for col in matrix) print 'T:', ' '.join(str(col['T']) for col in matrix)
def cons(): # read sequences recs = rosalind_utils.read_fasta("rosalind_cons.txt") seqs = [rec[1] for rec in recs] matrix = [] for i in xrange(len(seqs[0])): d = {'A':0, 'C':0, 'G':0, 'T':0} for seq in seqs: d[seq[i]] += 1 matrix.append(d) # print consensus consensus = ''.join(max(col.iteritems(), key=operator.itemgetter(1))[0] for col in matrix) print consensus # print matrix print 'A:', ' '.join(str(col['A']) for col in matrix) print 'C:', ' '.join(str(col['C']) for col in matrix) print 'G:', ' '.join(str(col['G']) for col in matrix) print 'T:', ' '.join(str(col['T']) for col in matrix)
def tran():
    """Return the transition/transversion ratio of two aligned sequences.

    Reads "rosalind_tran.txt" and compares the sequences of its first two
    records position by position (up to the shorter length). A mismatch is
    a transition when both symbols are in "AG" or both are in "CT"; every
    other mismatch counts as a transversion.

    Returns:
        float: number of transitions divided by number of transversions.

    Raises:
        ZeroDivisionError: if no transversion is found.
    """
    records = rosalind_utils.read_fasta("rosalind_tran.txt")
    sequences = [record[1] for record in records]

    transitions = 0
    transversions = 0
    for left, right in zip(sequences[0], sequences[1]):
        if left == right:
            continue
        pair = set((left, right))
        # Same chemical class (purine/purine or pyrimidine/pyrimidine).
        if pair.issubset("AG") or pair.issubset("CT"):
            transitions += 1
        else:
            transversions += 1

    return float(transitions) / transversions
def tran():
    """Compute transitions divided by transversions for two sequences.

    The sequences are those of the first two records in
    "rosalind_tran.txt"; they are compared pairwise up to the length of the
    shorter one. Differing symbols of the same class ('A'/'G' or 'C'/'T')
    are transitions; all other differing pairs are transversions.

    Returns:
        float: the ratio transitions / transversions.

    Raises:
        ZeroDivisionError: if the sequences contain no transversion.
    """
    strands = [entry[1]
               for entry in rosalind_utils.read_fasta("rosalind_tran.txt")]

    # Map each base to its class; symbols outside ACGT have no class and
    # therefore always count as transversions when they differ.
    base_class = {'A': 'purine', 'G': 'purine',
                  'C': 'pyrimidine', 'T': 'pyrimidine'}

    same_class = 0
    cross_class = 0
    for x, y in zip(strands[0], strands[1]):
        if x == y:
            continue
        class_x = base_class.get(x)
        if class_x is not None and class_x == base_class.get(y):
            same_class += 1
        else:
            cross_class += 1

    return float(same_class) / cross_class
def gc(): records = rosalind_utils.read_fasta("rosalind_gc.txt") gc_contents = [(desc, rosalind_utils.gc_content(seq)) for desc,seq in records] max_gc_content = max(gc_contents, key=lambda x: x[1]) print max_gc_content[0] print max_gc_content[1]*100
def pdst(): recs = rosalind_utils.read_fasta("rosalind_pdst.txt") for reca in recs: for recb in recs: print p_distance(reca[1], recb[1]), print ""