def main(argv):
    """Print the overlap graph for the FASTA file named by ``argv[0]``.

    Nodes are built with an overlap length of 3.  Every edge is written on
    its own line as two space-separated items.
    """
    records = fasta.read(argv[0])
    overlap_nodes = graphs.labeled_overlap_nodes(records, 3)
    lines = ['%s %s' % edge for edge in graphs.overlap_edges(overlap_nodes)]
    print('\n'.join(lines))
def read_data_from_fasta(filepath):
    """Read the FASTA file at ``filepath`` and return its records as a list.

    NOTE(review): the element type is whatever ``fasta.read`` produces for an
    open file handle; that module is not visible here -- confirm before
    relying on the record layout.
    """
    with open(filepath) as handle:
        # Build the list while the file is still open, in case the reader
        # is lazy.
        return list(fasta.read(handle))
def main(argv):
    """Print how many sequences in ``argv[0]`` equal their reverse complement.

    Each value returned by ``fasta.read`` is wrapped in a Biopython ``Seq``
    with the unambiguous DNA alphabet, reverse-complemented, and compared
    with the original string.
    """
    strings = fasta.read(argv[0])
    count = 0
    for string in strings.values():
        reverse = Seq(string, IUPAC.unambiguous_dna).reverse_complement()
        # Compare plain strings: the result of ``str == Seq`` depends on the
        # Biopython release (older releases compared Seq objects by identity),
        # so converting first keeps the count correct everywhere.
        if string == str(reverse):
            count += 1
    print(count)
def main(argv):
    """Print a consensus string and the per-base profile for ``argv[0]``.

    ``statistics`` is expected to return one mapping per column, keyed by
    base.  The first output line joins the highest-count key of each column.
    The next four lines list the counts for A, C, G and T in that order.
    """
    records = fasta.read(argv[0])
    stats = statistics(records.values())

    consensus = [max(column, key=column.get) for column in stats]
    print(''.join(consensus))

    for base in ('A', 'C', 'G', 'T'):
        counts = ' '.join(str(column[base]) for column in stats)
        print('%s: %s' % (base, counts))
def sequence(trees, mp="", ml=""):
    """
    Add a sequence to the nodes of a tree.

    The tree is obtained from newick.read(trees).  Each tree value is a list
    whose last item is overwritten with the matching sequence.  When the MP
    alignment is non-empty it takes precedence and the ML alignment is
    ignored.

    @param trees: passed unchanged to newick.read().  The original note
        described it as a dictionary in the form of
        {id : [parent, name, offspring, support, length, level, sequence]};
        that layout matches how the parsed tree is indexed below, not
        necessarily this argument -- TODO confirm
    @param mp: string, a string refer to a filename of MSA file (for MP method
        only), only contain aligned protein sequences for terminal nodes
    @param ml: string, a string refer to a filename of MSA file (for ML method
        only), contains aligned protein sequences both for all terminal nodes
        and internal nodes
    @return: dictionary, in the form of
        {id : [parent, name, offspring, support, length, level, sequence]}
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm that ``elif ml_sequence`` pairs
    # with ``if mp_sequence``.
    tree = newick.read(trees)
    # Both files are read unconditionally, even when the argument is "".
    mp_sequence = fasta.read(mp)
    ml_sequence = fasta.read(ml)
    if mp_sequence:
        # Tips are the nodes whose offspring entry (index 2) is empty.
        tips = [v[1] for k, v in tree.items() if not v[2]]
        taxa = mp_sequence.keys()
        # Only fill in sequences when every tip name is in the alignment.
        if not set(tips).difference(set(taxa)):
            for k, v in tree.items():
                if v[1] in tips:
                    v[-1] = mp_sequence[v[1]]
                    tree[k] = v
    elif ml_sequence:
        s = ml_sequence
        # Find the node named "root" (IndexError if there is none) and set
        # its support entry (index 3) to "N1".
        support = [v for v in tree.values() if v[1] == "root"][0]
        ids = [k for k, v in tree.items() if v[1] == "root"][0]
        support[3] = "N1"
        tree[ids] = support
        for k, v in tree.items():
            # Look the node up by its support entry first, then by name.
            if v[3] in s:
                v[-1] = s[v[3]]
                tree[k] = v
            elif v[1] in s:
                v[-1] = s[v[1]]
                tree[k] = v
    return tree
def main():
    """Write the record with the highest GC content to the output file.

    Reads ``data/rosalind_gc.txt`` through ``fasta.read`` and writes
    ``"<name> <GC percentage>"`` to ``output/rosalind_gc.txt``.  Empty records
    are skipped.  If no record has any G or C, the name is the empty string
    and the percentage is 0.0.
    """
    import collections

    import fasta

    # Read the input data.
    entries = fasta.read('data/rosalind_gc.txt')

    found_entry = ""
    best_fraction = 0.0
    for entry in entries:
        entry_counts = collections.Counter(entries[entry])
        total = sum(entry_counts.values())
        if not total:
            # An empty sequence has no defined GC content.
            continue
        gc = entry_counts['G'] + entry_counts['C']
        # Force float division: under Python 2, int / int truncates to 0 for
        # every record that is not pure G/C.
        fraction = float(gc) / total
        if fraction > best_fraction:
            best_fraction = fraction
            found_entry = entry

    with open('output/rosalind_gc.txt', 'w') as output_data:
        output_data.write("{0} {1}".format(found_entry, best_fraction * 100.0))
'''
Created on Dec 10, 2012

@author: Carl Raymond
'''
import contextlib
import re
import urllib

import fasta

# Finds (overlapping) occurrences of the pattern N{P}[ST]{P} in a sequence.
# Matches can overlap, so the motif sits inside a zero-width positive
# lookahead: the scan then advances one character at a time instead of
# jumping past each match.
pattern = "(?=(N[^P][ST][^P]))"
motif = re.compile(pattern)

with open("rosalind_mprt.txt") as seqlist:
    for name in seqlist:
        name = name.strip()
        if not name:
            # Skip blank lines rather than requesting an empty identifier.
            continue
        url = "http://www.uniprot.org/uniprot/" + name + ".fasta"
        # Python 2 response objects are not context managers, so closing()
        # guarantees the connection is released.  Parsing stays inside the
        # block in case fasta.read is lazy.
        with contextlib.closing(urllib.urlopen(url)) as response:
            for desc, sequence in fasta.read(response):
                positions = [match.start()
                             for match in motif.finditer(sequence)]
                if positions:
                    print(name)
                    # Report 1-based positions on one space-separated line.
                    print(' '.join(str(pos + 1) for pos in positions))
def main(argv):
    """Print the shortest superstring of the sequences in ``argv[0]``."""
    records = fasta.read(argv[0])
    print(strings.shortest_superstring(records.values()))
from fasta import read


def isDistance(seq1, seq2, n):
    """Return True when seq1 and seq2 differ in exactly n aligned positions.

    Only the positions paired up by zip() are compared.  The scan stops as
    soon as the mismatch count exceeds n, which is cheaper than computing the
    full Hamming distance when only small distances matter.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left == right:
            continue
        mismatches += 1
        if mismatches > n:
            return False
    return mismatches == n


with open("rosalind_corr.txt") as spec:
    rawdata = [seq.strip() for name, seq in read(spec)]

# Map each distinct read to the number of times it occurs.
reads = {}
for seq in rawdata:
    reads[seq] = reads.get(seq, 0) + 1

print("Original data: {0} sequences.".format(len(rawdata)))
print("Distinct reads: {0} sequences.".format(len(reads)))

readsum = sum(reads.values())
print("Total multiplicity: {0}".format(readsum))
'''
Created on Jan 5, 2013

@author: Carl Raymond
'''
from fasta import read


def pdist(seq1, seq2):
    """Return the fraction of aligned positions where seq1 and seq2 differ.

    The denominator is the length of the shorter sequence.
    """
    shorter = min(len(seq1), len(seq2))
    mismatches = 0
    for b1, b2 in zip(seq1, seq2):
        if b1 != b2:
            mismatches += 1
    return float(mismatches) / shorter


with open("rosalind_pdst.txt") as spec:
    data = list(read(spec))

print(data)
n = len(data)
seqlen = len(data[0][1])

# Full pairwise distance matrix, one row per record.
dist = []
for name1, seq1 in data:
    dist.append([pdist(seq1, seq2) for name2, seq2 in data])

for row in dist:
    print(' '.join(str(elem) for elem in row))
import sys

import fasta

BASES = 'ACGT'


def matrix(m):
    """Return the profile of the sequences stored as values of ``m``.

    The result holds four lists, one each for A, C, G and T in that order.
    Entry ``i`` counts how many sequences have that base at position ``i``.
    The profile width is the length of the first sequence.  An empty mapping
    gives four empty lists.
    """
    seqs = list(m.values())
    if not seqs:
        return [[] for _ in BASES]
    width = len(seqs[0])
    return [[sum(1 for seq in seqs if seq[i] == base) for i in range(width)]
            for base in BASES]


def cons(m):
    """Return the consensus string for a profile ``m`` (rows A, C, G, T).

    Ties go to the earliest base in A, C, G, T order.
    """
    letters = []
    for i in range(len(m[0])):
        column = [row[i] for row in m[:4]]
        # index() finds the first maximum, which gives the A > C > G > T
        # tie-break.
        letters.append(BASES[column.index(max(column))])
    return ''.join(letters)


# ``sys`` must be imported here: the script reads its FASTA input from stdin.
m = fasta.read(sys.stdin)
[a, c, g, t] = matrix(m)
print(cons([a, c, g, t]))
print("A: " + ' '.join(map(str, a)))
print("C: " + ' '.join(map(str, c)))
print("G: " + ' '.join(map(str, g)))
print("T: " + ' '.join(map(str, t)))
def main(argv):
    """Print the longest substring shared by the sequences in ``argv[0]``."""
    records = fasta.read(argv[0])
    print(strings.longest_common_substring(records.values()))
from fasta import read
import math

# Only the first record of the input file is used.
with open("rosalind_mmch.txt") as spec:
    name, seq = next(read(spec))

count = {'A': 0, 'U': 0, 'C': 0, 'G': 0}

# Count each nucleotide (any other symbol raises KeyError).
for n in seq:
    count[n] += 1

print(count)

au_max = max(count['A'], count['U'])
au_min = min(count['A'], count['U'])
cg_max = max(count['C'], count['G'])
cg_min = min(count['C'], count['G'])

print("AU: max: {0}, min: {1}".format(au_max, au_min))
print("CG: max: {0}, min: {1}".format(cg_max, cg_min))

# A and U can be mapped in au_max! / (au_max - au_min)! ways.
# C and G can be mapped in cg_max! / (cg_max - cg_min)! ways.
# The quotients are exact, so floor division keeps them as arbitrary-
# precision integers instead of floats under true division.
au_matchings = math.factorial(au_max) // math.factorial(au_max - au_min)
cg_matchings = math.factorial(cg_max) // math.factorial(cg_max - cg_min)

print("au_matchings: {0}, cg_matchings: {1}".format(au_matchings,
                                                    cg_matchings))

total = au_matchings * cg_matchings
print("Total matchings: {0}".format(total))
import fasta

# Load every record; each node is indexed as (label, sequence) below.
with open("rosalind_grph.txt") as data:
    nodes = list(fasta.read(data))

# Write one "left right" line for every ordered pair of distinct nodes where
# the last three characters of the left sequence equal the first three of
# the right sequence.
with open("rosalind_grph.out", "w+") as output:
    for l in nodes:
        for r in nodes:
            if l == r:
                continue
            if l[1][-3:] == r[1][:3]:
                output.write("{0} {1}\n".format(l[0], r[0]))
def main(argv):
    """Print the longest common subsequence of the two sequences in ``argv[0]``.

    The file must contain exactly two records; otherwise the unpacking fails.
    """
    records = fasta.read(argv[0])
    first, second = records.values()
    print(strings.longest_common_subsequence(first, second))
''' Created on Nov. 24 2015 @author: Carl J. Raymond ''' # Solves both EDIT and EDTA. from fasta import read with open("data/rosalind_edta.txt") as spec: data_s, data_t = read(spec) _, S = data_s len_s = len(S) _, T = data_t len_t = len(T) print "S (length {0}): {1}".format(len_s, S) print "T (length {0}): {1}".format(len_t, T) cost_gap_S = 1 cost_gap_T = 1 cost_substitute = 1 # Allocate and initialize the cost array. Each cell is a tuple consiting of a # cost and an operation token that describes how we got to this place from # the previous step. The token 0 means that the characters matched; 1 means they # didn't match, and the choice is to substitute; 2 means that a position # in S was skipped; 3 means a postion in T was skipped. # Cost[0][j] = (j,3) for all j and cost[i][0] = (i,2) for all i. This # represents the cost of a gap from positions 1..i of S or T.
import sys
import fasta


def overlap(m, k):
    """Return the directed overlap pairs among the sequences in mapping ``m``.

    Every unordered pair of keys is examined once.  ``(a, b)`` is reported
    when the last ``k`` characters of ``m[a]`` equal the first ``k``
    characters of ``m[b]``.  Both directions are tested, so a pair can yield
    two entries.
    """
    names = list(m)
    pairs = []
    for i, first in enumerate(names):
        s = m[first]
        for second in names[i + 1:]:
            t = m[second]
            if s[-k:] == t[:k]:
                pairs.append((first, second))
            if s[:k] == t[-k:]:
                pairs.append((second, first))
    return pairs


# Read FASTA from stdin and print each overlap edge as "source target".
l = overlap(fasta.read(sys.stdin), 3)
for (s, t) in l:
    print(s + ' ' + t)
'''
Created on Dec 10, 2012

@author: Carl Raymond
'''
import contextlib
import re
import urllib

import fasta

# Finds (overlapping) occurrences of the pattern N{P}[ST]{P} in a sequence.
# The motif is wrapped in a zero-width positive lookahead so that matches
# starting inside an earlier match are still reported.
pattern = "(?=(N[^P][ST][^P]))"
motif = re.compile(pattern)

with open("rosalind_mprt.txt") as seqlist:
    for name in seqlist:
        name = name.strip()
        if not name:
            # A blank line would otherwise produce a request with no id.
            continue
        url = "http://www.uniprot.org/uniprot/" + name + ".fasta"
        # Close the connection deterministically; the Python 2 response
        # object does not support ``with`` on its own.  Records are parsed
        # before closing in case fasta.read is lazy.
        with contextlib.closing(urllib.urlopen(url)) as response:
            for desc, sequence in fasta.read(response):
                positions = [match.start()
                             for match in motif.finditer(sequence)]
                if positions:
                    print(name)
                    # 1-based match positions, space separated.
                    print(' '.join(str(pos + 1) for pos in positions))
import fasta

# Collect all records up front; node[0] is the label, node[1] the sequence.
with open("rosalind_grph.txt") as data:
    nodes = list(fasta.read(data))

# Emit an edge "left right" whenever two different nodes share a 3-character
# suffix/prefix overlap.
with open("rosalind_grph.out", "w+") as output:
    for l in nodes:
        suffix = l[1][-3:]
        for r in nodes:
            if l != r and suffix == r[1][:3]:
                output.write("{0} {1}\n".format(l[0], r[0]))
'''
Created on Jan 4, 2013

@author: Carl Raymond
'''
from fasta import read

# Maps each base to the base it forms a transition with (A<->G, C<->T).
complements = {'A': 'G', 'C': 'T', 'G': 'A', 'T': 'C'}

# Only the sequence part of the first two records is used.
with open("rosalind_tran.txt") as spec:
    reader = read(spec)
    seq1 = next(reader)[1]
    seq2 = next(reader)[1]

ts = 0
tv = 0

for (b1, b2) in zip(seq1, seq2):
    if b1 != b2:
        # A mismatch is a transition when the bases are partners in the
        # table above; any other mismatch is a transversion.
        if complements[b2] == b1:
            ts += 1
        else:
            tv += 1

print("Transitions: {0}".format(ts))
print("Transversions: {0}".format(tv))
print("Ratio: {0:5}".format(float(ts) / tv))
def isDistance(seq1, seq2, n):
    """Tell whether seq1 and seq2 mismatch in exactly n aligned positions.

    Positions are paired with zip().  The loop stops early once more than n
    mismatches have been seen, which is faster than computing the full
    distance when only small distances are of interest.
    """
    differing = 0
    for c1, c2 in zip(seq1, seq2):
        if c1 == c2:
            continue
        differing += 1
        if differing > n:
            return False
    return differing == n


with open("rosalind_corr.txt") as spec:
    rawdata = [seq.strip() for name, seq in read(spec)]

# Dictionary with reads as keys and occurrence counts as values.
reads = {}
for seq in rawdata:
    reads[seq] = reads.get(seq, 0) + 1

print("Original data: {0} sequences.".format(len(rawdata)))
print("Distinct reads: {0} sequences.".format(len(reads)))

readsum = sum(reads.values())
print("Total multiplicity: {0}".format(readsum))
''' Created on Jan 8, 2013 @author: Carl Raymond ''' from fasta import read with open("rosalind_kmp.txt") as spec: name, seq = read(spec).next() n = len(seq) print "Length:", n failure = [0] * n #failure[0] = 0 # No. of matches seen so far m = 0 k = 1 while k < n: if seq[k] == seq[m]: m += 1 failure[k] = m k += 1 elif m > 0: #print "Backtrack at {0} where m = {1}".format(k, m); m = failure[m - 1] #print "New m = {0}".format(m); else:
def gc(seq):
    """Return the GC content of ``seq`` as a percentage.

    Only the symbols A, C, G and T count towards the length; anything else
    is ignored.  A sequence with none of these symbols yields 0.0 instead of
    dividing by zero.
    """
    gc_count = 0
    length = 0
    for n in seq:
        if n in ("C", "G"):
            gc_count += 1
            length += 1
        elif n in ("A", "T"):
            length += 1
    if not length:
        return 0.0
    return 100.0 * float(gc_count) / float(length)


def analyze(sequences):
    """Yield ``(label, GC percentage)`` for each ``(label, sequence)`` record."""
    for seq in sequences:
        yield (seq[0], gc(seq[1]))


def maxgc(seq1, seq2):
    """Return whichever pair has the larger percentage; ties pick ``seq2``."""
    return seq1 if (seq1[1] > seq2[1]) else seq2


with open("rosalind_gc.txt") as data:
    result = reduce(maxgc, analyze(fasta.read(data)))

print(result[0])
# NOTE(review): "4f" is a minimum width of 4 with the default six decimals;
# presumably ".4f" was intended -- confirm before changing the output.
print("{0:4f}%".format(result[1]))

# Wait for Enter before exiting.
raw_input()
'''
Created on Jan 4, 2013

@author: Carl Raymond
'''
from fasta import read

# Transition partner of each base: purine <-> purine, pyrimidine <-> pyrimidine.
complements = {'A': 'G', 'C': 'T', 'G': 'A', 'T': 'C'}

# Take the sequences of the first two records in the file.
with open("rosalind_tran.txt") as spec:
    reader = read(spec)
    seq1 = next(reader)[1]
    seq2 = next(reader)[1]

ts = 0
tv = 0

for (b1, b2) in zip(seq1, seq2):
    if b1 == b2:
        continue
    if complements[b2] == b1:
        ts += 1
    else:
        tv += 1

print("Transitions: {0}".format(ts))
print("Transversions: {0}".format(tv))
print("Ratio: {0:5}".format(float(ts) / tv))
def main(argv):
    """Print the edit distance between the two sequences in ``argv[0]``.

    The file must contain exactly two records; otherwise the unpacking fails.
    """
    records = fasta.read(argv[0])
    first, second = records.values()
    print(distance.edit(first, second))
import fasta


def gc(seq):
    """Return the percentage of G and C among the A/C/G/T symbols of ``seq``.

    Symbols other than A, C, G and T are skipped.  When the sequence has no
    countable symbol the result is 0.0 rather than a ZeroDivisionError.
    """
    gc_count = 0
    length = 0
    for n in seq:
        if n in ('C', 'G'):
            gc_count += 1
            length += 1
        elif n in ('A', 'T'):
            length += 1
    if not length:
        return 0.0
    return 100.0 * float(gc_count) / float(length)


def analyze(sequences):
    """Yield ``(label, GC percentage)`` for each ``(label, sequence)`` record."""
    for seq in sequences:
        yield (seq[0], gc(seq[1]))


def maxgc(seq1, seq2):
    """Return the pair with the larger percentage; on a tie return ``seq2``."""
    return seq1 if (seq1[1] > seq2[1]) else seq2


with open("rosalind_gc.txt") as data:
    result = reduce(maxgc, analyze(fasta.read(data)))

print(result[0])
# NOTE(review): "4f" sets a minimum width, not four decimals; presumably
# ".4f" was meant -- confirm before changing the output.
print("{0:4f}%".format(result[1]))

# Wait for Enter before exiting.
raw_input()
def main(argv):
    """Print the transition/transversion ratio of the two sequences in ``argv[0]``.

    The file must contain exactly two records; otherwise the unpacking fails.
    """
    records = fasta.read(argv[0])
    first, second = records.values()
    print(distance.tt_ratio(first, second))