def main(argv):
    """Print the edges of the overlap graph built from the records in argv[0].

    argv[0] is handed to fasta.read, the result is converted to labeled
    nodes (the literal 3 is presumably the overlap length -- confirm against
    graphs.labeled_overlap_nodes) and every edge is printed on its own line
    as two space-separated fields.
    """
    records = fasta.read(argv[0])
    labeled = graphs.labeled_overlap_nodes(records, 3)
    lines = ['%s %s' % edge for edge in graphs.overlap_edges(labeled)]

    # A single parenthesised argument prints exactly like the print statement.
    print('\n'.join(lines))
Example #2
0
def read_data_from_fasta(filepath):
    """Open *filepath*, parse it with fasta.read and return the result as a list."""
    with open(filepath) as handle:
        # NOTE(review): what fasta.read returns is not visible here; it is
        # assumed to be iterable, so it is materialised into a list while the
        # file is still open.
        return list(fasta.read(handle))
Example #3
0
def main(argv):
    """Print how many records in argv[0] equal their own reverse complement."""
    records = fasta.read(argv[0])

    # Each value is compared against the reverse complement of a Seq built
    # from it with the unambiguous DNA alphabet.
    count = sum(
        1
        for dna in records.values()
        if dna == Seq(dna, IUPAC.unambiguous_dna).reverse_complement()
    )

    print(count)
Example #4
0
def main(argv):
    """Print a consensus string followed by one count row per base.

    ``statistics`` is applied to the sequences read from argv[0]; its result
    is used as a sequence of per-column mappings from base to count.
    """
    strings = fasta.read(argv[0])
    stats = statistics(strings.values())

    # The consensus takes, for every column, the key with the largest count.
    consensus = [max(column, key=column.get) for column in stats]
    print(''.join(consensus))

    for base in ('A', 'C', 'G', 'T'):
        counts = ' '.join(str(column[base]) for column in stats)
        print('%s: %s' % (base, counts))
Example #5
0
def sequence(trees, mp="", ml=""):
    """
    Attach a sequence to the nodes of a tree

    @param trees: value handed to newick.read; the parsed tree is used as a mapping of the form {id : [parent, name, offspring, support, length, level, sequence]}
    @param mp: string, filename of an MSA file (MP method only) holding aligned protein sequences for terminal nodes only
    @param ml: string, filename of an MSA file (ML method only) holding aligned protein sequences for both terminal and internal nodes
    @return: dictionary, in the form of {id : [parent, name, offspring, support, length, level, sequence]}
    """

    tree = newick.read(trees)
    mp_sequence = fasta.read(mp)
    ml_sequence = fasta.read(ml)

    if mp_sequence:
        # Terminal nodes are the ones without offspring.
        tip_names = [node[1] for node in tree.values() if not node[2]]
        missing = set(tip_names).difference(set(mp_sequence.keys()))
        # Sequences are only attached when every tip has one.
        if not missing:
            for node_id, node in tree.items():
                if node[1] in tip_names:
                    node[-1] = mp_sequence[node[1]]
                    tree[node_id] = node
    elif ml_sequence:
        # The first node named "root" gets the support label "N1" so that it
        # can be looked up in the ML sequences like any other node.
        root_id, root = [
            (node_id, node) for node_id, node in tree.items()
            if node[1] == "root"
        ][0]
        root[3] = "N1"
        tree[root_id] = root

        for node_id, node in tree.items():
            # Prefer the support label as lookup key, fall back to the name.
            if node[3] in ml_sequence:
                key = node[3]
            elif node[1] in ml_sequence:
                key = node[1]
            else:
                continue
            node[-1] = ml_sequence[key]
            tree[node_id] = node
    return tree
Example #6
0
def main():
    """Find the record with the highest GC content and write it to a file.

    Reads ``data/rosalind_gc.txt`` through ``fasta.read`` and writes the
    winning record's name and its GC percentage, separated by a space, to
    ``output/rosalind_gc.txt``.  Records without any characters are skipped
    so they cannot cause a division by zero.
    """
    import fasta
    import collections

    # Read the input data.
    entries = fasta.read('data/rosalind_gc.txt')

    found_entry = ""
    best_ratio = 0
    for entry in entries:
        entry_counts = collections.Counter(entries[entry])
        total = sum(entry_counts.values())
        if not total:
            continue
        # float() forces true division; with two integers the quotient is
        # floored to 0 wherever "/" means integer division.
        ratio = float(entry_counts['G'] + entry_counts['C']) / total
        if ratio > best_ratio:
            best_ratio = ratio
            found_entry = entry

    with open('output/rosalind_gc.txt', 'w') as output_data:
        output_data.write("{0} {1}".format(found_entry, best_ratio * 100.0))
Example #7
0
'''
Created on Dec 10, 2012

@author: Carl Raymond
'''

import urllib
import re
import fasta
"""Finds (overlapping) occurrences of pattern N{P}[ST]{P} in a sequence"""

# Matches can overlap, so use a positive lookahead assertion (tricky!)
pattern = "(?=(N[^P][ST][^P]))"

# For every identifier listed in the input file, fetch its FASTA record and
# print the identifier followed by the 1-based start of each motif match.
with open("rosalind_mprt.txt") as seqlist:
    for name in seqlist:
        name = name.strip()
        data = urllib.urlopen("http://www.uniprot.org/uniprot/" + name +
                              ".fasta")
        # "data" is rebound to each sequence inside the loop; the response
        # object has already been handed to fasta.read by then.
        for desc, data in fasta.read(data):
            positions = [
                match.start() for match in re.finditer(pattern, data)
            ]
            if positions:
                print(name)
                print(' '.join(str(pos + 1) for pos in positions))
def main(argv):
    """Print the shortest superstring of the sequences read from argv[0]."""
    records = fasta.read(argv[0])
    superstring = strings.shortest_superstring(records.values())

    print(superstring)
Example #9
0
from fasta import read

# Returns true when the Hamming distance between seq1 and seq2
# is exactly n.  Faster than computing the distance first when
# we're only interested in small distances.
def isDistance(seq1, seq2, n):
    """Return True when seq1 and seq2 differ in exactly n paired positions.

    Positions are paired with zip, so only the common prefix length is
    compared.  The scan stops as soon as more than n mismatches are seen.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left == right:
            continue
        mismatches += 1
        if mismatches > n:
            return False
    return mismatches == n


# Load the sequences, discarding the record names and surrounding whitespace.
with open("rosalind_corr.txt") as spec:
    rawdata = [seq.strip() for name, seq in read(spec)]

# Map every distinct read to the number of times it occurs.
reads = {}
for seq in rawdata:
    reads[seq] = reads.get(seq, 0) + 1

print("Original data: {0} sequences.".format(len(rawdata)))
print("Distinct reads: {0} sequences.".format(len(reads)))

readsum = sum(reads.values())
print("Total multiplicity: {0}".format(readsum))
Example #10
0
'''
Created on Jan 5, 2013

@author: Carl Raymond
'''

from fasta import read

def pdist(seq1, seq2):
    """Return the fraction of paired positions at which the sequences differ.

    The mismatch count over the zipped sequences is divided by the length of
    the shorter sequence.
    """
    shorter = min(len(seq1), len(seq2))
    mismatches = 0
    for b1, b2 in zip(seq1, seq2):
        if b1 != b2:
            mismatches += 1
    return float(mismatches) / shorter
        
    
with open("rosalind_pdst.txt") as spec:
    data = list(read(spec))

print(data)

n = len(data)
seqlen = len(data[0][1])

# dist[i][j] holds pdist between the sequences of record i and record j.
dist = []
for name1, seq1 in data:
    dist.append([pdist(seq1, seq2) for name2, seq2 in data])

# One output line per row, values separated by single spaces.
for row in dist:
    print(' '.join(str(elem) for elem in row))

Example #11
0
import fasta

def matrix(m):
  """Count the bases A, C, G and T at every position of the sequences in m.

  m is a mapping whose values are equally long sequences.  The result is a
  list of four rows in A, C, G, T order; entry i of a row is the number of
  sequences whose i-th character equals that row's base.  An empty mapping
  yields four empty rows.  The length of the first sequence fixes the number
  of columns, so a shorter later sequence raises IndexError.
  """
  # Materialise the values once: this works for list- and view-returning
  # mappings alike and avoids fetching them again for every cell.
  seqs = list(m.values())
  if not seqs:
    return [[] for _ in 'ACGT']
  n = len(seqs[0])
  return [[sum(1 for s in seqs if s[i] == b) for i in range(n)]
          for b in 'ACGT']

def cons(m):
  """Return the consensus string for a four-row count matrix.

  m[0]..m[3] are the per-position counts for A, C, G and T.  For every
  position the base with the largest count is chosen; ties go to the first
  base in A, C, G, T order.
  """
  picks = []
  for i in range(len(m[0])):
    column = (m[0][i], m[1][i], m[2][i], m[3][i])
    # index() returns the first maximum, which keeps the A < C < G < T
    # tie-breaking order.
    picks.append('ACGT'[column.index(max(column))])
  # Joining once avoids rebuilding the string on every iteration.
  return ''.join(picks)

import sys  # sys.stdin is used below and sys is not imported before this point

# Read the records from standard input, then print the consensus string
# followed by one count row per base.
m = fasta.read(sys.stdin)
a, c, g, t = matrix(m)
print(cons([a, c, g, t]))
for label, row in (("A", a), ("C", c), ("G", g), ("T", t)):
  print(label + ": " + ' '.join(map(str, row)))
Example #12
0
def main(argv):
    """Print the longest common substring of the sequences read from argv[0]."""
    sequences = fasta.read(argv[0]).values()
    result = strings.longest_common_substring(sequences)

    print(result)
Example #13
0
from fasta import read
import math

# Only the first record of the input file is used.
with open("rosalind_mmch.txt") as spec:
    name, seq = next(read(spec))

# Tally each nucleotide; any symbol outside A/U/C/G raises KeyError.
count = {'A': 0, 'U': 0, 'C': 0, 'G': 0}
for base in seq:
    count[base] += 1

print(count)

au_min, au_max = sorted((count['A'], count['U']))
cg_min, cg_max = sorted((count['C'], count['G']))

print("AU: max: {0},  min: {1}".format(au_max, au_min))
print("CG: max: {0},  min: {1}".format(cg_max, cg_min))

# A and U can be mapped in au_max! / (au_max - au_min)! ways,
# C and G in cg_max! / (cg_max - cg_min)! ways.  Both quotients are whole
# numbers, so floor division is exact.
au_matchings = math.factorial(au_max) // math.factorial(au_max - au_min)
cg_matchings = math.factorial(cg_max) // math.factorial(cg_max - cg_min)

print("au_matchings: {0}, cg_matchings: {1}".format(au_matchings,
                                                     cg_matchings))

total = au_matchings * cg_matchings
print("Total matchings: {0}".format(total))
Example #14
0
import fasta

# Collect every record produced by fasta.read.
nodes = []
with open("rosalind_grph.txt") as data:
    nodes.extend(fasta.read(data))

# Write "left right" for each ordered pair of different records where the
# last three characters of the left sequence equal the first three of the
# right one.
with open("rosalind_grph.out", "w+") as output:
    edges = []
    for l in nodes:
        for r in nodes:
            if l != r and l[1][-3:] == r[1][:3]:
                edges.append("{0} {1}\n".format(l[0], r[0]))
    output.writelines(edges)

#raw_input()
def main(argv):
    """Print the longest common subsequence of the two sequences in argv[0].

    The file must yield exactly two values, otherwise the unpacking fails.
    """
    first, second = fasta.read(argv[0]).values()
    subsequence = strings.longest_common_subsequence(first, second)

    print(subsequence)
Example #16
0
'''
Created on Nov. 24 2015
@author: Carl J. Raymond
'''

# Solves both EDIT and EDTA.

from fasta import read


# The input must yield exactly two records.
with open("data/rosalind_edta.txt") as spec:
    data_s, data_t = read(spec)

# Each record is a two-element pair; the second element is the text whose
# length is measured and printed below.
_, S = data_s
_, T = data_t
len_s, len_t = len(S), len(T)
print("S (length {0}): {1}".format(len_s, S))
print("T (length {0}): {1}".format(len_t, T))

# Unit cost for a gap in either string and for a substitution.
cost_gap_S = cost_gap_T = cost_substitute = 1

# Allocate and initialize the cost array. Each cell is a tuple consisting of a
# cost and an operation token that describes how we got to this place from
# the previous step. The token 0 means that the characters matched; 1 means they
# didn't match, and the choice is to substitute; 2 means that a position
# in S was skipped; 3 means a position in T was skipped.
# cost[0][j] = (j,3) for all j and cost[i][0] = (i,2) for all i. This
# represents the cost of a gap from positions 1..i of S or T.
Example #17
0
import sys
import fasta

def overlap(m, k):
  """Return the directed overlap edges between the sequences in m.

  m maps names to sequences.  For every unordered pair of names, taken in
  the mapping's key order, the edge (a, b) is reported when the last k
  characters of a's sequence equal the first k characters of b's sequence;
  both directions are checked.  The result is a list of (from, to) tuples.
  """
  # Copy the keys into a list so they can be sliced even when keys()
  # returns a view.
  names = list(m.keys())
  edges = []
  for i, a in enumerate(names):
    s = m[a]
    # Only later names are paired with a, so each pair is visited once.
    for b in names[i + 1:]:
      t = m[b]
      if s[-k:] == t[:k]:
        edges.append((a, b))
      if s[:k] == t[-k:]:
        edges.append((b, a))
  return edges

# Print each overlap edge found in the records read from standard input
# as "from to".
l = overlap(fasta.read(sys.stdin), 3)
for edge in l:
  print(' '.join(edge))
Example #18
0
'''
Created on Dec 10, 2012

@author: Carl Raymond
'''

import urllib
import re
import fasta

"""Finds (overlapping) occurrences of pattern N{P}[ST]{P} in a sequence"""

# Matches can overlap, so use a positive lookahead assertion (tricky!)
pattern = "(?=(N[^P][ST][^P]))"

# Each line of the input file is an identifier; its FASTA record is fetched
# and the 1-based start of every motif match is printed under the identifier.
with open("rosalind_mprt.txt") as seqlist:
    for name in seqlist:
        name = name.strip()
        url = "http://www.uniprot.org/uniprot/" + name + ".fasta"
        data = urllib.urlopen(url)
        # "data" is reused for each parsed sequence, as before.
        for desc, data in fasta.read(data):
            positions = [match.start() for match in re.finditer(pattern, data)]
            if not positions:
                continue
            print(name)
            print(' '.join(str(pos + 1) for pos in positions))
Example #19
0
import fasta;


# Read all records up front.
with open("rosalind_grph.txt") as data:
    nodes = []
    for node in fasta.read(data):
        nodes.append(node)

# Emit one "left right" line per ordered pair of different records whose
# sequences overlap by three characters (suffix of left, prefix of right).
with open("rosalind_grph.out", "w+") as output:
    pairs = [
        (l, r)
        for l in nodes
        for r in nodes
        if l != r and l[1][-3:] == r[1][:3]
    ]
    for l, r in pairs:
        output.write("{0} {1}\n".format(l[0], r[0]))

#raw_input()
Example #20
0
'''
Created on Jan 4, 2013

@author: Carl Raymond
'''

from fasta import read

# Maps each base to the base that differs from it by a transition
# (A<->G, C<->T).
complements = {'A': 'G', 'C': 'T', 'G': 'A', 'T': 'C'}

# The first two records of the input are compared.
with open("rosalind_tran.txt") as spec:
    reader = read(spec)
    seq1 = next(reader)[1]
    seq2 = next(reader)[1]

ts = 0
tv = 0

for b1, b2 in zip(seq1, seq2):
    if b1 == b2:
        continue
    if b1 == complements[b2]:
        ts += 1
    else:
        tv += 1

print("Transitions: {0}".format(ts))
print("Transversions: {0}".format(tv))
print("Ratio: {0:5}".format(float(ts) / tv))
Example #21
0

# Returns true when the Hamming distance between seq1 and seq2
# is exactly n.  Faster than computing the distance first when
# we're only interested in small distances.
def isDistance(seq1, seq2, n):
    """Tell whether exactly n of the zipped positions of seq1 and seq2 differ.

    The budget of n mismatches is counted down; running below zero ends the
    scan early with False.
    """
    remaining = n
    for c1, c2 in zip(seq1, seq2):
        if c1 != c2:
            remaining -= 1
            if remaining < 0:
                return False
    return remaining == 0


# Keep only the stripped sequence of every record.
with open("rosalind_corr.txt") as spec:
    rawdata = [seq.strip() for name, seq in read(spec)]

# Count how often each distinct read occurs.
reads = {}
for seq in rawdata:
    reads.setdefault(seq, 0)
    reads[seq] += 1

print("Original data: {0} sequences.".format(len(rawdata)))
print("Distinct reads: {0} sequences.".format(len(reads)))

readsum = sum(reads.values())
print("Total multiplicity: {0}".format(readsum))
Example #22
0
'''
Created on Jan 8, 2013

@author: Carl Raymond
'''

from fasta import read

with open("rosalind_kmp.txt") as spec:
    name, seq = read(spec).next()

n = len(seq)
print "Length:", n

failure = [0] * n
#failure[0] = 0

# No. of matches seen so far
m = 0

k = 1
while k < n:
    if seq[k] == seq[m]:
        m += 1
        failure[k] = m
        k += 1
    elif m > 0:
        #print "Backtrack at {0} where m = {1}".format(k, m);
        m = failure[m - 1]
        #print "New m = {0}".format(m);
    else:
Example #23
0

def gc(seq):
    """Return the percentage of C and G among the A, C, G and T symbols of seq.

    Symbols other than A, C, G and T are ignored entirely.
    """
    at_count = 0
    gc_count = 0
    for base in seq:
        if base in ("A", "T"):
            at_count += 1
        elif base in ("C", "G"):
            gc_count += 1
    return 100.0 * float(gc_count) / float(at_count + gc_count)


def analyze(sequences):
    """Yield (first element, gc of second element) for each record in turn."""
    for record in sequences:
        label = record[0]
        content = gc(record[1])
        yield (label, content)


def maxgc(seq1, seq2):
    """Return whichever pair has the larger second element; ties go to seq2."""
    if seq1[1] > seq2[1]:
        return seq1
    return seq2


# Score every record and keep the one with the highest GC percentage.
with open("rosalind_gc.txt") as data:
    scored = analyze(fasta.read(data))
    result = reduce(maxgc, scored)
    print(result[0])
    print("{0:4f}%".format(result[1]))

# Wait for Enter before the script exits.
raw_input()
Example #24
0
'''
Created on Jan 4, 2013

@author: Carl Raymond
'''

from fasta import read

# For each base, the base it turns into under a transition (A<->G, C<->T).
complements = {'A': 'G', 'C': 'T', 'G': 'A', 'T': 'C'}

# Only the first two records are read.
with open("rosalind_tran.txt") as spec:
    reader = read(spec)
    seq1 = next(reader)[1]
    seq2 = next(reader)[1]

ts = 0
tv = 0

# Classify every differing position as a transition or a transversion.
for b1, b2 in zip(seq1, seq2):
    if b1 != b2:
        if b1 == complements[b2]:
            ts += 1
        else:
            tv += 1

print("Transitions: {0}".format(ts))
print("Transversions: {0}".format(tv))
print("Ratio: {0:5}".format(float(ts) / tv))
def main(argv):
    """Print distance.edit for the two sequences read from argv[0].

    The file must yield exactly two values, otherwise the unpacking fails.
    """
    first, second = fasta.read(argv[0]).values()
    result = distance.edit(first, second)

    print(result)
Example #26
0
import fasta;

def gc(seq):
    """Return the GC percentage of seq, counting only A, C, G and T symbols."""
    strong = 0  # C or G
    weak = 0    # A or T
    for n in seq:
        if n == 'C' or n == 'G':
            strong += 1
        elif n == 'A' or n == 'T':
            weak += 1
    return 100.0 * float(strong) / float(strong + weak)

def analyze(sequences):
    """Yield a (first element, GC percentage of second element) pair per record."""
    for entry in sequences:
        label = entry[0]
        percent = gc(entry[1])
        yield (label, percent)

def maxgc(seq1, seq2):
    """Pick the pair with the strictly larger second element, else seq2."""
    first_is_larger = seq1[1] > seq2[1]
    if first_is_larger:
        return seq1
    return seq2

# Reduce the scored records to the single one with the highest GC percentage.
with open("rosalind_gc.txt") as data:
    records = fasta.read(data)
    result = reduce(maxgc, analyze(records))
    print(result[0])
    print("{0:4f}%".format(result[1]))

# Wait for Enter before the script exits.
raw_input()
def main(argv):
    """Print distance.tt_ratio for the two sequences read from argv[0].

    The file must yield exactly two values, otherwise the unpacking fails.
    """
    first, second = fasta.read(argv[0]).values()
    ratio = distance.tt_ratio(first, second)

    print(ratio)