def calculate_motifs(input_seqs, input_seqs2, width):
    """Return the width-long motifs whose score p satisfies 0 < p <= 0.0001.

    Both files are read with ``fasta.read_seqs`` and tallied with
    ``get_quorums``.  Every key of the foreground tally is scored from
    three ``choose`` terms; keys that pass the threshold are reported.

    Args:
        input_seqs: path of the foreground sequence file.
        input_seqs2: path of the background sequence file.
        width: motif length handed to ``get_quorums``.

    Returns:
        A sorted list of strings, one per reported motif, formatted as
        ``motif<TAB>foreground tally<TAB>background tally<TAB>p``.
    """
    # Writes the same text as the original Python 2 ``print a, b``
    # statement; this spelling also parses on Python 3.
    print("%s %s" % (input_seqs, input_seqs2))

    # Context managers close both files; the original left them open.
    with open(input_seqs) as handle:
        fg_seqs = fasta.read_seqs(handle.readlines())
    with open(input_seqs2) as handle:
        bg_seqs = fasta.read_seqs(handle.readlines())

    # The tallies are passed to choose() as-is, so get_quorums is assumed
    # to return numeric counts here - TODO confirm against its definition.
    foreground = get_quorums(fg_seqs, width)
    background = get_quorums(bg_seqs, width)

    n_fg = len(fg_seqs)
    total = n_fg + len(bg_seqs)
    # Does not depend on the motif, so compute it once instead of per key.
    term3 = choose(total, n_fg)

    res_motifs = []
    for motif in foreground:
        fg_hits = foreground[motif]
        bg_hits = background[motif]
        term1 = choose(bg_hits, fg_hits)
        # NOTE(review): the second argument is len(foreground seqs) - 1,
        # not len(foreground seqs) - fg_hits as a textbook hypergeometric
        # term would use.  Left unchanged - confirm the intended formula.
        term2 = choose(total - bg_hits, n_fg - 1)
        p = (float(term1) * float(term2)) / term3
        if 0 < p <= 0.0001:
            res_motifs.append(
                '\t'.join([motif, str(fg_hits), str(bg_hits), str(p)]))
    res_motifs.sort()
    return res_motifs
def get_quorums(seqs, mlen):
    """Map every mlen-long substring to the sequences that contain it.

    Sequences are numbered 1, 2, 3, ... in iteration order.

    Args:
        seqs: iterable of sliceable sequences (e.g. strings).
        mlen: length of the substrings to collect.

    Returns:
        A ``defaultdict(set)`` keyed by substring; each value is the set of
        1-based sequence numbers in which that substring occurs.
    """
    quorum = defaultdict(set)
    for id_no, seq in enumerate(seqs, 1):
        # "+ 1" so the window ending on the last character is visited too;
        # range(len(seq) - mlen) stopped one start position short.
        for start in range(len(seq) - mlen + 1):
            quorum[seq[start:start + mlen]].add(id_no)
    return quorum


# Guarded so that importing this module for get_quorums() does not read
# sys.argv or touch the filesystem; running it as a script is unchanged.
if __name__ == "__main__":
    # Context managers close both input files once their lines are read.
    with open(sys.argv[1]) as handle:
        input_seqs = fasta.read_seqs(handle.readlines())
    with open(sys.argv[2]) as handle:
        input_seqs2 = fasta.read_seqs(handle.readlines())

    foreground = get_quorums(input_seqs, 10)
    background = get_quorums(input_seqs2, 10)

    N = len(input_seqs) + len(input_seqs2)
    # Independent of the motif, so evaluate it once rather than per key.
    term3 = choose(N, len(input_seqs))

    for i in foreground:
        fg_hits = len(foreground[i])
        bg_hits = len(background[i])
        term1 = choose(bg_hits, fg_hits)
        term2 = choose(N - bg_hits, len(input_seqs) - 1)
        p = (float(term1) * float(term2)) / term3
        if 0 < p <= 0.0001:
            # Same space-separated text as the Python 2 print statement;
            # this spelling also parses on Python 3.
            print("%s %s %s %s" % (i, fg_hits, bg_hits, p))
#! /usr/bin/env python
"""Print the length of each entry fasta.read_seqs returns for sys.argv[1]."""
import fasta
import sys

# The context manager closes the file as soon as its lines have been read;
# the original left the handle open.
with open(sys.argv[1], "r") as handle:
    data = fasta.read_seqs(handle.readlines())

# Parenthesised form prints the same list on Python 2 and also parses on
# Python 3.
print([len(seq) for seq in data])
#! /usr/bin/env python
"""Print the length of each entry fasta.read_seqs returns for sys.argv[1]."""
import fasta
import sys

# The context manager closes the file as soon as its lines have been read;
# the original left the handle open.
with open(sys.argv[1], 'r') as handle:
    data = fasta.read_seqs(handle.readlines())

# Parenthesised form prints the same list on Python 2 and also parses on
# Python 3.
print([len(seq) for seq in data])
return ntok // ktok else: return 0 def get_quorums(seqs, mlen): """ add seq id_no to a set use explicit counter to create seq_no """ quorum = defaultdict(int) for seq in seqs: for n in range(len(seq) - mlen): quorum[seq[n:n + mlen]] += 1 return quorum input_seqs = fasta.read_seqs(open(sys.argv[1]).readlines()) input_seqs2 = fasta.read_seqs(open(sys.argv[2]).readlines()) foreground = get_quorums(input_seqs, 10) background = get_quorums(input_seqs2, 10) N = len(input_seqs) + len(input_seqs2) res_motifs = [] for i in foreground: term1 = choose(len(background[i]), len(foreground[i])) term2 = choose((N - len(background[i])), len(input_seqs)-1) term3 = choose(N, len(input_seqs)) p = (float(term1) * float(term2)) / term3 if 0 < p <= 0.0001: res_motifs.append(i + '\t' + str(foreground[i]) + '\t' + str(background[i]) + '\t' + str(p))