/
homPolDist.py
69 lines (56 loc) · 2 KB
/
homPolDist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#! /usr/bin/env python
import sys
import hompol
import math
from Bio.Seq import translate
from BioExt import untranslate, randgene
from BioExt.stats import bbinom
from scipy.stats import rv_discrete
# A script essentially. Takes the input file containing a sequence we're looking to model
# and an output gene length
def writeBothToFile(seqFileName, newSeqLength, randFileName, transFileName):
randOutFile = open(randFileName, "w")
randOutFile.write("> Randomly generated gene of length %d \n" % int(newSeqLength))
transOutFile = open(transFileName, "w")
transOutFile.write("> AA translation and untranslation of randomly generated gene of length %d\n" % int(newSeqLength))
sequence = generateRandomGene(seqFileName, newSeqLength)
randOutFile.write(sequence)
seqPerm = untranslate(translate(sequence))
transOutFile.write(seqPerm)
randOutFile.close()
transOutFile.close()
# Generate the random gene using a combination of my homopol counting module
# and Lance's everything else.
def generateRandomGene(seqFileName, newSeqLength):
f = open(seqFileName, 'r')
characterList = []
for line in f:
if line[0] != '>':
characterList += list(line.rstrip('\n'))
homopolCounts = hompol.getCounts(characterList)[1:]
n = len(homopolCounts) - 1
b = bbinom(*bbinom.fit(homopolCounts))
randseq = randgene(hompol.roundTo(newSeqLength, 3), b.ppf)
return randseq
def fastaParser(readFileName):
reads = []
readFile = open(readFileName, 'r')
tempRead = []
for line in readFile:
if line[0] == ">":
if tempRead != []:
reads.append(tempRead)
tempRead = []
else:
tempRead += list(line.rstrip('\n'))
if tempRead != []:
reads.append(tempRead)
return reads
def stripLowerCase(read):
stripped = []
for char in read:
if char.isupper():
stripped.append(char)
return stripped
if __name__ == "__main__":
writeBothToFile(*sys.argv[1:])