/
Sampling_IPD.py
87 lines (70 loc) · 3.57 KB
/
Sampling_IPD.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Created by Lacey Westphal and Peter Sauvey, August 24 2015
## This script randomly samples IPD for a timepoint/RefPos and then feeds those into Wilcoxon
## It will run 100 times and output the "insig" RefPos for each test into a dataframe
##Usage: In terminal: python script.py "control file" "timepoint file" "output file" (must be in order)
##### DO NOT RUN THIS MULTIPLE TIMES WITHOUT DELETING THE OUTPUT FILE. IT APPENDS!! #####
from scipy import stats
import math
from sys import argv
import numpy
import qvalue
import pandas
from Queue import Queue
from threading import Thread
numpy.set_printoptions(threshold='nan')
timepoint = "/Users/lwestpha/Desktop/Lacey/Bioinformatics/8hr_orderedGATC.txt"
control= "/Users/lwestpha/Desktop/Lacey/Bioinformatics/dam_orderedGATC.txt"
ref = []
def generate_IPDs(file):
IPD=[] #set-up empty list of IPD
current_ref = 620.0
#start current_ref at the beginning
with open(file, 'r') as f:
f.next() #skip first str (header)
for row in f:
line = row.split() #split file into rows
if current_ref == float(line[1]): #if current_ref is equal to value of RefPos
IPD.append(float(line[2])) #append to list called IPD
else: #if not equal, yield the current_ref and IPD list to function
yield current_ref, IPD
del IPD[:] #delete contents of IPD lis
IPD.append(float(line[2])) #but since we have moved to next line, attach that to new IPD
current_ref = float(line[1]) #and make sure current_ref value is next RefPos value
yield current_ref, IPD #this yields last IPD list when eof is reached
def generate_DF(ref):
ref_arr = numpy.asarray(ref)
qval = qvalue.estimate(ref_arr[:,[1]])
q_val = numpy.asarray(qval)
df_all = numpy.append(ref_arr, q_val, 1)
insig_ref = df_all[df_all[:,2] > 0.2]
print df_all
def generate_stats():
tipd = generate_IPDs(timepoint) #create generator object
cipd = generate_IPDs(control) #create generator object (so you can use in for-loop)
for refc, ipdc in cipd: #this will iterate through generator(control)
reft, ipdt = tipd.next() #generate next value from generator (timepoint)
while reft != refc: #if refpos are not equal
if refc > reft: #skip to next refpos in timepoint if control ref is higher
print "skipping %r in %s" %(reft, timepoint)
reft, ipdt = tipd.next()
else: #skip to next refpos in control if timepoint ref is higher
print "skipping %r in %s" %(refc, control)
refc, ipdc = cipd.next()
if refc==reft:
#if they are equal, pass values to Wilcoxon function
for a in Wilcoxon(ipdc, ipdt):
ref.append([refc,a]) #this will return a 2d list with a row with the refc and every pval
generate_DF(ref)
def Wilcoxon(ipdc, ipdt): #calculates z_stat and pvalue from Wilcoxon test, appends to a file
pv_list = []
if len(ipdc) > len(ipdt):
sample = len(ipdt)
else:
sample = len(ipdc)
for r in range(100):
sampled_timepoint = numpy.random.choice(ipdt, sample, replace=True)
sampled_control = numpy.random.choice(ipdc, sample, replace=True)
z_stat, p_val = stats.ranksums(sampled_control, sampled_timepoint)
pv_list.append(p_val)
return pv_list
generate_stats()