forked from davek44/CLIP-Seq
/
mutation_compare.py
executable file
·173 lines (144 loc) · 5.77 KB
/
mutation_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python
from optparse import OptionParser
import math, pdb
import pybedtools, pysam
from rpy2.robjects.packages import importr
import rpy2.robjects as ro
import rpy2.robjects.lib.ggplot2 as ggplot2
grdevices = importr('grDevices')
################################################################################
# mutation_compare.py
#
# Print a table and plot a heatmap of enrichments/depletions of mutation types
# in a pair of datasets.
#
# Two normalization options:
# 1. # sequenced bp
# 2. # mutations
################################################################################
################################################################################
# main
################################################################################
def main():
usage = 'usage: %prog [options] <mut1 file> <mut2 file>'
parser = OptionParser(usage)
parser.add_option('-m', dest='mut_norm', action='store_true', default=False, help='Normalize by # mutations (as opposed to sequenced bp) [Default: %default]')
parser.add_option('-o', dest='output_pdf', default='mut_cmp.pdf', help='Output pdf file for heatmap [Default: %default]')
parser.add_option('-r', dest='raw', action='store_true', default=False, help='Use raw mutation counts (as opposed to normalized for ACGT content) [Default: %default]')
(options,args) = parser.parse_args()
if len(args) != 2:
parser.error(usage)
else:
mut1_file = args[0]
mut2_file = args[1]
mutation_profile1, seq_bp1 = parse_mutations(mut1_file, options.raw)
mutation_profile2, seq_bp2 = parse_mutations(mut2_file, options.raw)
relative_mutation_profile = compute_relative_profile(mutation_profile1, seq_bp1, mutation_profile2, seq_bp2)
print_table(relative_mutation_profile)
# make plotting data structures
nts = ['_','A','C','G','T']
nts1 = []
nts2 = []
rel = []
for nt1 in nts:
for nt2 in nts:
nts1.append(nt1)
nts2.append(nt2)
rel.append(relative_mutation_profile[(nt1,nt2)])
nts1_r = ro.StrVector(nts1)
nts2_r = ro.StrVector(nts2)
rel_r = ro.FloatVector(rel)
df = ro.DataFrame({'nt1':nts1_r, 'nt2':nts2_r, 'rel':rel_r})
# plot
'''
gp = ggplot2.ggplot(df) + \
ggplot2.aes_string(x='nt2', y='nt1', fill='rel') + \
ggplot2.geom_tile() + \
ggplot2.scale_x_discrete(mut2_file, limits=nts) + \
ggplot2.scale_y_discrete(mut1_file, limits=nts) + \
ggplot2.scale_fill_gradient('Enrichment 1/2')
'''
gp = ggplot2.ggplot(df) + \
ggplot2.aes_string(x='nt2', y='nt1', fill='rel') + \
ggplot2.geom_tile() + \
ggplot2.scale_x_discrete('Read') + \
ggplot2.scale_y_discrete('Reference') + \
ggplot2.scale_fill_gradient2('log2 enrichment', low='darkblue', mid='white', high='darkred')
# save to file
grdevices.pdf(file=options.output_pdf)
gp.plot()
grdevices.dev_off()
################################################################################
# compute_relative_profile
#
# Compute the relative enrichment/depletion between two mutation profiles
# after further normalizing each.
################################################################################
def compute_relative_profile(mut_prof1, norm1, mut_prof2, norm2):
rel_mut_prof = {}
for mut_key in mut_prof1.keys():
if mut_prof1[mut_key] == 0 and mut_prof2[mut_key] == 0:
rel_mut_prof[mut_key] = 0
else:
rel_mut_prof[mut_key] = math.log((mut_prof1[mut_key]/float(norm1)) / (mut_prof2[mut_key]/float(norm2))) / math.log(2)
return rel_mut_prof
################################################################################
# parse_mutations
#
# Parse my mutation_profile.py output.
################################################################################
def parse_mutations(mut_file, raw):
nts = ['_','A','C','G','T']
mut_open = open(mut_file)
# acgt content
acgt_content = {}
line = mut_open.readline()
line = mut_open.readline()
while line.rstrip():
a = line.split()
acgt_content[a[0]] = int(a[1])
line = mut_open.readline()
# raw mutation counts
raw_mutation_profile = {}
line = mut_open.readline()
line = mut_open.readline()
line = mut_open.readline()
for i in range(len(nts)):
a = line.split()
for j in range(len(nts)):
raw_mutation_profile[(nts[i],nts[j])] = int(a[j+1])
line = mut_open.readline()
line = mut_open.readline()
# norm mutation counts
norm_mutation_profile = {}
line = mut_open.readline()
line = mut_open.readline()
line = mut_open.readline()
for i in range(len(nts)):
a = line.split()
for j in range(len(nts)):
norm_mutation_profile[(nts[i],nts[j])] = int(a[j+1])
line = mut_open.readline()
line = mut_open.readline()
mut_open.close()
if raw:
mutation_profile = raw_mutation_profile
else:
mutation_profile = norm_mutation_profile
return mutation_profile, sum(acgt_content.values())
################################################################################
# print_table
################################################################################
def print_table(mutation_profile):
nts = ['_','A','C','G','T']
print ' %6s %6s %6s %6s %6s' % tuple(nts)
for nt1 in nts:
row_counts = [mutation_profile.get((nt1,nt2),0) for nt2 in nts]
print '%1s %6.3f %6.3f %6.3f %6.3f %6.3f' % tuple([nt1]+row_counts)
print ''
################################################################################
# __main__
################################################################################
if __name__ == '__main__':
main()
#pdb.runcall(main)