/
te_mut_info.py
157 lines (137 loc) · 5.95 KB
/
te_mut_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
from optparse import OptionParser
import ggplot
##################################################################################
# te_mut_info.py
#
# This script inputs an MSA file and a BED file with scores. The output is the
# mutual information between each consensus column of the MSA and the scores.
#
# To Do:
#  - Currently, scores are discretized before computing MI. It would be good to
#    calculate MI for continuous scores.
##################################################################################
def main():
    """Relate MSA consensus columns to sequence scores via mutual information.

    Reads a BED file whose 5th column is a score and an MSA FASTA file,
    determines the consensus columns (either from the Dfam '>Consensus'
    entry or by nucleotide frequency), maps every aligned sequence to one
    nucleotide feature per consensus column plus its score, and passes the
    resulting table to an R script that computes/plots mutual information.
    """
    usage = 'usage:%prog [options] <bed_file> <msa_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='consensus_pct', default=0.5, type='float', help='Required proportion of columns with a valid nt to consider it a consensus column [Default: %default]')
    parser.add_option('-d', dest='dfam_consensus', action='store_true', help='Pass the option if you want to use Consensus as defined by Dfam')
    parser.add_option('-o', dest='output_pre', type='string', help='Prefix of the output files')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide both the BED file and MSA file. Check %s' % usage)
    else:
        bed_file = args[0]
        msa_fasta_file = args[1]

    ##################################################
    # hash scores
    ##################################################
    # Key "chrom:start-end" must match the headers used in the MSA FASTA;
    # the score is taken from BED column 5 (index 4).
    seq_scores = {}
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            header = a[0] + ':' + a[1] + '-' + a[2]
            seq_scores[header] = float(a[4])

    ##################################################
    # define consensus columns
    ##################################################
    if options.dfam_consensus:
        # Dfam marks consensus positions with 'x' in the '>Consensus' entry.
        consensus_sequence = _read_fasta(msa_fasta_file).pop('>Consensus')
        consensus_columns = [i for i, nt in enumerate(consensus_sequence) if nt == 'x']
    else:
        consensus_columns = define_consensus(msa_fasta_file, options.consensus_pct)

    ##################################################
    # map sequences to feature vectors
    ##################################################
    # df_mi maps 'Score' and each 1-based consensus position to lists that
    # are parallel across sequences.
    df_mi = {'Score': []}
    for i in range(len(consensus_columns)):
        df_mi[i + 1] = []

    for header, seq in _read_fasta(msa_fasta_file).items():
        header = header[1:]  # drop the '>' prefix
        if header == 'Consensus':
            # skip the Dfam consensus entry; it has no score
            continue
        df_mi['Score'].append(seq_scores[header])
        for i, seq_i in enumerate(consensus_columns):
            df_mi[i + 1].append(seq[seq_i].upper())

    # NOTE(review): `tempura` is never imported in this file, so this call
    # raises NameError as written -- confirm the missing `import tempura`.
    ggplot.plot('%s/te_mut_info.r' % tempura.r_dir, df_mi, [options.output_pre])


def _read_fasta(fasta_file):
    """Parse a FASTA file into an ordered {'>header': sequence} dict."""
    seqs = {}
    header = None
    with open(fasta_file) as fasta_in:
        for line in fasta_in:
            if line[0] == '>':
                header = line.rstrip()
                seqs[header] = ''
            else:
                seqs[header] += line.rstrip()
    return seqs
################################################################################
# define_consensus
#
# Input
#  msa_fasta_file: FASTA file of aligned sequences (a '>Consensus' entry,
#                  if present, is ignored).
#  consensus_pct: Float proportion above which we consider a column to be
#                 consensus.
#
# Output
#  consensus_columns: List of consensus column indexes (0-based).
################################################################################
def define_consensus(msa_fasta_file, consensus_pct):
    """Identify the consensus columns of a multiple sequence alignment.

    A column is consensus when more than `consensus_pct` of the sequences
    carry a valid nucleotide (A/C/G/T, case-insensitive) at that position.
    A '>Consensus' entry (Dfam) is excluded from the tally.

    Args:
        msa_fasta_file: path to the MSA FASTA file.
        consensus_pct: float proportion above which a column is consensus.

    Returns:
        List of consensus column indexes (0-based).
    """
    valid_nts = {'A', 'C', 'G', 'T'}
    column_counts = []
    seq_count = 0

    def tally(seq):
        # Count valid nucleotides per column, growing the counts list to
        # the longest sequence seen so far.
        if len(seq) > len(column_counts):
            column_counts.extend([0] * (len(seq) - len(column_counts)))
        for i, nt in enumerate(seq):
            if nt.upper() in valid_nts:
                column_counts[i] += 1

    header = ''
    seq = ''
    with open(msa_fasta_file) as fasta_in:
        for line in fasta_in:
            if line[0] == '>':
                if header and header != 'Consensus':  # avoid DFAM consensus
                    seq_count += 1
                    tally(seq)
                header = line[1:].rstrip()
                seq = ''
            else:
                seq += line.rstrip()
    # process the final sequence (no trailing header line follows it)
    if header and header != 'Consensus':
        seq_count += 1
        tally(seq)

    return [i for i, count in enumerate(column_counts)
            if count / float(seq_count) > consensus_pct]
##################################################################################
# main()
#
# Script entry point: run main() only when executed directly, not on import.
##################################################################################
if __name__ == '__main__':
    main()