-
Notifications
You must be signed in to change notification settings - Fork 0
/
ditalini.py
143 lines (118 loc) · 6.06 KB
/
ditalini.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
'''
Runs some finer statistical analysis of MSAs. The initial version is meant to give a value
for the weighted amount of sequence material between characters in the consensus to represent
the variability of those regions. Consensus characters with no gaps between represent a
level of variability no greater than that defined by the consensus threshold.
'''
from parameter import ConsensusStatsCommandParser
from buffer import XMLBuildReader, status_message
import pairwise
from msa import MultipleSequenceDriver, ConsensusFilterFactory
from sequence import NeuriteSequence
version = 0.1
def calculate_conservation(msa,consensus):
    '''
    Compute the conservation at, and the variability before, each consensus character.

    Every column of the alignment is visited in order. A column where the gapped
    consensus holds '-' adds to a running variability sum: each sequence that has a
    character in that column contributes 1/(its ungapped length). A column where the
    consensus holds a character stores the running sum as the variability preceding
    that character, resets the sum, and records the fraction of sequences that have
    a character in the column.

    :param msa: object providing get_alignments() (mapping of name -> gapped sequence
        string) and composite (its length sets the number of columns visited)
    :param consensus: object providing raw_consensus, indexed column by column
    :return: (conservation, variability_sums); both lists hold one entry per consensus
        character. Variability accumulated after the last consensus character is
        not reported.
    '''
    raw_consensus = consensus.raw_consensus
    alignments = msa.get_alignments()
    # Weight of a single character of each sequence: the reciprocal of its ungapped length.
    # A sequence made only of gaps never has a character in any column, so its weight is
    # never read; storing 0.0 simply avoids the division by zero.
    contributions = {}
    for name, alignment in alignments.items():
        ungapped_length = len(alignment.replace('-',''))
        contributions[name] = 1/ungapped_length if ungapped_length else 0.0
    current_var_sum = 0
    variability_sums = []
    conservation = []
    ## Calculate conservation at and variability before each character
    # Go through each position in the alignment (based on the composite or gapped consensus)
    for position in range(len(msa.composite)):
        if raw_consensus[position] == '-':
            # Gap in the consensus: every sequence with a character here adds its weight
            for name, alignment in alignments.items():
                if alignment[position] != '-':
                    current_var_sum += contributions[name]
        else:
            # Consensus character: store the variability sum preceding it.
            # If the position within the ungapped consensus is needed, it equals the
            # length of variability_sums before this append.
            variability_sums.append(current_var_sum)
            current_var_sum = 0
            # Conservation level: fraction of sequences with a character (not a gap) here
            present = sum(1 for alignment in alignments.values() if alignment[position] != '-')
            conservation.append(present/len(alignments))
    return conservation,variability_sums
#TODO - A stack is going to be required for this. Look at Java code to recreate (shouldn't be too hard)
def generate_consensus_newick(msa,consensus,filename):
    '''
    Build a newick-style string from a consensus and write it to a file.

    The consensus string is read left to right and the output is assembled by
    prepending one token per character, so the last consensus character ends up
    first in the result. Each token is
    <node_string><character>-<conservation>:<variability>, with both numbers rounded
    to three decimals, and a ';' is appended at the very end.

    a_stack holds (flag, count) pairs and drives the bracket placement:
      'A' pushes (0,0) and emits ')'.
      'C' emits ',:1)' and increments the count of the top entry.
      'T' emits '(:1,:1)' and pops entries, prefixing count*'(' for each one, until it
          pops an entry whose flag is 0; that entry additionally gets a leading ','
          and is replaced on the stack by (1,1).
    # NOTE(review): presumably A/C/T are the branch/continuation/terminal codes of
    # the sequence alphabet - confirm against the sequence module.

    :param msa: alignment object, passed through to calculate_conservation
    :param consensus: object providing consensus (the string walked here) and whatever
        calculate_conservation reads from it
    :param filename: path of the file the newick string is written to
    :return: the generated newick string
    '''
    newick_string = ""
    cons_str = consensus.consensus
    conservation,variability = calculate_conservation(msa,consensus)
    a_stack = [(1,0)]
    print(cons_str)
    for position, symbol in enumerate(cons_str):
        if symbol == 'A':
            a_stack.append((0,0))
            node_string = ')'
        elif symbol == 'C':
            node_string = ',:1)'
            a_val = a_stack.pop()
            a_stack.append((a_val[0],a_val[1]+1))
        elif symbol == 'T':
            node_string = '(:1,:1)'
            not_done = 1
            while not_done and len(a_stack) > 0:
                a_val = a_stack.pop()
                if a_val[0] == 0:
                    # Reached an entry pushed by 'A': open its brackets, mark it as
                    # visited by pushing (1,1) back, and stop unwinding.
                    node_string = ','+a_val[1]*'('+node_string
                    a_stack.append((1,1))
                    not_done = 0
                else:
                    # Already-visited entry: open its brackets and keep unwinding.
                    node_string = a_val[1]*'('+node_string
        # Prepend, so the string grows from its tail towards its head.
        newick_string = node_string + symbol + '-' + str(round(conservation[position],3)) + ':' + str(round(variability[position],3)) + newick_string
    newick_string += ';'
    status_message('Generating newick string','OK')
    # Write to the path handed in by the caller rather than to a module-level global,
    # and let the context manager close the handle even if the write fails.
    with open(filename,'w') as handle:
        handle.write(newick_string)
    return newick_string
def generate_score_and_conserved_chars_file(msa,filename):
    '''
    Write score and conserved-column statistics per threshold to a comma-separated file.

    One line is written per entry of msa.score_by_threshold(), holding three values
    formatted to three decimals: elements [0] and [1] of that entry, followed by
    element [1] of the entry at the same index in
    msa.percent_columns_conserved_by_threshold().

    :param msa: object providing score_by_threshold() and
        percent_columns_conserved_by_threshold(), both returning indexable sequences
        of indexable rows
    :param filename: path of the file to write
    :raises IndexError: if the conserved-columns sequence is shorter than the scores
    '''
    score_by_threshold = msa.score_by_threshold()
    percent_cols_conserved = msa.percent_columns_conserved_by_threshold()
    # Open the path that was passed in (and that the message below reports), and let
    # the context manager close the handle even if formatting or writing fails.
    with open(filename,'w') as handle:
        for i, score_row in enumerate(score_by_threshold):
            handle.write("%.3f,%.3f,%.3f\n" % (score_row[0],score_row[1],percent_cols_conserved[i][1]))
    print("Wrote scores by threshold to "+filename)
if __name__ == '__main__':
    # Script entry point: parse the command line, load the MSA build, then run each
    # analysis whose option was supplied. `args` is deliberately left at module level.
    try:
        args = ConsensusStatsCommandParser().parse_args()
        # ConsensusStatsArgumentValidator(args) # test all arguments are correct
        print('capellini - v.' + str(version) + '\n=============')
        msa = XMLBuildReader(args['build']).parse()
        # Remains None unless args['threshold'] or args['k'] provides a value below, so
        # the newbuild step can tell "no threshold chosen" apart from a real one.
        threshold = None
        if args['threshold']:
            threshold = args['threshold']
            msa.add_consensus(threshold,msa.build_consensus(float(threshold)))
            print('Consensus at threshold ' + threshold + ': ' + msa.get_consensus(threshold).consensus)
        if args['newick']:
            if args['threshold']:
                # NOTE(review): the consensus was stored above under the raw argument
                # value but is looked up here under float(...) - confirm that
                # get_consensus treats both keys alike.
                generate_consensus_newick(msa,msa.get_consensus(float(args['threshold'])),args['newick'])
            else:
                # No threshold requested: use the first consensus stored in the build
                generate_consensus_newick(msa,list(msa.consensuses.values())[0],args['newick'])
        if args['scores']:
            generate_score_and_conserved_chars_file(msa,args['scores'])
        if args['k']:
            threshold,k = msa.find_conservation_boundary()
            print("At threshold %.2f, average conservation and proportion of conserved characters are both %.2f%%" % (threshold, k*100))
        if args['newbuild']:
            # consensus_object = msa.build_consensus(args['threshold'],args['type'])
            # Write MSA and consensus to file
            if threshold is not None:
                selected_consensus = msa.get_consensus(threshold)
            else:
                # Neither option above set a threshold; fall back to the first stored
                # consensus, as the newick branch does, instead of failing on an
                # unbound name.
                selected_consensus = list(msa.consensuses.values())[0]
            consensus_fact = ConsensusFilterFactory(msa,selected_consensus)
            consensus_fact.write(fname=args['newbuild'])
        status_message('Consensus statistics computation complete ', 'OK')
    except (IOError, KeyboardInterrupt, IndexError) as e:
        print(str(e))