/
bayes_net.py
executable file
·247 lines (190 loc) · 9.94 KB
/
bayes_net.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import os
import sqlite3
import math
import utils
from utils import trim_song, sized_observation_from_index, parse_song
min_smoothed_prob = 1.0 / 128
min_smoothed_prob_log = math.log(min_smoothed_prob)
def _cache(cache, func, *args):
if cache is not None:
try:
return cache[args]
except:
value = func(*args)
cache[args] = value
return value
else:
return func(*args)
def score_transition(song_chunk, new_frame, smooth=True, cache=None):
numerator_smooth, denom_smooth = (1, 128) if smooth else (0, 0)
frame_1_chan_1, frame_1_chan_2, frame_1_chan_3 = [channel[-1] for channel in song_chunk]
frame_2_chan_1, frame_2_chan_2, frame_2_chan_3 = new_frame
frame_1_chan_2_count = _cache(cache, cross_obs_count, frame_1_chan_2, 2)
frame_1_chan_3_count = _cache(cache, cross_obs_count, frame_1_chan_3, 3)
frame_2_chan_1_count = _cache(cache, inner_obs_count, frame_2_chan_1)
f1_c1_to_f2_c1_num = _cache(cache, cross_obs_count, frame_1_chan_1, 1, frame_2_chan_1, 1)
if f1_c1_to_f2_c1_num is not None:
f1_c1_to_f2_c1_prob = math.log(float(f1_c1_to_f2_c1_num + numerator_smooth) / (_cache(cache, cross_obs_count, frame_1_chan_1, 1) + denom_smooth), 10)
else:
f1_c1_to_f2_c1_prob = min_smoothed_prob_log
f1_c2_to_f2_c1_num = _cache(cache, cross_obs_count, frame_1_chan_2, 2, frame_2_chan_1, 1)
if f1_c2_to_f2_c1_num is not None:
f1_c2_to_f2_c1_prob = math.log(float(f1_c2_to_f2_c1_num + numerator_smooth) / (frame_1_chan_2_count + denom_smooth), 10)
else:
f1_c2_to_f2_c1_prob = min_smoothed_prob_log
f1_c3_to_f2_c1_num = _cache(cache, cross_obs_count, frame_1_chan_3, 3, frame_2_chan_1, 1)
if f1_c3_to_f2_c1_num is not None:
f1_c3_to_f2_c1_prob = math.log(float(f1_c3_to_f2_c1_num + numerator_smooth) / (frame_1_chan_3_count + denom_smooth), 10)
else:
f1_c3_to_f2_c1_prob = min_smoothed_prob_log
f1_c2_to_f2_c2_num = _cache(cache, cross_obs_count, frame_1_chan_2, 2, frame_2_chan_2, 2)
if f1_c2_to_f2_c2_num is not None:
f1_c2_to_f2_c2_prob = math.log(float(f1_c2_to_f2_c2_num + numerator_smooth) / (frame_1_chan_2_count + denom_smooth), 10)
else:
f1_c2_to_f2_c2_prob = min_smoothed_prob_log
f1_c3_to_f2_c3_num = _cache(cache, cross_obs_count, frame_1_chan_3, 3, frame_2_chan_3, 3)
if f1_c3_to_f2_c3_num is not None:
f1_c3_to_f2_c3_prob = math.log(float(f1_c3_to_f2_c3_num + numerator_smooth) / (frame_1_chan_3_count + denom_smooth), 10)
else:
f1_c3_to_f2_c3_prob = min_smoothed_prob_log
f2_c1_to_f2_c2_num = _cache(cache, inner_obs_count, frame_2_chan_1, frame_2_chan_2, 2)
if f2_c1_to_f2_c2_num is not None:
f2_c1_to_f2_c2_prob = math.log(float(f2_c1_to_f2_c2_num + numerator_smooth) / (frame_2_chan_1_count + denom_smooth), 10)
else:
f2_c1_to_f2_c2_prob = min_smoothed_prob_log
f2_c1_to_f2_c3_num = _cache(cache, inner_obs_count, frame_2_chan_1, frame_2_chan_3, 3)
if f2_c1_to_f2_c3_num is not None:
f2_c1_to_f2_c3_prob = math.log(float(f2_c1_to_f2_c3_num + numerator_smooth) / (frame_2_chan_1_count + denom_smooth), 10)
else:
f2_c1_to_f2_c3_prob = min_smoothed_prob_log
return f1_c1_to_f2_c1_prob + f1_c2_to_f2_c1_prob + f1_c3_to_f2_c1_prob + f1_c2_to_f2_c2_prob + f1_c3_to_f2_c3_prob + f2_c1_to_f2_c2_prob + f2_c1_to_f2_c3_prob
def score(data, cache=None, obs=1000, smooth=False, check_len=True):
song = data if isinstance(data, list) else utils.parse_song(data)
song = utils.trim_song(song, length=2500)
song_len = len(song[0])
# I don't know of any reliable way to normalize the log-likelyhood
# for different length probabilities, so we're just going to limit
# things to a fixed number of observations to normalize the lenght
# of observations per file, intestead of normalizing the probabilities
# of different length songs
if check_len and song_len < obs:
print " !! %s is too short (%d)" % (data, song_len)
return None
numerator_smooth, denom_smooth = (1, 128) if smooth else (0, 0)
scores = []
for x in range(0, obs):
frame = utils.sized_observation_from_index(song, start=x, length=2)
frame_1, frame_2 = frame.split(".")
scores.append(score_transition(frame_1.split("|"), frame_2.split("|"), smooth))
return sum(scores)
#
# Data Store Related Methods
#
def get_connection():
if not hasattr(get_connection, '_conn'):
get_connection._conn = sqlite3.connect(os.path.join('data', 'bayes_training_counts.sqlite3'))
# Do a test check, just so we can create the tables if needed
cur = get_connection._conn.cursor()
try:
cur.execute('SELECT COUNT(*) FROM cross_obs_counts WHERE obs_1 = "DOES NOT EXIST"')
except sqlite3.OperationalError:
setup()
return get_connection._conn
def setup():
conn = get_connection()
cur = conn.cursor()
cur.execute('CREATE TABLE training_files (filename text)')
cur.execute('CREATE UNIQUE INDEX filename ON training_files (filename)')
cur.execute('CREATE TABLE inner_obs_counts (obs_1 int, obs_2 int, chan_2 int, count int)')
cur.execute('CREATE INDEX obs_1 ON inner_obs_counts (obs_1)')
cur.execute('CREATE UNIQUE INDEX obs_1_obs_2_chan_2 ON inner_obs_counts (obs_1, obs_2, chan_2)')
cur.execute('CREATE TABLE cross_obs_counts (obs_1 int, chan_1 int, obs_2 int, chan_2 int, count int)')
cur.execute('CREATE INDEX obs_1_chan_1 ON cross_obs_counts (obs_1, chan_1)')
cur.execute('CREATE UNIQUE INDEX obs_1_chan_1_obs_2_chan_2 ON cross_obs_counts (obs_1, chan_1, obs_2, chan_2)')
commit()
def has_file_been_recorded(filename):
conn = get_connection()
cur = conn.cursor()
cur.execute('SELECT COUNT(*) AS count FROM training_files WHERE filename = ?', (filename,))
row = cur.fetchone()
return row[0] > 0
def record_file(filename):
conn = get_connection()
cur = conn.cursor()
cur.execute('INSERT INTO training_files (filename) VALUES (?)', (filename,))
def inner_obs_count(obs_1, obs_2=None, chan_2=None):
conn = get_connection()
cur = conn.cursor()
if obs_2 and chan_2:
cur.execute('SELECT count FROM inner_obs_counts WHERE obs_1 = ? AND obs_2 = ? AND chan_2 = ? LIMIT 1', (obs_1, obs_2, chan_2))
else:
cur.execute('SELECT SUM(count) FROM inner_obs_counts WHERE obs_1 = ?', (obs_1,))
row = cur.fetchone()
return row[0] if row else None
def record_inner_obs_count(obs_1, obs_2, chan_2):
conn = get_connection()
cur = conn.cursor()
current_count = inner_obs_count(obs_1, obs_2, chan_2)
if current_count is None:
cur.execute('INSERT INTO inner_obs_counts (obs_1, obs_2, chan_2, count) VALUES (?, ?, ?, 1)', (obs_1, obs_2, chan_2))
else:
cur.execute('UPDATE inner_obs_counts SET count = count + 1 WHERE obs_1 = ? AND obs_2 = ? AND chan_2 = ?', (obs_1, obs_2, chan_2))
def cross_obs_count(obs_1, chan_1, obs_2=None, chan_2=None):
conn = get_connection()
cur = conn.cursor()
if obs_2 and chan_2:
cur.execute('SELECT count FROM cross_obs_counts WHERE obs_1 = ? AND chan_1 = ? AND obs_2 = ? AND chan_2 = ? LIMIT 1', (obs_1, chan_1, obs_2, chan_2))
else:
cur.execute('SELECT SUM(count) AS count FROM cross_obs_counts WHERE obs_1 = ? AND chan_1 = ?', (obs_1, chan_1))
row = cur.fetchone()
return row[0] if row else None
def record_cross_obs_count(obs_1, chan_1, obs_2, chan_2):
conn = get_connection()
cur = conn.cursor()
current_count = cross_obs_count(obs_1, chan_1, obs_2, chan_2)
if current_count is None:
cur.execute('INSERT INTO cross_obs_counts (obs_1, chan_1, obs_2, chan_2, count) VALUES (?, ?, ?, ?, 1)', (obs_1, chan_1, obs_2, chan_2,))
else:
cur.execute('UPDATE cross_obs_counts SET count = count + 1 WHERE obs_1 = ? AND chan_1 = ? AND obs_2 = ? AND chan_2 = ?', (obs_1, chan_1, obs_2, chan_2,))
def commit():
conn = get_connection()
conn.commit()
def train_on_files(files):
for a_file in files:
if has_file_been_recorded(a_file):
print "Not recalculating counts for %s" % (a_file,)
continue
else:
print "Beginning to calculate counts for %s" % (a_file,)
record_file(a_file)
song = parse_song(a_file)
song = trim_song(song, length=2500)
song_len = len(song[0])
if song_len < 100:
print "Song is too short for consideration. May be a sound effect or something trivial. Ignoring."
continue
for x in range(0, song_len):
frame = sized_observation_from_index(song, start=x, length=2)
frame_1, frame_2 = frame.split(".")
frame_1_chan_1, frame_1_chan_2, frame_1_chan_3 = frame_1.split("|")
frame_2_chan_1, frame_2_chan_2, frame_2_chan_3 = frame_2.split("|")
# We treat channel one as the most important one, so first
# record the counts of all channels in frame 1 transitioning
# into channel 1 in frame 2
record_cross_obs_count(frame_1_chan_1, 1, frame_2_chan_1, 1)
record_cross_obs_count(frame_1_chan_2, 2, frame_2_chan_1, 1)
record_cross_obs_count(frame_1_chan_3, 3, frame_2_chan_1, 1)
# Now also record the remaining channels trainsitioning to
# themselves in the next channel
record_cross_obs_count(frame_1_chan_2, 2, frame_2_chan_2, 2)
record_cross_obs_count(frame_1_chan_3, 3, frame_2_chan_3, 3)
# Last, record the transitioning from the assumed "melody" in the
# new frame to the assumed "background" channels in the new
# frame
record_inner_obs_count(frame_2_chan_1, frame_2_chan_2, 2)
record_inner_obs_count(frame_2_chan_1, frame_2_chan_3, 3)
commit()
print "finished calculating counts from %s" % (a_file,)
if __name__ == "__main__":
from song_collections import training_songs
train_on_files(training_songs)