/
jens_playground_backup.py
157 lines (132 loc) · 7.16 KB
/
jens_playground_backup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import cPickle
import numpy
from blocks.filter import VariableFilter
from fuel.datasets.hdf5 import H5PYDataset
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from scipy.stats import pearsonr
from theano import function
from custom_blocks import PadAndAddMasks
from network import NetworkType, Network
from util import StateComputer, mark_seq_len, mark_word_boundaries
# Print floating point values with 8 digits of precision and in fixed-point
# (non-scientific) notation, so the printed arrays are easy to compare by eye.
numpy.set_printoptions(precision=8, suppress=True)
# TODO get these into util.py, maybe in a prettier form
def mark_seq_len_batch(seq_batch, mask_batch):
    """Mark every unmasked time step with its position inside its own sequence.

    seq_batch: batch of sequences; a sequence of length n contributes the
        markers 0, 1, ..., n - 1.
    mask_batch: array with one entry per marker; only markers whose mask
        entry equals 1 are kept.

    Returns a 1-D array holding the kept markers, one sequence after the other.
    """
    keep = mask_batch.flatten(order="C") == 1
    positions = numpy.array([numpy.arange(len(seq)) for seq in seq_batch])
    # row-major flattening lines the markers up with the flattened mask;
    # indexing with the boolean mask then drops the padding
    return positions.flatten(order="C")[keep]
def mark_word_boundaries_batch(seq_batch, mask_batch):
    """Compute word-boundary markers for a batch and drop the masked entries.

    seq_batch: batch of sequences of character indices; every index is
        translated to its character through the module-level map_ind_2_chr.
    mask_batch: array with one entry per marker; only markers whose mask
        entry equals 1 are kept.

    Returns a 1-D array with the kept markers of all sequences.
    """
    per_sequence = []
    for seq in seq_batch:
        chars = [map_ind_2_chr[ind] for ind in seq]
        # presumably yields one marker per character -- TODO confirm in util.py
        per_sequence.append(mark_word_boundaries(chars))
    flat_markers = numpy.array(per_sequence).flatten(order="C")
    # throw away padding
    return flat_markers[mask_batch.flatten(order="C") == 1]
def mark_letter(seq_batch, mask_batch, letter):
    """Mark with 1 every unmasked position whose character equals letter.

    seq_batch: batch of sequences of character indices; every index is
        translated to its character through the module-level map_ind_2_chr.
    mask_batch: array with one entry per position; only positions whose mask
        entry equals 1 are kept.
    letter: the character to look for.

    Returns a 1-D array with one 0/1 entry per kept position.
    """
    matches = []
    for seq in seq_batch:
        matches.append([map_ind_2_chr[ind] == letter for ind in seq])
    # multiplying by 1 turns the boolean matches into numbers
    flat_matches = (1 * numpy.array(matches)).flatten(order="C")
    # throw away padding
    keep = mask_batch.flatten(order="C") == 1
    return flat_matches[keep]
def _load_pickle(path):
    """Unpickle and return the object stored in the file at path."""
    # NOTE: unpickling can execute arbitrary code, so only load trusted files.
    # The file is opened in the default mode, exactly as before; the context
    # manager makes sure the handle is closed again instead of being leaked.
    with open(path) as pickle_file:
        return cPickle.load(pickle_file)


# lookup tables between characters and their indices
map_chr_2_ind = _load_pickle("char_to_ind.pkl")
map_ind_2_chr = _load_pickle("ind_to_char.pkl")

# build the LSTM network with one input dimension per entry of map_ind_2_chr and
# set its parameters from the given file (presumably a previously trained model)
lstm_net = Network(NetworkType.LSTM, input_dim=len(map_ind_2_chr), hidden_dims=[512, 512, 512])
lstm_net.set_parameters('seqgen_lstm_512_512_512.pkl')
# Fetch the parameter values of the cost model and print what iterating over
# them yields; this was used to look at the connections from the cells to the
# outputs.
params = lstm_net.cost_model.get_parameter_values()
for param in params:
    print(param)
# This section deals with prediction probabilities. It is currently disabled:
# the code is wrapped in a bare string literal, which is evaluated and thrown
# away. Remove the surrounding triple quotes to run it.
"""
readouts = VariableFilter(theano_name="readout_readout_output_0")(lstm_net.cost_model.variables)[0]
char_probs = lstm_net.generator.readout.emitter.probs(readouts)
prob_function = function([lstm_net.x, lstm_net.mask], char_probs)
lord_original = "3:15 And the LORD came."
lord = [map_chr_2_ind[char] for char in lord_original]
print lord
zaza = prob_function([lord], numpy.ones((1, len(lord)), dtype="int8"))[:, 0, :]
print zaza
print zaza.shape
for (ey, row) in enumerate(zaza):
print "PREDICTION PROBABILITIES FOR POSITION", ey, "LETTER", repr(lord_original[ey])
sorted_thing = [(prob, ind) for (ind, prob) in enumerate(row)]
sorted_thing.sort(reverse=True)
for (prob, ind) in sorted_thing:
print repr(map_ind_2_chr[ind]), ":", prob
print "\n"
"""
# Compile a function that maps a batch of sequences and its mask to the readout
# scores summed over axis 2, i.e. the overall "sum of scores" at a given time step.
readouts = VariableFilter(theano_name="readout_readout_output_0")(
    lstm_net.cost_model.variables)[0]
score_function = function(inputs=[lstm_net.x, lstm_net.mask],
                          outputs=readouts.sum(axis=2))
# The rest of the playground revolves around correlations between the network
# states and markers derived from the input text. =)
sc = StateComputer(lstm_net.cost_model, map_chr_2_ind)
# one correlation value per state dimension for every state variable;
# the zeros are overwritten at the very end
correlation_dict = {
    name: numpy.zeros(lstm_net.hidden_dims[0], dtype=float)
    for name in sc.state_var_names
}
# Validation data to run over: read the "valid" split into memory and walk
# through it sequentially in batches of 128, wrapped in PadAndAddMasks.
valid_data = H5PYDataset("bible.hdf5", which_sets=("valid",), load_in_memory=True)
valid_scheme = SequentialScheme(valid_data.num_examples, batch_size=128)
valid_stream = DataStream.default_stream(dataset=valid_data, iteration_scheme=valid_scheme)
data_stream = PadAndAddMasks(valid_stream, produces_examples=False)
iterator = data_stream.get_epoch_iterator()
# Storage for the "supersequences" concatenated over all sequences: a marker
# vector and, per state variable, a matrix with one column per state dimension.
# Everything starts out empty and grows while the batches are processed.
super_marker = numpy.empty(shape=(0,))
state_super_dict = {
    name: numpy.empty(shape=(0, lstm_net.hidden_dims[0]))
    for name in sc.state_var_names
}
# Storage for the connections from states to output (softmax). Keying them by
# state variable name later allows an easy link between each layer's states and
# the corresponding output connection.
connection_dict = {}
standard_name = "/sequencegenerator/readout/merge/transform_states"
# the last character of a state variable name selects the parameter suffix;
# anything other than "1" or "2" falls back to the plain ".W" parameter
suffix_by_last_char = {"2": "#2.W", "1": "#1.W"}
for name in sc.state_var_names:
    name_here = standard_name + suffix_by_last_char.get(name[-1], ".W")
    # keep only the column belonging to the character "O"
    connection_dict[name] = params[name_here][:, map_chr_2_ind["O"]]
# if this is true, each state will be aligned with the character (or event derived from it) that it is used to *predict*
# if false, each state will be aligned with the character that was most recently read
prediction_alignment = False

# Run over all batches of the epoch and extend the "supersequences".
# The per-batch pieces are gathered in lists and joined once after the loop;
# stacking onto the growing arrays in every iteration would copy everything
# collected so far again and again.
marker_chunks = [super_marker]
state_chunks = {state_type: [states] for state_type, states in state_super_dict.items()}

# A plain for loop ends cleanly when the iterator is exhausted, so no
# StopIteration handler is needed (and none can hide errors from the body).
for seq_batch, mask_batch in iterator:
    if not prediction_alignment:
        # "remove" last element of each sequence by modifying mask; the row sums are
        # cast to int so they are valid indices even if the mask has a float dtype
        last_valid = mask_batch.sum(axis=1).astype(int) - 1
        mask_batch[numpy.arange(mask_batch.shape[0]), last_valid] = 0
    state_batch_dict = sc.read_sequence_batch(seq_batch, mask_batch)
    # reshape mask: we only need to do this once per batch, not again for each state_type
    # mask is in shape batch_size x seq_len, so NOT transposed, so it is flattened in C order
    keep = mask_batch.flatten(order="C") == 1
    # get marker (very preliminary...)
    marker_chunks.append(mark_letter(seq_batch, mask_batch, letter="L"))
    # TESTING total score thingy -- should be 2D, seq_len x batch_size
    #overall_scores = score_function(seq_batch, mask_batch)
    for state_type in state_batch_dict:
        state_batch = state_batch_dict[state_type]
        if not prediction_alignment:
            # "throw away" initial state by rolling array backwards -- hacky, but sidesteps problems with needing
            # different masks for the sequences (the modified one further above) and for states (the "regular" one)
            state_batch = numpy.roll(state_batch, shift=-1, axis=0)
        #state_batch *= connection_dict[state_type][None, None, :]
        #state_batch /= overall_scores[:, :, None]
        # note: order of reshape is Fortran because states are "transposed" into seq_len x batch_size x dim
        state_reshaped = state_batch.reshape((state_batch.shape[0]*state_batch.shape[1], state_batch.shape[2]),
                                             order="F")
        # throw away padding
        state_chunks[state_type].append(state_reshaped[keep, :])
    print("MADE IT THROUGH BATCH")

# join the collected pieces into the final supersequences
super_marker = numpy.concatenate([numpy.ravel(chunk) for chunk in marker_chunks])
for state_type in state_chunks:
    state_super_dict[state_type] = numpy.vstack(state_chunks[state_type])
# Correlate every state dimension with the marker over the super long
# sequences, then report the result for each state variable.
for state_name in correlation_dict:
    correlations = correlation_dict[state_name]
    all_states = state_super_dict[state_name]
    for dim in xrange(correlations.shape[0]):
        # pearsonr returns (coefficient, p-value); only the coefficient is stored
        correlations[dim] = pearsonr(all_states[:, dim], super_marker)[0]
    print(state_name)
    print(correlations)
    print("LARGEST: %s %s" % (max(correlations), min(correlations)))
    print("\n\n")