-
Notifications
You must be signed in to change notification settings - Fork 4
/
metamer_helpers.py
274 lines (225 loc) · 10.7 KB
/
metamer_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
"""
Contains functions that are used for image loading and metamer generation.
This code was used for the following paper:
Metamers of neural networks reveal divergence from human perceptual systems
Jenelle Feather, Alex Durango, Ray Gonzalez, Josh McDermott
In Advances in Neural Information Processing Systems (2019)
PDF: https://papers.nips.cc/paper/9198-metamers-of-neural-networks-reveal-divergence-from-human-perceptual-systems.pdf
"""
import scipy.io.wavfile as wav
import pickle
import numpy as np
import scipy
from scipy.misc import imread, imresize
import resampy
import tensorflow as tf
def use_audio_path_specified_audio(wav_path, wav_word,
rms_normalize=None,
SR=20000):
"""
Loads an example wav specified by wav_path
Inputs:
wav_path (string) : filepath to the audio to load
wav_word (string) : label for the audio in wav_path
rms_normalize (float) : the rms value to set the audio to
SR (int) : sampling rate of the desired audio. The file at
wav_path will be resampled to this value
Output:
audio_dict (dictionary) : a dictionary containing the loaded
audio and preprocessing parameters
"""
metamer_word_encodings = pickle.load(open('assets/metamer_word_encodings.pckl', 'rb'))
word_to_int = metamer_word_encodings['word_to_word_idx']
print("Loading: %s"%wav_path)
SR_loaded, wav_f = scipy.io.wavfile.read(wav_path)
if SR_loaded != SR:
wav_f = resampy.resample(wav_f, SR_loaded, SR)
SR_loaded = SR
if rms_normalize is not None:
wav_f = wav_f - np.mean(wav_f.ravel())
wav_f = wav_f/(np.sqrt(np.mean(wav_f.ravel()**2)))*rms_normalize
rms = rms_normalize
else:
rms = np.sqrt(np.mean(wav_f.ravel()**2))
audio_dict={}
audio_dict['wav'] = wav_f
audio_dict['SR'] = SR
audio_dict['word_int'] = word_to_int[wav_word]
audio_dict['word'] = wav_word
audio_dict['rms'] = rms
audio_dict['filename'] = wav_path
audio_dict['filename_short'] = wav_path.split('/')[-1]
audio_dict['correct_response'] = wav_word
return audio_dict
def image_center_crop(input_img, im_shape=224):
"""
Returns a square portion of the input image, rescaled to im_shape
Inputs:
input_img (np array) : an image stored as [H,W,C]
im_shape (int) : the square shape to crop the image
Outputs:
input_img (np array) : The resized numpy array [im_shape, im_shape, C]
"""
image_shape_hw = input_img.shape[0:2]
smallest_dim = min(image_shape_hw)
cropped_img = input_img[int((image_shape_hw[0]/2-smallest_dim/2)):int((image_shape_hw[0]/2+smallest_dim/2)),
int((image_shape_hw[1]/2-smallest_dim/2)):int((image_shape_hw[1]/2+smallest_dim/2)), :]
# Now that it is square, resize it
cropped_img = imresize(cropped_img, (im_shape, im_shape))
return cropped_img
def use_image_path_specified_image(image_path, image_class=None, im_shape=224):
"""
Loads and image from a specified path and performs preprocessing.
Inputs:
image_path (string) : filepath to an image
image_class (string) : the class name for the image
im_shape (int) : the height of the image (square)
Outputs:
image_dict (dictionary) : contains the input image and preprocessing info
"""
print("Loading: %s"%image_path)
input_img = scipy.misc.imread(image_path, mode='RGB')
input_img = image_center_crop(input_img, im_shape=im_shape)
image_dict = {}
image_dict['image'] = input_img
image_dict['shape'] = im_shape
image_dict['filename'] = image_path
image_dict['filename_short'] = image_path.split('/')[-1]
image_dict['correct_response'] = image_class
image_dict['max_value_image_set'] = 255
image_dict['min_value_image_set'] = 0
return image_dict
def _make_pink_noise(T,rms_normalization=False):
"""
Makes a segment of pink noise length T and returns a numpy array with the values
Parameters
----------
T : int
length of the pink noise to generate
rms_normalization : float
normalization factor for the pink noise, ie the rms of a test signal. default no normalization.
Returns
-------
pink_noise : numpy array
numpy array containing pink noise of length T
rms_pint : float
the rms of the pink noise
"""
uneven = T%2
X = np.random.randn(T//2+1+uneven) + 1j * np.random.randn(T//2+1+uneven)
S = np.sqrt(np.arange(len(X))+1.)
pink_noise = (np.fft.irfft(X/S)).real
if uneven:
pink_noise = pink_noise[:-1]
rms_pink = np.sqrt(np.mean(pink_noise**2))
if rms_normalization: # basic normalization of pink noise
pink_noise = (rms_normalization/rms_pink)*pink_noise
rms_pink = np.sqrt(np.mean(pink_noise**2))
return pink_noise, rms_pink
def make_initialization_op_pink_audio(input_tensor, input_noise_shape,
audio_scaling=0.001, rms_normalize=False):
"""
Makes a tensorflow operation to reassign the input audio noise, using pink noise.
This function is particularly helpful when generating multiple metamers from
the same random seed without rebuilding the tensorflow graph.
Inputs:
input_tensor (tensorflow tensor) : the variable tensor that will be optimized
during metamer generation
input_noise_shape (float) : the length of the input audio
audio_scaling (float) : multiplicative value to directly scale the
initialized pink noise. Usually set less than 1, as a way to scale the noise.
rms_normalization (float) : whether to generate the pink noise at a specific
rms value. Normally set to match the rms of the input audio to the network.
Outputs:
input_tensor (tensorflow tensor) : the same as input to the function.
input_noise_assign_op (tensorflow op) : when called with sess.run will
reinitialize input_tensor to the noise value.
"""
pink_noise, rmspink = _make_pink_noise(input_noise_shape,
rms_normalization=rms_normalize)
noise_initialization = (np.expand_dims(pink_noise,0)-np.mean(pink_noise))*audio_scaling
input_noise_assign_op = tf.assign(input_tensor, noise_initialization)
return input_tensor, input_noise_assign_op
def make_initialization_op_random_image(input_tensor, max_image, min_image,
scaling_factor=0.1, im_shape=224):
"""
Generates a random noise image to use as an initialization for the
metamer variable input during the optimization.
This function is particularly useful when generating multiple metamers from the
same random seed without rebuilding the tensorflow graph.
Inputs:
input_tensor (tensorflow tensor) : the variable tensor that will be optimized
during metamer generation
max_image (float) : the maximum value that the variable can be, used to
center the noise
min_image (float) : the minimum value that the varibale can be, used to
center the noise
scaling_factor (float) : how much to scale the input
Outputs:
input_tensor (tensorflow tensor) : the same as input to the function.
input_noise_assign_op (tensorflow op) : when called with sess.run will
reinitialize input_tensor to the noise value.
"""
# Make an op that reassins the input tensor to noise
if type(im_shape)!=list:
im_shape = [im_shape, im_shape, 3]
noise = np.random.random([1, im_shape[0], im_shape[1], im_shape[2]])*scaling_factor + (max_image-min_image)/2
input_noise_assign_op = tf.assign(input_tensor, noise)
return input_tensor, input_noise_assign_op
def run_optimization(loss, sess, input_tensor, iterations_adam, log_loss_every_num,
starting_learning_rate_adam=0.001,
additional_output_tensors=None,
adam_exponential_decay=0.95):
"""
Sets up the optimizer for the metamer generation and runs the optimization.
Inputs:
loss (tensorflow tensor) : the loss to minimize to generate the output
sess (tensorflow session) : the tensorflow session where the
optimization will occur
input_tensor (tensorflow tensor) : the variable tensor that will be optimized
during metamer generation
iterations_adam (int) : the number of iterations to run the adam optimizer
log_loss_every_num (int) : will print the loss after each of these iterations
and save the synthetic variable to a dictionary
starting_learning_rate_adam (float) : learning rate for adam optimizer
additional_output_tensors (list of tensorflow tensors) : If not None, these
tensors will be evaluated every log_loss_every_num iterations and saved to
the track_output dictionary
adam_exponential_decay (float) : Sets the exponential decay for the adam optimizer
Outputs:
total_loss (list) : tracks the loss during the optimization
track_ites (list) : logs the iteration where we saved loss values
track_output (dictionary) : saves evaluated tensors during the optmization.
Always contains the tensor that is being optimized, and can include additional
tensors if `additional_output_tensors` is specified.
"""
if additional_output_tensors is not None:
additional_output_tensors['optimized_tensor'] = input_tensor
# set up the optimizers--adam has a decaying learning rate
step = tf.Variable(0, trainable=False)
rate = tf.train.exponential_decay(starting_learning_rate_adam, step, 1000, adam_exponential_decay)
optm = tf.train.AdamOptimizer(rate).minimize(loss, var_list=[input_tensor], global_step=step)
# set up some values that we will track with each iteration
total_loss = np.full(iterations_adam//log_loss_every_num+1,np.nan)
track_iters = np.full(iterations_adam//log_loss_every_num+1,np.nan)
track_output = {}
# initialize any variables that were uninitialized before (ie the adam optimize variables)
uninitialized = sess.run(tf.report_uninitialized_variables())
all_variables = tf.global_variables()
init_op = tf.variables_initializer([var for var in all_variables if any([var_name.decode('utf-8') in var.name for var_name in uninitialized.tolist()])])
sess.run(init_op)
# go through the adam optimization
for i in range(iterations_adam+1):
if i % log_loss_every_num == 0 or i==iterations_adam:
total_loss[i//log_loss_every_num] = sess.run(loss)
track_iters[i//log_loss_every_num] = i
# TODO: check the R^2 values here?
if i % (log_loss_every_num) == 0:
if additional_output_tensors is not None:
track_output[i] = sess.run(additional_output_tensors)
else:
track_output[i] = sess.run(input_tensor)
print("[%d/%d], loss=%f" % (i, iterations_adam, total_loss[i//log_loss_every_num])) # this seems okay.
if i != iterations_adam: # log the loss for the last iteration but don't run the optimization
sess.run(optm)
return total_loss, track_iters, track_output