def setup_generate(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    n_batch, n_time = chord_roots.shape

    specs = [lstmstack.prepare_sample_scan(
                start_pos=T.alloc(np.array(encoding.STARTING_POSITION, np.int32), (n_batch)),
                start_out=T.tile(encoding.initial_encoded_form(), (n_batch, 1)),
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                deterministic_dropout=True)
             for lstmstack, encoding in zip(self.lstmstacks, self.encodings)]

    updates, all_chosen, all_probs, indiv_probs = helper_generate_from_spec(
        specs, self.lstmstacks, self.encodings, self.srng,
        n_batch, n_time, self.bounds, self.normalize_artic_only)

    self.generate_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=all_chosen,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))

    self.generate_visualize_fun = theano.function(
        inputs=[chord_roots, chord_types],
        updates=updates,
        outputs=[all_chosen, all_probs] + indiv_probs,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
def test_NanGuardMode():
    # Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
    # intentionally. A working implementation should be able to capture all
    # the abnormalties.
    x = tt.matrix()
    w = theano.shared(np.random.randn(5, 7).astype(theano.config.floatX))
    y = tt.dot(x, w)

    fun = theano.function([x], y,
                          mode=NanGuardMode(nan_is_error=True, inf_is_error=True))
    a = np.random.randn(3, 5).astype(theano.config.floatX)
    infa = np.tile((np.asarray(100.0) ** 1000000).astype(theano.config.floatX), (3, 5))
    nana = np.tile(np.asarray(np.nan).astype(theano.config.floatX), (3, 5))
    biga = np.tile(np.asarray(1e20).astype(theano.config.floatX), (3, 5))

    fun(a)  # normal values

    # Temporarily silence logger
    _logger = logging.getLogger("theano.compile.nanguardmode")
    try:
        _logger.propagate = False
        with pytest.raises(AssertionError):
            fun(infa)  # INFs
        with pytest.raises(AssertionError):
            fun(nana)  # NANs
        with pytest.raises(AssertionError):
            fun(biga)  # big values
    finally:
        _logger.propagate = True

    # slices
    a = np.random.randn(3, 4, 5).astype(theano.config.floatX)
    infa = np.tile((np.asarray(100.0) ** 1000000).astype(theano.config.floatX), (3, 4, 5))
    nana = np.tile(np.asarray(np.nan).astype(theano.config.floatX), (3, 4, 5))
    biga = np.tile(np.asarray(1e20).astype(theano.config.floatX), (3, 4, 5))

    x = tt.tensor3()
    y = x[:, tt.arange(2), tt.arange(2), None]
    fun = theano.function([x], y,
                          mode=NanGuardMode(nan_is_error=True, inf_is_error=True))
    fun(a)  # normal values
    try:
        _logger.propagate = False
        with pytest.raises(AssertionError):
            fun(infa)  # INFs
        with pytest.raises(AssertionError):
            fun(nana)  # NANs
        with pytest.raises(AssertionError):
            fun(biga)  # big values
    finally:
        _logger.propagate = True
def test_NanGuardMode():
    """
    Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
    intentionally. A working implementation should be able to capture all
    the abnormalties.
    """
    x = T.matrix()
    w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
    y = T.dot(x, w)

    fun = theano.function(
        [x], y,
        mode=NanGuardMode(nan_is_error=True, inf_is_error=True)
    )
    a = numpy.random.randn(3, 5).astype(theano.config.floatX)
    infa = numpy.tile(
        (numpy.asarray(100.) ** 1000000).astype(theano.config.floatX), (3, 5))
    nana = numpy.tile(
        numpy.asarray(numpy.nan).astype(theano.config.floatX), (3, 5))
    biga = numpy.tile(
        numpy.asarray(1e20).astype(theano.config.floatX), (3, 5))

    fun(a)  # normal values

    # Temporarily silence logger
    _logger = logging.getLogger("theano.compile.nanguardmode")
    try:
        _logger.propagate = False
        assert_raises(AssertionError, fun, infa)  # INFs
        assert_raises(AssertionError, fun, nana)  # NANs
        assert_raises(AssertionError, fun, biga)  # big values
    finally:
        _logger.propagate = True
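# The two test variants above exercise the same contract. A minimal standalone
# sketch of that contract (assumptions: a working Theano install; the names
# below are illustrative and do not come from any snippet in this collection):
import numpy as np
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

x = T.vector()
f = theano.function([x], x * 2,
                    mode=NanGuardMode(nan_is_error=True,
                                      inf_is_error=True,
                                      big_is_error=True))

f(np.ones(3, dtype=theano.config.floatX))  # normal values pass through
try:
    f(np.array([np.nan], dtype=theano.config.floatX))
except AssertionError:
    print("NanGuardMode flagged the NaN")  # the guard raises AssertionError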
def prepare_style(self, scale=1.0):
    """Called each phase of the optimization; processes the style image
    according to the scale, then runs it through the model to extract
    intermediate outputs (e.g. sem4_1) and turn them into patches.
    """
    style_image = skimage.transform.rescale(self.style_img_original, scale) * 255.0
    self.style_image = self.model.prepare_image(style_image)

    style_map = skimage.transform.rescale(
        self.style_map_original * args.semantic_weight, scale) * 255.0
    self.style_map = style_map.transpose((2, 0, 1))[np.newaxis].astype(np.float32)

    # Compile a function to run on the GPU to extract patches for all layers at once.
    extractor = theano.function(
        [self.model.tensor_img, self.model.tensor_map],
        self.extract_patches([self.model.tensor_outputs['sem' + l] for l in self.style_layers]),
        mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    result = extractor(self.style_image, self.style_map)

    # For each layer, we now have a set of patches and their magnitude.
    for layer, patches, norms in zip(self.style_layers, result[::2], result[1::2]):
        l = self.model.network['nn' + layer]
        l.N = theano.shared(norms)
        l.W.set_value(patches)
        l.num_filters = patches.shape[0]
        print(' - Style layer sem{}: {} patches in {:,}kb.'.format(
            layer, patches.shape[0], patches.size // 1000))
def get_fns(self, input_dim=123, p_learning_rate=0.01, d_learning_rate=0.0001,
            p=0.23928176569346055):
    x = T.matrix('X')
    y = T.vector('y')
    mlp, updates, cost, probs = self.primal_step(x, y, p_learning_rate, input_dim)
    train_fn = theano.function([x, y], [cost],
                               updates=updates,
                               mode=NanGuardMode(nan_is_error=True,
                                                 inf_is_error=True,
                                                 big_is_error=True))

    # Calculate Validation in batch_mode for speedup
    valid_th_fns = theano.function([x], probs)

    def valid_fn(x, y):
        probs = valid_th_fns(x)
        f_beta = self.get_cost(y, probs)
        return f_beta

    return train_fn, valid_fn
def test_nan_guard_mode():
    # Also test that abs uint* and bool have c code.
    for dtype in ["uint8", "int64", "bool"]:
        x = tensor.vector(dtype=dtype)
        y = x + 1
        mode = NanGuardMode(nan_is_error=True, optimizer=mode_with_gpu.optimizer)
        f = theano.function([x], y, mode=mode)
        d = np.asarray([23, 7]).astype(dtype)
        assert np.allclose(f(d), d + 1)
def prepare_style(self, scale=1.0):
    """Called each phase of the optimization; processes the style image
    according to the scale, then runs it through the model to extract
    intermediate outputs (e.g. sem4_1) and turn them into patches.
    """
    style_image = skimage.transform.rescale(self.style_img_original, scale) * 255.0
    self.style_image = self.model.prepare_image(style_image)

    style_map = skimage.transform.rescale(self.style_map_original, scale) * 255.0
    self.style_map = style_map.transpose((2, 0, 1))[np.newaxis].astype(np.float32)

    # Workaround for Issue #8. Not clear what this is caused by; NaN seems to happen
    # in a convolution node on some OSX installations.
    # https://github.com/alexjc/neural-doodle/issues/8
    if args.safe_mode:
        from theano.compile.nanguardmode import NanGuardMode
        flags = {'mode': NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)}
    else:
        flags = {}

    # Compile a function to run on the GPU to extract patches for all layers at once.
    required_layers = ['conv' + l for l in self.style_layers] + \
                      ['map' + l for l in self.style_layers]
    extractor = theano.function(
        [self.model.tensor_img, self.model.tensor_map],
        self.extract_patches([self.model.tensor_outputs[l] for l in required_layers]),
        **flags)
    result = extractor(self.style_image, self.style_map)

    # For each layer, build it from a set of patches and their magnitude.
    def build(layer, prefix, name, patches, norms):
        l = self.model.network[prefix + layer]
        l.N = theano.shared(norms)
        l.W.set_value(patches)
        l.num_filters = patches.shape[0]
        print(' - {} layer {}: {} patches in {:,}kb.'.format(
            name, layer, patches.shape[0], patches.size // 1000))

    if args.style_weight > 0.0:
        result_nn = result[:len(self.style_layers) * 2]
        for layer, *data in zip(self.style_layers, result_nn[::2], result_nn[1::2]):
            build(layer, 'nn', 'Style', *data)

    if args.semantic_weight > 0.0:
        result_mm = result[len(self.style_layers) * 2:]
        for layer, *data in zip(self.style_layers, result_mm[::2], result_mm[1::2]):
            build(layer, 'mm', 'Semantic', *data)
def rmsprop(lr, tparams, grads, inp, cost, opt_ret=None):
    """
    RMS prop optimizer

    :param lr:
    :param tparams:
    :param grads:
    :param inp:
    :param cost:
    :param opt_ret:
    :return f_grad_shared, f_update:
    """
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in iteritems(tparams)]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad' % k)
                     for k, p in iteritems(tparams)]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in iteritems(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, [cost],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile,
                                    mode=NanGuardMode(nan_is_error=True,
                                                      inf_is_error=True,
                                                      big_is_error=True))

    updir = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
             for k, p in iteritems(tparams)]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itervalues(tparams), updir_new)]

    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
def compile(inputs, outputs, *args, mode=None, **kwargs):
    """
    Use like `theano.function()`.
    TODO: Something useful with non-symbolic output?

    Parameters
    ----------
    ...
    mode:
        In addition to the values accepted by `theano.function`, also accepts
        a string to make it easier to use `NanGuardMode`. If a string, a
        `NanGuardMode` object is created; the string should contain comma
        separated values indicating against which values we want to guard.
        For example, with the string ``"nan,inf"``, a `NanGuardMode` object is
        created with the options
        ``NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)``.
    """
    if not any(core.is_theano_object(arg)
               for arg in itertools.chain([inputs, outputs], args, kwargs.values())):
        raise ValueError(
            "`shim.graph.function()` is undefined for non-symbolic outputs")

    if mode:
        from theano.compile.nanguardmode import NanGuardMode
        if isinstance(mode, NanGuardMode):
            kwargs['mode'] = mode
        elif isinstance(mode, str):
            nanguard = 'nan' in mode
            infguard = 'inf' in mode
            bigguard = 'big' in mode
            kwargs['mode'] = NanGuardMode(nan_is_error=nanguard,
                                          inf_is_error=infguard,
                                          big_is_error=bigguard)

    # Replace dict by OrderedDict to silence Theano warnings -- since 3.7,
    # dicts have guaranteed insertion order.
    if sys.version_info.major >= 3 and sys.version_info.minor >= 7:
        args = tuple(collections.OrderedDict(a) if type(a) is dict else a
                     for a in args)
        kwargs = {k: collections.OrderedDict(v) if type(v) is dict else v
                  for k, v in kwargs.items()}

    return core.theano.function(inputs, outputs, *args, **kwargs)
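# For reference, the string shorthand documented above maps onto a direct
# NanGuardMode construction. A minimal sketch in plain Theano (illustrative
# names; equivalent to passing mode="nan,inf" to the wrapper):
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

x = T.dvector()
f = theano.function([x], x ** 2,
                    mode=NanGuardMode(nan_is_error=True,
                                      inf_is_error=True,
                                      big_is_error=False))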
def buildvalidfun(self, model):
    self.tt.tick("compiling validation function")
    inps, out = self.autobuild_model(model, *self.traindata, _trainmode=False)
    if issequence(out):
        out = out[0]
    metrics, newinp = self.buildlosses(out, self.validators)
    inputs = newinp if newinp is not None else inps
    ret = None
    if len(metrics) > 0:
        ret = theano.function(inputs=[x.d for x in inputs] + [self.goldvar],
                              outputs=metrics,
                              mode=NanGuardMode(nan_is_error=True,
                                                inf_is_error=False,
                                                big_is_error=False))
    else:
        self.tt.msg("NO VALIDATION METRICS DEFINED, RETURNS NONE")
    self.tt.tock("validation function compiled")
    return ret
def test_NanGuardMode():
    """
    Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
    intentionally. A working implementation should be able to capture all
    the abnormalties.
    """
    x = T.matrix()
    w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
    y = T.dot(x, w)

    fun = theano.function([x], y,
                          mode=NanGuardMode(nan_is_error=True, inf_is_error=True))
    a = numpy.random.randn(3, 5).astype(theano.config.floatX)
    infa = numpy.tile(
        (numpy.asarray(100.) ** 1000000).astype(theano.config.floatX), (3, 5))
    nana = numpy.tile(
        numpy.asarray(numpy.nan).astype(theano.config.floatX), (3, 5))
    biga = numpy.tile(numpy.asarray(1e20).astype(theano.config.floatX), (3, 5))

    work = [False, False, False]

    fun(a)  # normal values
    try:
        fun(infa)  # INFs
    except AssertionError:
        work[0] = True
    try:
        fun(nana)  # NANs
    except AssertionError:
        work[1] = True
    try:
        fun(biga)  # big values
    except AssertionError:
        work[2] = True

    if not (work[0] and work[1] and work[2]):
        raise AssertionError("NanGuardMode not working.")
def setup_encode(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    # dimensions: (batch, time)
    relative_posns = [T.imatrix() for _ in self.encodings]
    # dimensions: (batch, time, output_data)
    encoded_melodies = [T.btensor3() for _ in self.encodings]
    n_batch, n_time = chord_roots.shape

    all_activations = []
    for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
            self.encodings, self.enc_lstmstacks, encoded_melodies, relative_posns):
        activations = enc_lstmstack.do_preprocess_scan(
            timestep=T.tile(T.arange(n_time), (n_batch, 1)),
            relative_position=relative_pos,
            cur_chord_type=chord_types,
            cur_chord_root=chord_roots,
            cur_input=encoded_melody,
            deterministic_dropout=True)
        all_activations.append(activations)

    reduced_activations = functools.reduce((lambda x, y: x + y), all_activations)
    strengths, vects = self.qman.get_strengths_and_vects(reduced_activations)

    self.encode_fun = theano.function(
        inputs=[chord_types, chord_roots] + relative_posns + encoded_melodies,
        outputs=[strengths, vects],
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
              if self.nanguard else None))
def buildvalidfun(self, model, batsize):
    self.tt.tick("validation - autobuilding")
    inps, outps = self.autobuild_model(model, *self.traindata,
                                       _trainmode=False, _batsize=batsize)
    assert (len(outps) == 1)
    outp = outps[0]
    self.tt.tock("validation - autobuilt")
    self.tt.tick("compiling validation function")
    metrics, newinp = self.buildlosses(outp, self.validators)
    inputs = newinp if newinp is not None else inps
    ret = None
    if len(metrics) > 0:
        ret = theano.function(inputs=[x.d for x in inputs] + [self.goldvar],
                              outputs=metrics,
                              mode=NanGuardMode(nan_is_error=True,
                                                inf_is_error=False,
                                                big_is_error=True))
    else:
        self.tt.msg("NO VALIDATION METRICS DEFINED, RETURNS NONE")
    self.tt.tock("validation function compiled")
    return ret
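# The two buildvalidfun variants above enable different subsets of checks
# (inf_is_error=False in both, big_is_error toggled). The three guards are
# independent; a minimal sketch guarding against NaN only (illustrative names,
# standard Theano imports assumed):
import numpy as np
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

x = T.vector()
f = theano.function([x], x + 0.,
                    mode=NanGuardMode(nan_is_error=True,
                                      inf_is_error=False,
                                      big_is_error=False))
f(np.array([1e20, np.inf], dtype=theano.config.floatX))  # passes: only NaN is treated as an error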
def __init__( self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam', opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99} ): """RBM constructor. Defines the parameters of the model along with basic operations for inferring hidden from visible (and vice-versa), as well as for performing CD updates. """ self.numpy_rng = np.random.RandomState(1234) self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30)) self.create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX) # save config n_batch = opt_params.get('nb') self.n_hidden = 100 self.n_visible = n_chan*n_dim*n_dim # size of visible layer self.n_batch = n_batch self.n_qk = 10 # num of components in MoB used of q self.n_mc = 30 # num of monte carlo samples from each MoB component self.n_dim = n_dim self.n_out = n_out self.n_superbatch = n_superbatch self.alg = opt_alg # set up general RBM methods AbstractRBM.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) # create updates alpha = T.scalar(dtype=theano.config.floatX) # learning rate # save config self.n_class = 2 self.n_dim = n_dim self.n_out = n_out self.n_components = self.n_qk self.n_samples = self.n_mc self.n_tot_samples = self.n_samples*self.n_components # create input variables D, idx1, idx2 = self.create_inputs() # create model self.network = self.create_model() # create objectives loglik, plik = self.create_objectives(D) # create gradients dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi = self.create_gradients() grads = dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi # create updates uL_Theta, uL_Phi, avg_updates, avg_Theta_updates \ = self.create_updates(grads, None, alpha, opt_alg, opt_params) # logF_avg, Z_avg = self.create_llik_estimate(D) mode = NanGuardMode(nan_is_error=True, inf_is_error=False, big_is_error=False) mode = None common_update1 = OrderedDict(avg_updates.items() + uL_Phi.items()) self.train_q = theano.function([idx1, idx2], [loglik, plik], updates=common_update1, mode=mode, givens={D: self.train_set_x[idx1:idx2]}) common_update2 = OrderedDict(avg_Theta_updates.items() + uL_Theta.items()) self.train_p = theano.function([idx1, idx2], [loglik, plik], updates=common_update2, mode=mode, on_unused_input='warn', givens={D: self.train_set_x[idx1:idx2]}) # self.llik = theano.function([D], logF_avg - T.log(Z_avg), mode=mode) common_update3 = OrderedDict(common_update1.items() + common_update2.items()) self.train = theano.function([idx1, idx2], [loglik, plik], updates=common_update3, mode=mode, givens={D: self.train_set_x[idx1:idx2]})
def setup_train(self): # dimensions: (batch, time, 12) chord_types = T.btensor3() # dimensions: (batch, time) chord_roots = T.imatrix() # dimensions: (batch, time) relative_posns = [T.imatrix() for _ in self.encodings] # dimesions: (batch, time, output_data) encoded_melodies = [T.btensor3() for _ in self.encodings] # dimesions: (batch, time) correct_notes = T.imatrix() n_batch, n_time = chord_roots.shape def _build(det_dropout): all_out_probs = [] for encoding, lstmstack, encoded_melody, relative_pos in zip(self.encodings, self.lstmstacks, encoded_melodies, relative_posns): activations = lstmstack.do_preprocess_scan( timestep=T.tile(T.arange(n_time), (n_batch,1)) , relative_position=relative_pos, cur_chord_type=chord_types, cur_chord_root=chord_roots, last_output=T.concatenate([T.tile(encoding.initial_encoded_form(), (n_batch,1,1)), encoded_melody[:,:-1,:] ], 1), deterministic_dropout=det_dropout) out_probs = encoding.decode_to_probs(activations, relative_pos, self.bounds.lowbound, self.bounds.highbound) all_out_probs.append(out_probs) reduced_out_probs = functools.reduce((lambda x,y: x*y), all_out_probs) if self.normalize_artic_only: non_artic_probs = reduced_out_probs[:,:,:2] artic_probs = reduced_out_probs[:,:,2:] non_artic_sum = T.sum(non_artic_probs, 2, keepdims=True) artic_sum = T.sum(artic_probs, 2, keepdims=True) norm_artic_probs = artic_probs*(1-non_artic_sum)/artic_sum norm_out_probs = T.concatenate([non_artic_probs, norm_artic_probs], 2) else: normsum = T.sum(reduced_out_probs, 2, keepdims=True) normsum = T.maximum(normsum, constants.EPSILON) norm_out_probs = reduced_out_probs/normsum return Encoding.compute_loss(norm_out_probs, correct_notes, True) train_loss, train_info = _build(False) updates = Adam(train_loss, self.get_optimize_params(), lr=self.learning_rate_var) eval_loss, eval_info = _build(True) self.loss_info_keys = list(train_info.keys()) self.update_fun = theano.function( inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies, outputs=[train_loss]+list(train_info.values()), updates=updates, allow_input_downcast=True, on_unused_input='ignore', mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None)) self.eval_fun = theano.function( inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies, outputs=[eval_loss]+list(eval_info.values()), allow_input_downcast=True, on_unused_input='ignore', mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))
args = parser.parse_args()

if args.save != default_save and not args.overwrite and os.path.isfile(
        'interim/%s_model.pkl' % args.save):
    raise Exception(
        'A model with this name was already saved. Provide the --overwrite flag (-o) when trying to --save over an existing model.'
    )

import numpy as np
import scipy
import theano
from theano import tensor as T
from six.moves import cPickle
import sys
sys.setrecursionlimit(100000)
from theano.compile.nanguardmode import NanGuardMode
ngm = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)
from sklearn.svm import SVC

from vcd import image_iter, models, util

# Configuration
cfg = {
    # General
    'patch_shape': (33, 33),
    'aug_noise_std': 0.05,
    'train_test_split': 0.8,  # Proportion of dataset to use for the train+validation set if the test set is enabled.
    'train_valid_split': 0.75,  # Proportion of train+validation set to use for the training set.
    # Note that the final training set size is (label_count * train_test_split * train_valid_split).
def main(): # Al inici es recuperen els valor del fitxer de configuració trainingSize, validationSize, batchSize, testDataSize, nLayer, num_epochs, getFromFile = getConfigData( ) printAndSave("Loading data...", dt=False) # Segons l'escollit al fitxer de configuració les metadades # es generen o obtenen d'un fitxer if (getFromFile): printAndSave("Getting metadata from file...", dt=False) getMetadata = getMetadataFromFile else: printAndSave("Calculating metadata...", dt=False) getMetadata = calculateMetadata # S'obtenen les dades tant de les coleccions d'entrada com de les etiquetes per validar train, trainTargets, val, valTargets, test, \ testTargets, metadata, colsToRemove = \ getTrainingTestLists( traiSize = trainingSize, valSize = validationSize, testSize = testDataSize, getMetadata = getMetadata) # Es preparen les variables de theano per a # l'entrada i les etiquetes que s'utilitzen # per validar els reusltats input_var = T.matrix('inputs') target_var = T.ivector('targets') # Es crea la FNN network = buid_MLP(input_var=input_var, depth=nLayer, drop_input=.2, drop_hidden=.5, nCols=len(metadata)) # S'obté la predicció a partir de la sorida de la MLP prediction = lasagne.layers.get_output(network) # Expressió per la perdua loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) loss = loss.mean() # Es creen les expressions d'update per modificar els # parametres en cada pas del training params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.momentum(loss, params, learning_rate=0.01, momentum=0.9) # S'obté la predicció a partir de la sorida de la MLP per la validació i testing, # a diferència de l'anterior aquí es desactiven les capes de dropout passant a # través de tota la xarxa amb el mode deterministic a True test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy( test_prediction, target_var) test_loss = test_loss.mean() # Expressió per la precissió de la classificació es realitza a # partir de la predicció obtinguda al a sortida del MLP test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compilar la funcio executant un pas de training mitjancant un petit # paquet de dades, es retornara la perdua # S'activa el mode NanGuardMode per tal d'obtenir un error en cas de # nombres massa grans això val per comprobar la validesa en la # normalització de les dades train_fn = theano.function([input_var, target_var], loss, updates=updates, name="TrainingFunc", mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) # Es recuperarà la pèrdua i precissió, # s'utilitza tant en la validació com en el test val_fn = theano.function([input_var, target_var], [test_loss, test_acc], name="ValidationFunc") # S'inicialitza l'entrenament printAndSave("*" * 53, dt=False) printAndSave("Starting training...", dt=False) training_start_time = time.time() for epoch in range(num_epochs): start_time = time.time() # Per cada iteració es fa una execució completa de les dades d'entrenament train_err = 0 train_batches = 0 for batch in iterate_minibatches(train, trainTargets, batchSize, metadata, colsToRemove): inputs, targets = batch tmp = train_fn(inputs, targets) train_err += tmp train_batches += 1 # Validació de la iteració val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatches(val, valTargets, batchSize, metadata, colsToRemove): inputs, targets = batch err, acc = val_fn(inputs, targets) val_err += err 
val_acc += acc val_batches += 1 # Impressió de resultats de la iteració printAndSave("Epoch {} of {} took {:.3f}s".format( epoch + 1, num_epochs, time.time() - start_time), dt=False) printAndSave(" training loss:\t\t{:.6f}".format(train_err / train_batches), dt=False) printAndSave(" validation loss:\t\t{:.6f}".format(val_err / val_batches), dt=False) printAndSave(" validation accuracy:\t\t{:.2f} %".format( val_acc / val_batches * 100), dt=False) # ####################################### # Activar per calcular l'error i precisió # amb les dades de test en cada epoch # ####################################### # Es realitza el test # start_time = time.time() # test_err = 0 # test_acc = 0 # test_batches = 0 # for batch in iterate_minibatches(test,testTargets, batchSize, metadata, colsToRemove): # inputs, targets = batch # err, acc = val_fn(inputs, targets) # test_err += err # test_acc += acc # test_batches += 1 # # Impressió de resultats del test # printAndSave("Final results:",dt=False) # printAndSave(" test loss:\t\t\t{:.6f}".format(test_err / test_batches),dt=False) # printAndSave(" test accuracy:\t\t{:.2f} %".format( # test_acc / test_batches * 100),dt=False) # printAndSave("Tests in {}".format(time.time()-start_time),dt=False) printAndSave("Training in {}".format(time.time() - training_start_time), dt=False) # Es realitza el test start_time = time.time() test_err = 0 test_acc = 0 test_batches = 0 for batch in iterate_minibatches(test, testTargets, batchSize, metadata, colsToRemove): inputs, targets = batch err, acc = val_fn(inputs, targets) test_err += err test_acc += acc test_batches += 1 # Impressió de resultats del test printAndSave("Final results:", dt=False) printAndSave(" test loss:\t\t\t{:.6f}".format(test_err / test_batches), dt=False) printAndSave(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100), dt=False) printAndSave("Tests in {}".format(time.time() - start_time), dt=False)
def __init__(self,parameters=None): X = tensor.tensor4() Y = tensor.lvector() self.params = parameters if parameters == None: W1 = theano.shared(np.random.randn(32,3,5,5).astype(theano.config.floatX)*0.01) b1 = theano.shared(np.zeros(32,).astype(theano.config.floatX)) W2 = theano.shared(np.random.randn(64,32,5,5).astype(theano.config.floatX)*0.01) b2 = theano.shared(np.zeros(64,).astype(theano.config.floatX)) W3 = theano.shared(np.random.randn(128,64,5,5).astype(theano.config.floatX)*0.01) b3 = theano.shared(np.zeros(128,).astype(theano.config.floatX)) W5 = theano.shared(np.random.randn(28800,1084).astype(theano.config.floatX)*0.01) # b5 = theano.shared(np.zeros(64*9*9,)) W6 = theano.shared(np.random.randn(1084,2).astype(theano.config.floatX)*0.01) b6 = theano.shared(np.zeros(2,).astype(theano.config.floatX)) else: W1 = theano.shared(parameters["W1"]) b1 = theano.shared(parameters["b1"]) W2 = theano.shared(parameters["W2"]) b2 = theano.shared(parameters["b2"]) W3 = theano.shared(parameters["W3"]) b3 = theano.shared(parameters["b3"]) W5 = theano.shared(parameters["W5"]) W6 = theano.shared(parameters["W6"]) b6 = theano.shared(parameters["b6"]) layer_1 = conv2d(X,W1) layer_1_pool = pool_2d(layer_1,(2,2),ignore_border=True) layer_1_output = tensor.tanh(layer_1_pool+b1.dimshuffle('x', 0, 'x', 'x')) layer_2 = conv2d(layer_1_output, W2) layer_2_pool = pool_2d(layer_2,(2,2),ignore_border=True) layer_2_output = tensor.tanh(layer_2_pool+b2.dimshuffle('x', 0, 'x', 'x')) layer_3 = conv2d(layer_2_output, W3) layer_3_pool = pool_2d(layer_3,(2,2),ignore_border=True) layer_3_output = tensor.tanh(layer_3_pool+b3.dimshuffle('x', 0, 'x', 'x')) layer_4 = layer_3_output.flatten(2) layer_5 = tensor.dot(layer_4,W5) layer_5_output = layer_5.tanh() layer_6 = tensor.dot(layer_5_output, W6) + b6 #softmax instead of sigmoid. layer_6_output = softmax(layer_6) + 0.0000001 output = tensor.argmax(layer_6_output,axis=1) # cost = ((Y-layer_6_output)**2).sum() # Negative Log Likelihood cost = -tensor.mean(tensor.log(layer_6_output)[tensor.arange(Y.shape[0]), Y], dtype=theano.config.floatX) error = tensor.mean(tensor.neq(output, Y)) parameters = [W1,b1,W2,b2,W3,b3,W5,W6,b6] updates = self.GradientDescent(cost,parameters) params = {"W1": W1, "b1": b1, "W2": W2, "b2": b2, "W3": W3, "b3": b3, "W5": W5, "W6": W6, "b6": b6} self.parameters = theano.function([],params) mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True).excluding('local_elemwise_fusion','inplace') self.train = theano.function([X, Y], cost,updates=updates, mode=mode) self.test = theano.function([X, Y], error) self.predict = theano.function([X],output)
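# NanGuardMode is a regular compilation Mode, so graph optimizations can be
# excluded from it, as in the constructor call above. A minimal sketch of that
# pattern in isolation (the graph below is illustrative):
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
mode = mode.excluding('local_elemwise_fusion', 'inplace')  # drop these optimizations

x = T.matrix()
f = theano.function([x], T.tanh(x), mode=mode)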
def __init__(self, K, vocab_size, num_chars, W_init, S_init, nhidden, embed_dim, dropout, train_emb, sub_dim, use_feat, gating_fn, save_attn=False): self.nhidden = nhidden self.embed_dim = embed_dim self.dropout = dropout self.train_emb = train_emb self.sub_dim = sub_dim self.learning_rate = LEARNING_RATE self.num_chars = num_chars self.use_feat = use_feat self.save_attn = save_attn self.gating_fn = gating_fn self.use_subs = self.sub_dim != 0 if W_init is None: W_init = lasagne.init.GlorotNormal().sample( (vocab_size, self.embed_dim)) # W_init = lasagne.init.GlorotNormal().sample((vocab_size, self.embed_dim)) doc_var, query_var, cand_var = T.itensor3('doc'), T.itensor3('quer'), \ T.wtensor3('cand') docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \ T.bmatrix('c_mask') target_var = T.ivector('ans') feat_var = T.imatrix('feat') doc_toks, qry_toks = T.imatrix('dchars'), T.imatrix('qchars') tok_var, tok_mask = T.imatrix('tok'), T.bmatrix('tok_mask') cloze_var = T.ivector('cloze') self.inps = [ doc_var, doc_toks, query_var, qry_toks, cand_var, target_var, docmask_var, qmask_var, tok_var, tok_mask, candmask_var, feat_var, cloze_var ] self.predicted_probs, predicted_probs_val, self.network, W_emb, attentions = ( self.build_network(K, vocab_size, W_init, S_init)) self.loss_fn = T.nnet.categorical_crossentropy(self.predicted_probs, target_var).mean() self.eval_fn = lasagne.objectives.categorical_accuracy( self.predicted_probs, target_var).mean() loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val, target_var).mean() eval_fn_val = lasagne.objectives.categorical_accuracy( predicted_probs_val, target_var).mean() self.params = L.get_all_params(self.network, trainable=True) updates = lasagne.updates.adam(self.loss_fn, self.params, learning_rate=self.learning_rate) self.train_fn = theano.function( self.inps, [self.loss_fn, self.eval_fn, self.predicted_probs], updates=updates, on_unused_input='ignore') self.validate_fn = theano.function( self.inps, [loss_fn_val, eval_fn_val, predicted_probs_val] + attentions, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True), on_unused_input='ignore')
def train_loop(inputs, cost, train_data, times, prints=None, inject_total_iters=False, test_data=None, callback=None, optimizer=lasagne.updates.adam, save_params=False, nan_guard=False): params = lib.search(cost, lambda x: hasattr(x, 'param')) lib.print_params_info(params) grads = T.grad(cost, wrt=params, disconnected_inputs='warn') grads = [T.clip(g, lib.floatX(-1), lib.floatX(1)) for g in grads] updates = optimizer(grads, params) if prints is None: prints = [('cost', cost)] else: prints = [('cost', cost)] + prints print "Compiling train function..." if nan_guard: from theano.compile.nanguardmode import NanGuardMode mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) else: mode = None train_fn = theano.function(inputs, [p[1] for p in prints], updates=updates, on_unused_input='warn', mode=mode) print "Compiling eval function..." eval_fn = theano.function(inputs, [p[1] for p in prints], on_unused_input='warn') print "Training!" total_iters = 0 total_seconds = 0. last_print = 0 last_gen = 0 if len(times) >= 4: gen_every = times[3] else: gen_every = times[1] if len(times) >= 5: early_stop = times[4] if len(times) >= 6: early_stop_min = times[5] else: early_stop_min = 0 else: early_stop = None early_stop_min = None best_test_cost = np.inf best_test_cost_iter = 0. all_outputs = [] all_stats = [] for epoch in itertools.count(): generator = train_data() while True: try: inputs = generator.__next__() except StopIteration: break if inject_total_iters: inputs = [np.int32(total_iters)] + list(inputs) start_time = time.time() outputs = train_fn(*inputs) total_seconds += time.time() - start_time total_iters += 1 all_outputs.append(outputs) if total_iters == 1: try: # This only matters on Ishaan's computer import experiment_tools experiment_tools.register_crash_notifier() except ImportError: pass if (times[0]=='iters' and total_iters-last_print == times[1]) or \ (times[0]=='seconds' and total_seconds-last_print >= times[1]): mean_outputs = np.array(all_outputs).mean(axis=0) if test_data is not None: if inject_total_iters: test_outputs = [ eval_fn(np.int32(total_iters), *inputs) for inputs in test_data() ] else: test_outputs = [ eval_fn(*inputs) for inputs in test_data() ] test_mean_outputs = np.array(test_outputs).mean(axis=0) stats = collections.OrderedDict() stats['epoch'] = epoch stats['iters'] = total_iters for i, p in enumerate(prints): stats['train ' + p[0]] = mean_outputs[i] if test_data is not None: for i, p in enumerate(prints): stats['test ' + p[0]] = test_mean_outputs[i] stats['secs'] = total_seconds stats['secs/iter'] = total_seconds / total_iters if test_data != None and (stats['test cost'] < best_test_cost or (early_stop_min != None and total_iters <= early_stop_min)): best_test_cost = stats['test cost'] best_test_cost_iter = total_iters print_str = "" for k, v in stats.items(): if isinstance(v, int): print_str += "{}:{}\t".format(k, v) else: print_str += "{}:{:.4f}\t".format(k, v) print print_str[:-1] # omit the last \t all_stats.append(stats) all_outputs = [] last_print += times[1] if (times[0]=='iters' and total_iters-last_gen==gen_every) or \ (times[0]=='seconds' and total_seconds-last_gen >= gen_every): tag = "iters{}_time{}".format(total_iters, total_seconds) if callback is not None: callback(tag) if save_params: lib.save_params('params_{}.pkl'.format(tag)) last_gen += gen_every if (times[0]=='iters' and total_iters == times[2]) or \ (times[0]=='seconds' and total_seconds >= times[2]) or \ (test_data != None and early_stop != None and total_iters > 
(3*early_stop) and (total_iters-best_test_cost_iter) > early_stop): if (test_data != None and early_stop != None and total_iters > (3 * early_stop) and (total_iters - best_test_cost_iter) > early_stop): print "Early stop! Best test cost was {} at iter {}".format( best_test_cost, best_test_cost_iter) print "Done!" try: # This only matters on Ishaan's computer import experiment_tools experiment_tools.send_sms("done!") except ImportError: pass return all_stats
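# The nan_guard flag used when compiling train_fn above reduces to a
# conditional mode choice. In isolation (assumptions: plain Theano; the
# variables below are illustrative):
import theano
import theano.tensor as T

nan_guard = True
if nan_guard:
    from theano.compile.nanguardmode import NanGuardMode
    mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
else:
    mode = None  # theano.function falls back to the default mode

x = T.scalar()
train_fn = theano.function([x], x * x, mode=mode)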
y = net.fprop(x ** 2 / 2.)
cost = y.mean()
parameters = net.params

from blocks.algorithms import Scale
from blocks.algorithms import GradientDescent

optimizer = Scale(0.)

print "Calling Algorithm"
algorithm = GradientDescent(
    # gradients=grads, parameters=parameters,
    cost=cost, parameters=parameters,
    step_rule=optimizer)

from theano.compile.nanguardmode import NanGuardMode
fun = theano.function(inputs=[x], outputs=[cost],
                      updates=algorithm.updates,
                      mode=NanGuardMode(nan_is_error=True,
                                        inf_is_error=True,
                                        big_is_error=True))

# npx = getnumpyf32((5, batch_size, channels,)+image_size)
npx = np.random.random((5, 32, 50)).astype(np.float32)
out = fun(npx)

# for i, v in enumerate(parameters):
#     if 'U' in v.name:
#         theano.printing.debugprint(algorithm.updates[i][1])
#         break
def build_model(tparams, options): opt_ret = dict() trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # description string: #words x #samples x = tensor.matrix('x', dtype='int64') x_mask = tensor.matrix('x_mask', dtype='float32') y = tensor.matrix('y', dtype='int64') y_mask = tensor.matrix('y_mask', dtype='float32') # time_steps n_timesteps = x_mask.shape[0] n_timesteps_trg = y_mask.shape[0] n_samples = x_mask.shape[1] # word embedding for forward rnn (source) emb = tparams['Wemb'][x.flatten()] emb = emb.reshape([n_timesteps, n_samples, options['dim_word']]) proj = get_layer(options['encoder'])[1](tparams, emb, options, prefix='encoder', mask=x_mask) # for reverse RNN: bi-directional RNN encoder if options.get('birnn', False): xr = x[::-1] xr_mask = x_mask[::-1] embr = tparams['Wemb'][xr.flatten()] embr = embr.reshape([n_timesteps, n_samples, options['dim_word']]) projr = get_layer(options['encoder'])[1](tparams, embr, options, prefix='encoder_r', mask=xr_mask) ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1) else: ctx = proj[0] # context vectors # mean of the context (across time) will be used to initialize decoder rnn ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None] # or you can use the last state of forward + backward encoder rnns # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2) # initial decoder state init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') # word embedding (target), we will shift the target sequence one time step # to the right. This is done because of the bi-gram connections in the # readout and decoder rnn. The first target will be all zeros and we will # not condition on the last output. emb = tparams['Wemb_dec'][y.flatten()] emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']]) emb_shifted = tensor.zeros_like(emb) emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) emb = emb_shifted # decoder - pass through the decoder conditional gru with attention proj = get_layer(options['decoder'])[1](tparams, emb, options, prefix='decoder', mask=y_mask, context=ctx, context_mask=x_mask, one_step=False, init_state=init_state) # hidden states of the decoder gru proj_h = proj[0] # weighted averages of context, generated by attention module ctxs = proj[1] # weights (alignment matrix) opt_ret['dec_alphas'] = proj[2] # --> to show the attenion weights # compute word probabilities logit_lstm = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear') logit_prev = get_layer('ff')[1](tparams, emb, options, prefix='ff_logit_prev', activ='linear') logit_ctx = get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx) # dropout (noise) if options['use_dropout']: logit = dropout_layer(logit, use_noise, trng) logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') logit_shp = logit.shape probs = tensor.nnet.softmax( logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]])) # compute the cost (negative loglikelihood) y_flat = y.flatten() y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat cost = -tensor.log(probs.flatten()[y_flat_idx]) cost = cost.reshape([y.shape[0], y.shape[1]]) cost = (cost * y_mask).sum(0) # we will build an additional function for computing costs f_cost = theano.function([ctx, x_mask, y, y_mask], cost, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, 
big_is_error=True)) return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, f_cost
def trainer( r=5, dim_word=1000, dim=1000, trainpath=[ '../datasets/simQA_test.txt', '../datasets/cand_ent_test.txt', '../datasets/cand_rel_test.txt' ], validpath=[ '../datasets/simQA_test.txt', '../datasets/cand_ent_test.txt', '../datasets/cand_rel_test.txt' ], dict_character='../datasets/dict/dict.pkl', dict_relation='../datasets/dict/dict.pkl', dict_word='../datasets/dict/dict.pkl', relation_pattern='RWC', batch_size=16, valid_batch_size=16, maxlen=200, learning_rate=0.001, max_epochs=10, dispFreq=100, saveFreq=1000, validFreq=1000, saveto='model.npz', overwrite=True, patience=10, predicate_num=150, lstm_end='average', lstm_layers=2, word=False, word_dict_num=5000, relation_dict_num=8000, character_dict_num=200, cross=True, one_layer=False, en_decode_type='ff', qu_split=False, structure_number=3, en_pooling_type='average', # only for pooling question when entity decoding relation_attention='target_attention'): # theano.config.warn_float64 = "raise" model_options = locals().copy() train = TextIterator(trainpath[0], trainpath[1], trainpath[2], dict_character, dict_word, dict_relation, predicate_num=predicate_num, batch_size=model_options['batch_size'], maxlen=model_options['maxlen']) valid = TextIterator(validpath[0], validpath[1], validpath[2], dict_character, dict_word, dict_relation, predicate_num=predicate_num, batch_size=model_options['batch_size'], maxlen=model_options['maxlen']) InitParamsIns = InitParams() tparams = InitParamsIns.inittparams(model_options) ModelIns = MODEL() print 'Build Train and Valid Model...', x, x_mask, y, y_mask, z_rel, z_mask_rel, z_wor, chz_mask_wor, z_cha, chz_mask_cha, t, cost, errors = ModelIns.BuildTrainModel( tparams, model_options) x_v, x_mask_v, y_v, y_mask_v, z_rel_v, z_mask_rel_v, z_wor_v, chz_mask_wor_v, z_cha_v, chz_mask_cha_v, t_v, errors_v, en_errors_v, pr_errors_v = ModelIns.BuildValidTestModel( tparams, model_options) print 'Done' inputs_v = [ x_v, x_mask_v, y_v, y_mask_v, z_rel_v, z_mask_rel_v, z_wor_v, chz_mask_wor_v, z_cha_v, chz_mask_cha_v, t_v ] inputs = [ x, x_mask, y, y_mask, z_rel, z_mask_rel, z_wor, chz_mask_wor, z_cha, chz_mask_cha, t ] # alpha=[pr_alpha] outputs = [cost, errors] optputs_v = [errors_v, en_errors_v, pr_errors_v] func_ctx = theano.function(inputs, outputs, on_unused_input='ignore', allow_input_downcast=True, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) func_valid_error = theano.function(inputs_v, optputs_v, on_unused_input='ignore', allow_input_downcast=True) # func_p = theano.function(inputs,p,on_unused_input = 'ignore',allow_input_downcast=True) # func_alpha = theano.function(inputs,pr_alpha,on_unused_input = 'ignore',allow_input_downcast=True) print 'Building grad...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' print 'Building optimizers...', lr = tensor.scalar(name='lr') f_grad_shared, f_update = adadelta(lr, tparams, grads, inputs, cost) print 'Done' uidx = 0 best_p = None estop = False bad_counter = 0 history_right = [] for epoch_idx in xrange(max_epochs): n_samples = 0 for source, target, entity, predicate_relation, predicate_word, predicate_character in train: n_samples += len(source) uidx += 1 prepare_layer = PrepareDate(source, entity, predicate_character) x, x_mask, y, y_mask, z_relation, \ z_mask_relation,z_word, z_mask_word,z_character, \ z_mask_character,t = prepare_layer.prepare_valid_test_date_for_cross(source, entity, predicate_relation,predicate_word, predicate_character,target) if source is None: print 'Minibatch with zero sample' uidx -= 1 continue 
ud_start = time.time() cost = f_grad_shared(x, x_mask, y, y_mask, z_relation, z_mask_relation, z_word, z_mask_word, z_character, z_mask_character, t) # ctx_qu_rel,ctx_qu_wor,ctx_qu_cha,ctx_pr_rel,ctx_pr_wor,ctx_pr_cha=func_p(x, x_mask, y, y_mask, z_relation, # z_mask_relation,z_word, z_mask_word,z_character, # z_mask_character,t) f_update(learning_rate) ud = time.time() - ud_start if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' break if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', epoch_idx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud, 'learning_rate', learning_rate if numpy.mod(uidx, saveFreq) == 0: print 'Saving the best model...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_right, uidx=uidx, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # save with uidx if not overwrite: print 'Saving the model at iteration {0}...'.format(uidx), saveto_uidx = '{0}.iter{1}.npz'.format( os.path.splitext(saveto)[0], uidx) numpy.savez(saveto_uidx, history_errs=history_right, uidx=uidx, **unzip(tparams)) print 'Done' # validdata model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: rights = [] for source, target, entity, predicate_relation, predicate_word, predicate_character in valid: valid_prepare_layer = PrepareDate(source, entity, predicate_character) x, x_mask, y, y_mask, z_relation, \ z_mask_relation,z_word, z_mask_word,z_character, \ z_mask_character,t= valid_prepare_layer.prepare_valid_test_date_for_cross(source, entity, predicate_relation,predicate_word, predicate_character,target) right = func_valid_error(x, x_mask, y, y_mask, z_relation, z_mask_relation, z_word, z_mask_word, z_character, z_mask_character, t) rights.append(right[0]) right_arr = numpy.array(rights) valid_right = right_arr.mean() / valid_batch_size history_right.append(valid_right) if uidx == 0 or valid_right >= numpy.array( history_right).max(): best_p = unzip(tparams) bad_counter = 0 if len(history_right ) > patience and valid_right <= numpy.array( history_right)[:-patience].max(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break # if numpy.isnan(valid_err): # ipdb.set_trace() print 'Valid ', valid_right print 'seen %d samples' % n_samples if estop: break print 'Saving the model at epoch {0}...'.format(epoch_idx), saveto_uidx = '{0}.epoch{1}.npz'.format( os.path.splitext(saveto)[0], epoch_idx) numpy.savez(saveto_uidx, history_errs=history_right, uidx=uidx, **unzip(tparams)) print 'Done' if best_p is not None: zipp(best_p, tparams) rights = [] for source, target, entity, predicate_relation, predicate_word, predicate_character in valid: valid_prepare_layer = PrepareDate(source, entity, predicate_character) x, x_mask, y, y_mask, z_relation, \ z_mask_relation,z_word, z_mask_word,z_character, \ z_mask_character,t= valid_prepare_layer.prepare_valid_test_date_for_cross(source, entity, predicate_relation,predicate_word, predicate_character,target) right = func_valid_error(x, x_mask, y, y_mask, z_relation, z_mask_relation, z_word, z_mask_word, z_character, z_mask_character, t) rights.append(right[0]) right_arr = numpy.array(rights) valid_right = right_arr.mean() / valid_batch_size print 'Valid ', valid_right # train_err =numpy.array(p_train).mean()/batch_size params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_right, uidx=uidx, **params) return valid_right
def setup(self): """ Set up the model to train. """ # input_words: shape (n_batch, n_sentence, sentence_len) input_words = T.itensor3() n_batch, n_sentences, sentence_len = input_words.shape # query_words: shape (n_batch, query_len) query_words = T.imatrix() # correct_output: shape (n_batch, ?, num_output_words) correct_output = T.ftensor3() # graph_num_new_nodes: shape(n_batch, n_sentence) graph_num_new_nodes = T.imatrix() # graph_new_node_strengths: shape(n_batch, n_sentence, new_nodes_per_iter) graph_new_node_strengths = T.ftensor3() # graph_new_node_ids: shape(n_batch, n_sentence, new_nodes_per_iter, num_node_ids) graph_new_node_ids = T.ftensor4() # graph_new_edges: shape(n_batch, n_sentence, pad_graph_size, pad_graph_size, num_edge_types) graph_new_edges = T.TensorType('floatX', (False, ) * 5)() def _build(with_correct_graph, snap_to_best, using_dropout, evaluate_accuracy): info = {} # Process each sentence, flattened to (?, sentence_len) flat_input_words = input_words.reshape([-1, sentence_len]) flat_input_reprs, flat_ref_matrices = self.input_transformer.process( flat_input_words) # flat_input_reprs of shape (?, input_repr_size) # flat_ref_matrices of shape (?, num_node_ids, input_repr_size) input_reprs = flat_input_reprs.reshape( [n_batch, n_sentences, self.input_repr_size]) ref_matrices = flat_ref_matrices.reshape([ n_batch, n_sentences, self.num_node_ids, self.input_repr_size ]) query_repr, query_ref_matrix = self.input_transformer.process( query_words) if using_dropout: iter_dropouts = [] states_mask = util.make_dropout_mask( (self.node_state_size, ), self.dropout_keep, self.srng) if self.nodes_mutable: iter_dropouts.extend( self.node_state_updater.dropout_masks( self.srng, states_mask)) if len(self.word_node_mapping) > 0: iter_dropouts.extend( self.direct_reference_updater.dropout_masks( self.srng, states_mask)) if self.intermediate_propagate != 0: iter_dropouts.extend( self.intermediate_propagator.dropout_masks( self.srng, states_mask)) if self.dynamic_nodes: iter_dropouts.extend( self.new_node_adder.dropout_masks(self.srng)) iter_dropouts.extend( self.edge_state_updater.dropout_masks(self.srng)) else: iter_dropouts = [] states_mask = None def _iter_fn(input_repr, ref_matrix, gstate, correct_num_new_nodes=None, correct_new_strengths=None, correct_new_node_ids=None, correct_edges=None, dropout_masks=None): # If necessary, update node state if self.nodes_mutable: gstate, dropout_masks = self.node_state_updater.process( gstate, input_repr, dropout_masks) if len(self.word_node_mapping) > 0: gstate, dropout_masks = self.direct_reference_updater.process( gstate, ref_matrix, dropout_masks) # If necessary, propagate node state if self.intermediate_propagate != 0: gstate, dropout_masks = self.intermediate_propagator.process_multiple( gstate, self.intermediate_propagate, dropout_masks) node_loss = None node_accuracy = None # Propose and vote on new nodes if self.dynamic_nodes: new_strengths, new_ids, dropout_masks = self.new_node_adder.get_candidates( gstate, input_repr, self.new_nodes_per_iter, dropout_masks) # new_strengths and correct_new_strengths are of shape (n_batch, new_nodes_per_iter) # new_ids and correct_new_node_ids are of shape (n_batch, new_nodes_per_iter, num_node_ids) if with_correct_graph: perm_idxs = np.array( list( itertools.permutations( range(self.new_nodes_per_iter)))) permuted_correct_str = correct_new_strengths[:, perm_idxs] permuted_correct_ids = correct_new_node_ids[:, perm_idxs] # due to advanced indexing, we should have shape (n_batch, permutation, 
new_nodes_per_iter, num_node_ids) ext_new_str = T.shape_padaxis(new_strengths, 1) ext_new_ids = T.shape_padaxis(new_ids, 1) strength_ll = permuted_correct_str * T.log( ext_new_str + util.EPSILON) + (1 - permuted_correct_str) * T.log( 1 - ext_new_str + util.EPSILON) ids_ll = permuted_correct_ids * T.log(ext_new_ids + util.EPSILON) reduced_perm_lls = T.sum(strength_ll, axis=2) + T.sum( ids_ll, axis=[2, 3]) if self.best_node_match_only: node_loss = -T.max(reduced_perm_lls, 1) else: full_ll = util.reduce_log_sum(reduced_perm_lls, 1) # Note that some of these permutations are identical, since we likely did not add the maximum # amount of nodes. Thus we will have added repeated elements here. # We have log(x+x+...+x) = log(kx), where k is the repetition factor and x is the probability we want # log(kx) = log(k) + log(x) # Our repetition factor k is given by (new_nodes_per_iter - correct_num_new_nodes)! # Recall that n! = gamma(n+1) # so log(x) = log(kx) - log(gamma(k+1)) log_rep_factor = T.gammaln( T.cast( self.new_nodes_per_iter - correct_num_new_nodes + 1, 'floatX')) scaled_ll = full_ll - log_rep_factor node_loss = -scaled_ll if evaluate_accuracy: best_match_idx = T.argmax(reduced_perm_lls, 1) # should be of shape (n_batch), indexing the best permutation best_correct_str = permuted_correct_str[ T.arange(n_batch), best_match_idx] best_correct_ids = permuted_correct_ids[ T.arange(n_batch), best_match_idx] snapped_strengths = util.independent_best( new_strengths) snapped_ids = util.categorical_best( new_ids) * T.shape_padright(snapped_strengths) close_strengths = T.all( T.isclose(best_correct_str, snapped_strengths), (1)) close_ids = T.all( T.isclose(best_correct_ids, snapped_ids), (1, 2)) node_accuracy = T.and_(close_strengths, close_ids) # now substitute in the correct nodes gstate = gstate.with_additional_nodes( correct_new_strengths, correct_new_node_ids) elif snap_to_best: snapped_strengths = util.independent_best( new_strengths) snapped_ids = util.categorical_best(new_ids) gstate = gstate.with_additional_nodes( snapped_strengths, snapped_ids) else: gstate = gstate.with_additional_nodes( new_strengths, new_ids) # Update edge state gstate, dropout_masks = self.edge_state_updater.process( gstate, input_repr, dropout_masks) if with_correct_graph: cropped_correct_edges = correct_edges[:, :gstate.n_nodes, : gstate.n_nodes, :] edge_lls = cropped_correct_edges * T.log( gstate.edge_strengths + util.EPSILON) + (1 - cropped_correct_edges) * T.log( 1 - gstate.edge_strengths + util.EPSILON) # edge_lls currently penalizes for edges connected to nodes that do not exist # we do not want it to do this, so we mask it with node strengths mask_src = util.shape_padaxes(gstate.node_strengths, [2, 3]) mask_dest = util.shape_padaxes(gstate.node_strengths, [1, 3]) masked_edge_lls = edge_lls * mask_src * mask_dest edge_loss = -T.sum(masked_edge_lls, axis=[1, 2, 3]) if evaluate_accuracy: snapped_edges = util.independent_best( gstate.edge_strengths) close_edges = T.isclose(cropped_correct_edges, snapped_edges) ok_mask = 1 - T.cast( mask_src * mask_dest, 'int8' ) # its OK for things not to match if node strengths are NOT both 1 edge_accuracy = T.all(T.or_(close_edges, ok_mask), (1, 2, 3)) overall_accuracy = edge_accuracy if node_accuracy is None else T.and_( node_accuracy, edge_accuracy) else: overall_accuracy = None gstate = gstate.with_updates( edge_strengths=cropped_correct_edges) return gstate, node_loss, edge_loss, overall_accuracy elif snap_to_best: snapped_edges = util.independent_best( gstate.edge_strengths) 
gstate = gstate.with_updates(edge_strengths=snapped_edges) return gstate else: return gstate # Scan over each sentence def _scan_fn( input_repr, *stuff ): # (input_repr, [ref_matrix?], [*correct_graph_stuff?], [dropout_masks?], *flat_graph_state, pad_graph_size) stuff = list(stuff) if len(self.word_node_mapping) > 0: ref_matrix = stuff[0] stuff = stuff[1:] else: ref_matrix = None if with_correct_graph: c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges = stuff[: 4] stuff = stuff[4:] if using_dropout: dropout_masks = stuff[:len(iter_dropouts)] stuff = stuff[len(iter_dropouts):] else: dropout_masks = None flat_graph_state = stuff[:-1] pad_graph_size = stuff[-1] gstate = GraphState.unflatten_from_const_size(flat_graph_state) if with_correct_graph: gstate, node_loss, edge_loss, overall_accuracy = _iter_fn( input_repr, ref_matrix, gstate, c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges, dropout_masks=dropout_masks) else: gstate = _iter_fn(input_repr, ref_matrix, gstate, dropout_masks=dropout_masks) retvals = gstate.flatten_to_const_size(pad_graph_size) if with_correct_graph: if self.dynamic_nodes: retvals.append(node_loss) retvals.append(edge_loss) if evaluate_accuracy: retvals.append(overall_accuracy) return retvals if self.dynamic_nodes: initial_gstate = GraphState.create_empty( n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types) else: initial_gstate = GraphState.create_full_unique( n_batch, self.num_node_ids, self.node_state_size, self.num_edge_types) # Account for all nodes, plus the extra padding node to prevent GPU unpleasantness if self.dynamic_nodes: pad_graph_size = n_sentences * self.new_nodes_per_iter + 1 else: pad_graph_size = self.num_node_ids outputs_info = initial_gstate.flatten_to_const_size(pad_graph_size) prepped_input = input_reprs.dimshuffle([1, 0, 2]) sequences = [prepped_input] if len(self.word_node_mapping) > 0: sequences.append(ref_matrices.dimshuffle([1, 0, 2, 3])) if with_correct_graph: sequences.append(graph_num_new_nodes.swapaxes(0, 1)) sequences.append(graph_new_node_strengths.swapaxes(0, 1)) sequences.append(graph_new_node_ids.swapaxes(0, 1)) sequences.append(graph_new_edges.swapaxes(0, 1)) if self.dynamic_nodes: outputs_info.extend([None]) if evaluate_accuracy: outputs_info.extend([None]) outputs_info.extend([None]) if using_dropout: sequences.extend(iter_dropouts) all_scan_out, _ = theano.scan(_scan_fn, sequences=sequences, outputs_info=outputs_info, non_sequences=[pad_graph_size]) graph_accurate_list = None if with_correct_graph: if evaluate_accuracy: full_graph_accuracy = all_scan_out[-1] all_scan_out = all_scan_out[:-1] graph_accurate_list = T.all(full_graph_accuracy, 0) info["graph_accuracy"] = T.sum(graph_accurate_list, dtype='floatX') / T.cast( n_batch, 'floatX') if self.dynamic_nodes: all_flat_gstates = all_scan_out[:-2] node_loss, edge_loss = all_scan_out[-2:] reduced_node_loss = T.sum(node_loss) / T.cast( n_batch, 'floatX') reduced_edge_loss = T.sum(edge_loss) / T.cast( n_batch, 'floatX') avg_graph_loss = (reduced_node_loss + reduced_edge_loss) / T.cast( input_words.shape[1], 'floatX') info["node_loss"] = reduced_node_loss info["edge_loss"] = reduced_edge_loss else: all_flat_gstates = all_scan_out[:-1] edge_loss = all_scan_out[-1] reduced_edge_loss = T.sum(edge_loss) / T.cast( n_batch, 'floatX') avg_graph_loss = reduced_edge_loss / T.cast( input_words.shape[1], 'floatX') info["edge_loss"] = reduced_edge_loss else: all_flat_gstates = all_scan_out if self.sequence_representation: # Each part of all_flat_gstates is of 
shape (n_sentences, n_batch, ...) # except for the last one, which we handle separately # Swap to (n_batch, n_sentences, ...) # Then flatten to (n_batch*n_sentences, ...) for further processing final_flat_gstate = [ x.swapaxes(0, 1).reshape(T.concatenate([[-1], x.shape[2:]]), ndim=(x.ndim - 1)) for x in all_flat_gstates[:-1] ] # As for the last one, we need to get a single scalar value. The last one will be the biggest # so we will take that. Note that this will introduce a bunch of zero-nodes, but thats # OK and we can process that later. (We REQUIRE that padding in graph_state makes zero strength # nodes here!) final_flat_gstate.append(all_flat_gstates[-1][-1]) # We also need to repeat query_repr and query_ref_matrix so that they broadcast together query_repr = T.extra_ops.repeat(query_repr, n_sentences, 0) query_ref_matrix = T.extra_ops.repeat(query_ref_matrix, n_sentences, 0) else: # Extract last timestep final_flat_gstate = [x[-1] for x in all_flat_gstates] final_gstate = GraphState.unflatten_from_const_size( final_flat_gstate) if self.train_with_query: if self.wipe_node_state: final_gstate = final_gstate.with_updates( node_states=T.zeros_like(final_gstate.node_states)) qnsu_dropout_masks = self.query_node_state_updater.dropout_masks( self.srng, states_mask) query_gstate, _ = self.query_node_state_updater.process( final_gstate, query_repr, qnsu_dropout_masks) if len(self.word_node_mapping) > 0: qdru_dropout_masks = self.query_direct_reference_updater.dropout_masks( self.srng, states_mask) query_gstate, _ = self.query_direct_reference_updater.process( query_gstate, query_ref_matrix, qdru_dropout_masks) fp_dropout_masks = self.final_propagator.dropout_masks( self.srng, states_mask) propagated_gstate, _ = self.final_propagator.process_multiple( query_gstate, self.final_propagate, fp_dropout_masks) agg_dropout_masks = self.aggregator.dropout_masks(self.srng) aggregated_repr, _ = self.aggregator.process( propagated_gstate, agg_dropout_masks) # shape (n_batch, output_repr_size) if self.sequence_representation: # aggregated_repr is of shape (n_batch*n_sentences, repr_width) # We want to split back to timesteps: (n_batch, n_sentences, repr_width) agg_repr_seq = aggregated_repr.reshape( [n_batch, n_sentences, -1]) # Now collapse it to a summary representation aggsum_dropout_masks = self.aggregate_summarizer.dropout_masks( self.srng) aggregated_repr, _ = self.aggregate_summarizer.process( agg_repr_seq, aggsum_dropout_masks) # At this point aggregated_repr is (n_batch, repr_width) as desired max_seq_len = correct_output.shape[1] if self.output_format == ModelOutputFormat.sequence: final_output = self.output_processor.process( aggregated_repr, max_seq_len) # shape (n_batch, ?, num_output_words) else: final_output = self.output_processor.process( aggregated_repr) if snap_to_best: final_output = self.output_processor.snap_to_best( final_output) if self.output_format == ModelOutputFormat.subset: elemwise_loss = T.nnet.binary_crossentropy( final_output, correct_output) query_loss = T.sum(elemwise_loss) else: flat_final_output = final_output.reshape( [-1, self.num_output_words]) flat_correct_output = correct_output.reshape( [-1, self.num_output_words]) timewise_loss = T.nnet.categorical_crossentropy( flat_final_output, flat_correct_output) query_loss = T.sum(timewise_loss) query_loss = query_loss / T.cast(n_batch, 'floatX') info["query_loss"] = query_loss else: final_output = T.zeros([]) full_loss = np.array(0.0, np.float32) if with_correct_graph: full_loss = full_loss + avg_graph_loss if 
self.train_with_query: full_loss = full_loss + query_loss if self.train_with_query: adjusted_query_gstates = [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim + 1)) if self.sequence_representation else T.shape_padaxis(x, 1) for x in query_gstate.flatten() ] adjusted_prop_gstates = [ x.reshape(T.concatenate([[n_batch, n_sentences], x.shape[1:]]), ndim=(x.ndim + 1)) if self.sequence_representation else T.shape_padaxis(x, 1) for x in propagated_gstate.flatten() ] full_flat_gstates = [ T.concatenate([a.swapaxes(0, 1), b, c], 1) for a, b, c in zip(all_flat_gstates[:-1], adjusted_query_gstates, adjusted_prop_gstates) ] else: full_flat_gstates = [ a.swapaxes(0, 1) for a in all_flat_gstates[:-1] ] max_seq_len = T.iscalar() return full_loss, final_output, full_flat_gstates, graph_accurate_list, max_seq_len, info train_loss, _, _, _, _, train_info = _build(self.train_with_graph, False, True, False) adam_updates = Adam(train_loss, self.params, lr=self.learning_rate_var) self.info_keys = list(train_info.keys()) print("Compiling...") optimizer = theano.compile.predefined_optimizers[ 'fast_run' if self.check_mode == 'debug' else theano.config.optimizer] optimizer = optimizer.excluding( "scanOp_pushout_output", "remove_constants_and_unused_inputs_scan") if self.check_mode == 'nan': mode = NanGuardMode(optimizer=optimizer, nan_is_error=True, inf_is_error=True, big_is_error=True) elif self.check_mode == 'debug': mode = DebugMode(optimizer=optimizer, check_isfinite=False, check_py_code=False, stability_patience=1) theano.tensor.TensorType.filter_checks_isfinite = False else: mode = theano.Mode(optimizer=optimizer) self.train_fn = theano.function([ input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges ], [train_loss] + list(train_info.values()), updates=adam_updates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) eval_loss, _, full_flat_gstates, graph_accurate_list, _, eval_info = _build( self.train_with_graph, False, False, True) self.eval_info_keys = list(eval_info.keys()) self.eval_fn = theano.function([ input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges ], [eval_loss, graph_accurate_list] + list(eval_info.values()), allow_input_downcast=True, on_unused_input='ignore', mode=mode) self.debug_test_fn = theano.function([ input_words, query_words, correct_output, graph_num_new_nodes, graph_new_node_strengths, graph_new_node_ids, graph_new_edges ], full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build( False, False, False, False) self.fuzzy_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []), [final_output] + full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode) test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build( False, True, False, False) self.snap_test_fn = theano.function( [input_words, query_words] + ([max_seq_len] if self.output_format == ModelOutputFormat.sequence else []), [final_output] + full_flat_gstates, allow_input_downcast=True, on_unused_input='ignore', mode=mode)
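# A minimal, standalone sketch of the compilation-mode switching used above: a single
# check_mode flag picks between NanGuardMode, DebugMode, and an ordinary optimizing Mode.
# The helper name `pick_mode` and the toy softmax function are illustrative assumptions,
# not part of the model code.
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode
from theano.compile.debugmode import DebugMode


def pick_mode(check_mode=None):
    if check_mode == 'nan':
        # Abort as soon as any intermediate value is NaN, Inf, or suspiciously large
        return NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
    elif check_mode == 'debug':
        # Much slower; cross-checks op implementations and intermediate results
        return DebugMode(check_isfinite=False, check_py_code=False,
                         stability_patience=1)
    else:
        # Fall back to the normal optimizing mode
        return theano.Mode(optimizer=theano.config.optimizer)


x = T.matrix('x')
softmax_fn = theano.function([x], T.nnet.softmax(x), mode=pick_mode('nan'))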
def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None,
               constraint_name="constraint", *args, **kwargs):
    """
    :param loss: Symbolic expression for the loss function.
    :param target: A parameterized object to optimize over. It should implement
        methods of the :class:`rllab.core.parameterized.Parameterized` class.
    :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the
        form f(*inputs) <= epsilon.
    :param inputs: A list of symbolic variables as inputs, which could be
        subsampled if needed. It is assumed that the first dimension of these
        inputs corresponds to the number of data points.
    :param extra_inputs: A list of symbolic variables as extra inputs which
        should not be subsampled.
    :return: No return value.
    """
    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)
    constraint_term, constraint_value = leq_constraint

    params = target.get_params(trainable=True)
    grads = theano.grad(loss, wrt=params)
    flat_grad = ext.flatten_tensor_variables(grads)

    # Hessian-vector product of the constraint, computed via double backprop:
    # grad(sum(grad(constraint) * x)) w.r.t. params equals H @ x.
    constraint_grads = theano.grad(constraint_term, wrt=params)
    xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])
    Hx_plain_splits = TT.grad(
        TT.sum([TT.sum(g * x)
                for g, x in itertools.izip(constraint_grads, xs)]),
        wrt=params,
    )
    Hx_plain = TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

    self._target = target
    self._max_constraint_val = constraint_value
    self._constraint_name = constraint_name

    if self._debug_nan:
        from theano.compile.nanguardmode import NanGuardMode
        mode = NanGuardMode(nan_is_error=True, inf_is_error=True,
                            big_is_error=True)
    else:
        mode = None

    self._opt_fun = ext.lazydict(
        f_loss=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=loss,
            log_name="f_loss",
            mode=mode,
        ),
        f_grad=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=flat_grad,
            log_name="f_grad",
            mode=mode,
        ),
        f_Hx_plain=lambda: ext.compile_function(
            inputs=inputs + extra_inputs + xs,
            outputs=Hx_plain,
            log_name="f_Hx_plain",
            mode=mode,
        ),
        f_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=constraint_term,
            log_name="constraint",
            mode=mode,
        ),
        f_loss_constraint=lambda: ext.compile_function(
            inputs=inputs + extra_inputs,
            outputs=[loss, constraint_term],
            log_name="f_loss_constraint",
            mode=mode,
        ),
    )
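# A self-contained sketch (plain Theano; the toy objective, `w`, and `f_Hx` are
# illustrative assumptions, not rllab code) of the Hessian-vector-product trick that
# `Hx_plain` above relies on: differentiating sum(grad(f) * x) with respect to the
# parameters yields H @ x without ever forming the Hessian, which is exactly what a
# conjugate-gradient solver for the leq_constraint needs.
import numpy as np
import theano
import theano.tensor as TT

w = theano.shared(np.array([1.0, 2.0], dtype=theano.config.floatX), name='w')
f = TT.sum(w ** 3)              # toy scalar objective; its Hessian is diag(6 * w)
x = TT.vector('x')              # direction to multiply with

g = TT.grad(f, w)               # gradient of f w.r.t. w
Hx = TT.grad(TT.sum(g * x), w)  # gradient of (g . x)  ==  H @ x

f_Hx = theano.function([x], Hx, allow_input_downcast=True)
print(f_Hx([1.0, 0.0]))         # -> [6., 0.]  since diag(6 * w) = [6., 12.]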
def main(num_epochs=1, n_songs_train=1, n_songs_val=1, n_songs_test=1, batch_size=256, learning_rate=1e-4): """ Main function """ # Theano config theano.config.floatX = 'float32' train, val, test = None, None, None try: train, val, test = use_preparsed_data(outputdir='/zap/tsob/audio/', ) except: train, val, test = get_data(n_songs_train=n_songs_train, n_songs_val=n_songs_val, n_songs_test=n_songs_test, outputdir='/zap/tsob/audio/', seed=None) # Save the returned metadata np.savez('/zap/tsob/audio/metadata', train, val, test) # Print the dimensions print "Data dimensions:" for datapt in [ train['Xshape'], train['yshape'], val['Xshape'], val['yshape'], test['Xshape'], test['yshape'] ]: print datapt # Parse dimensions n_train = train['yshape'][0] n_val = val['yshape'][0] n_test = test['yshape'][0] n_chan = train['Xshape'][1] n_feats = train['Xshape'][2] n_frames = train['Xshape'][3] print "n_train = {0}".format(n_train) print "n_val = {0}".format(n_val) print "n_test = {0}".format(n_test) print "n_chan = {0}".format(n_chan) print "n_feats = {0}".format(n_feats) print "n_frames = {0}".format(n_frames) # Prepare Theano variables for inputs and targets input_var = T.tensor4(name='inputs') target_var = T.fcol(name='targets') # Create neural network model (depending on first command line parameter) print("Building model and compiling functions..."), network = build_cnn(input_var) print("Done.") # Create a loss expression for training, i.e., a scalar objective we want to minimize prediction = lasagne.layers.get_output(network) loss = lasagne.objectives.binary_hinge_loss(prediction, target_var) loss = loss.mean() # Create update expressions for training # Here, we'll use adam params = lasagne.layers.get_all_params(network, trainable=True) updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate, beta1=0.95, beta2=0.999, epsilon=1e-08) # Create a loss expression for validation/testing. # The crucial difference here is that we do a deterministic forward pass # through the network, disabling dropout layers. test_prediction = lasagne.layers.get_output(network, deterministic=True) test_loss = lasagne.objectives.binary_hinge_loss(test_prediction, target_var) test_loss = test_loss.mean() test_pred_fn = theano.function([input_var], test_prediction, allow_input_downcast=True) # As a bonus, also create an expression for the classification accuracy: test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function( [input_var, target_var], loss, updates=updates, mode=NanGuardMode( #TODO remove nan_is_error=True, inf_is_error=True, big_is_error=True #TODO remove ), #TODO remove allow_input_downcast=True) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target_var], [test_loss, test_acc], allow_input_downcast=True) # Finally, launch the training loop. 
print("Starting training...") train_error_hist = [] # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(train, batch_size, shuffle=True): inputs, targets = batch train_err_increment = train_fn(inputs, targets) train_err += train_err_increment train_error_hist.append(train_err_increment) train_batches += 1 # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 for batch in iterate_minibatches(val, batch_size, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{:.8f}".format(train_err / train_batches)) print(" validation loss:\t\t{:.8f}".format(val_err / val_batches)) print(" validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100)) print("Done training.") # After training, we compute and print the test error: test_err = 0 test_acc = 0 test_batches = 0 test_predictions = [] for batch in iterate_minibatches(test, batch_size, shuffle=False): inputs, targets = batch err, acc = val_fn(inputs, targets) test_predictions.append(test_pred_fn(inputs)) test_err += err test_acc += acc test_batches += 1 print("Final results:") print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches)) print(" test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100)) # Optionally, you could now dump the network weights to a file like this: timestr = str(time.time()) np.savez('/zap/tsob/audio/model' + timestr + '.npz', *lasagne.layers.get_all_param_values(network)) np.save('/zap/tsob/audio/train_error_hist' + timestr + '.npy', train_error_hist) np.save('/zap/tsob/audio/test_predictions' + timestr + '.npy', test_predictions) print "Wrote model to {0}, test error histogram to {1}, and test predictions to {2}".format( 'model' + timestr + '.npz', 'train_error_hist' + timestr + '.npy', 'test_predictions' + timestr + '.npy')
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True): # def setup_model_and_stream(exp_config, source_vocab, target_vocab): # def setup_model_and_stream(exp_config, source_vocab, target_vocab): train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream = setup_model_and_stream( exp_config, source_vocab, target_vocab) cost = create_model(train_encoder, train_decoder, exp_config.get('imt_smoothing_constant', 0.005)) # Set up training model logger.info("Building model") train_model = Model(cost) # Set the parameters from a trained models (.npz file) logger.info("Loading parameters from model: {}".format( exp_config['saved_parameters'])) # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API param_values = LoadNMT.load_parameter_values( exp_config['saved_parameters'], brick_delimiter=exp_config.get('brick_delimiter', None)) LoadNMT.set_model_parameters(train_model, param_values) logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING if exp_config.get('l2_regularization', False) is True: l2_reg_alpha = exp_config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to rename the cost variable? Where did the original name come from? cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization # Note dropout variables are hard-coded here if exp_config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, exp_config['dropout']) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(exp_config['saveto']): os.makedirs(exp_config['saveto']) # TODO: mv the actual config file once we switch to .yaml for min-risk shutil.copy(exp_config['config_file'], exp_config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=exp_config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(exp_config['saveto'], every_n_batches=exp_config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary # TODO: change the if statement here if exp_config['hook_samples'] >= 1 or exp_config['bleu_script'] is not None: logger.info("Building sampling model") search_model = Model(generated) _, samples = VariableFilter( bricks=[train_decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling -- TODO: sampling is broken for min-risk #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu # TODO: use multimodal meteor and BLEU validator # TODO: add 'validator' key to IMT config # Add early stopping based on bleu if exp_config.get('bleu_script', None) is not None: 
logger.info("Building bleu validator") extensions.append( BleuValidator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=exp_config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=exp_config['normalized_bleu'], every_n_batches=exp_config['bleu_val_freq'])) if exp_config.get('imt_f1_validation', False) is not False: logger.info("Building imt F1 validator") extensions.append( IMT_F1_Validator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=exp_config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=exp_config['normalized_bleu'], every_n_batches=exp_config['bleu_val_freq'])) # Add early stopping based on Meteor # if exp_config.get('meteor_directory', None) is not None: # logger.info("Building meteor validator") # extensions.append( # MeteorValidator(theano_sampling_source_input, theano_sampling_context_input, # samples=samples, # config=config, # model=search_model, data_stream=dev_stream, # src_vocab=src_vocab, # trg_vocab=trg_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if exp_config['reload']: extensions.append(LoadNMT(exp_config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(exp_config['model_save_directory'], channels=[[ 'decoder_cost_cost', 'validation_set_imt_f1_score', 'validation_set_bleu_score', 'validation_set_meteor_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph # WORKING: try to catch and fix nan if exp_config['dropout'] < 1.0: if exp_config.get('nan_guard', False): from theano.compile.nanguardmode import NanGuardMode algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn', theano_func_kwargs={ 'mode': NanGuardMode(nan_is_error=True, inf_is_error=True) }) else: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn') else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn') # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=train_model, algorithm=algorithm, data_stream=masked_stream, extensions=extensions) # Train! main_loop.run()
def build(self): config = self.config processor = self.processor source_inputs = T.imatrix() target_inputs = T.imatrix() target_outputs = T.imatrix() source_mask_inputs = T.matrix() target_mask_inputs = T.matrix() # map_inputs = T.tensor3() l_source_inputs = lasagne.layers.InputLayer(shape=(None, config.source_len), input_var=source_inputs) l_target_inputs = lasagne.layers.InputLayer(shape=(None, config.target_len), input_var=target_inputs) l_output = lasagne.layers.InputLayer(shape=(None, config.target_len), input_var=target_outputs) l_source_mask_inputs = lasagne.layers.InputLayer( shape=(None, config.source_len), input_var=source_mask_inputs) l_target_mask_inputs = lasagne.layers.InputLayer( shape=(None, config.target_len), input_var=target_mask_inputs) # l_map_inputs = lasagne.layers.InputLayer(shape=(None, config.source_len, processor.target_vocab_size), # input_var=map_inputs) l_source = lasagne.layers.EmbeddingLayer(l_source_inputs, processor.source_vocab_size, config.embedding_size) l_target = lasagne.layers.EmbeddingLayer(l_target_inputs, processor.target_vocab_size, config.embedding_size) self.W1 = l_source.W self.W2 = l_target.W # T.sum(l_source.W) # l_s_gru_fw = lasagne.layers.GRULayer(l_source, config.enc_units, mask_input=l_source_mask_inputs, # grad_clipping=config.grad_clipping) # l_s_gru_bw = lasagne.layers.GRULayer(l_source, config.enc_units, mask_input=l_source_mask_inputs, # grad_clipping=config.grad_clipping) # l_source = lasagne.layers.ConcatLayer([l_s_gru_fw, l_s_gru_bw], axis=2) # l_source = lasagne.layers.GRULayer(l_source, config.enc_units, mask_input=l_source_mask_inputs, # grad_clipping=config.grad_clipping) # l_source_last = lasagne.layers.ElemwiseSumLayer(l_source) #lasagne.layers.SliceLayer(l_source, -1, axis=1) l_target_outputs = layers.GRUCoverageTrainLayer( l_target_inputs, config.dec_units, mask_input=l_target_mask_inputs, grad_clipping=config.grad_clipping, source_token_cnt=processor.source_vocab_size, target_token_cnt=processor.target_vocab_size, l_enc_feat=l_source, l_enc_mask=l_source_mask_inputs, l_output=l_output, W_emb=self.W2, unk_index=processor.get_char_index( 'UNK', False)) #, hid_init=l_source_last) l_t = l_target_outputs l_target_outputs = lasagne.layers.ReshapeLayer( l_target_outputs, (-1, [2])) # (batch * dec_len, vocab + extra) l_gen = layers.GRUCoverageTestLayer( config.dec_units, grad_clipping=config.grad_clipping, source_token_cnt=processor.source_vocab_size, target_token_cnt=processor.target_vocab_size, l_enc_feat=l_source, l_enc_mask=l_source_mask_inputs, W_emb=self.W2, resetgate=l_t.resetgate, updategate=l_t.updategate, hidden_update=l_t.hidden_update, #hid_init=l_source_last, unk_index=processor.get_char_index('UNK', False), start_index=processor.get_char_index('START', False), W_gen=l_t.W_gen, gen_len=config.target_len) l_att = layers.GRUCoverageAttLayer( config.dec_units, grad_clipping=config.grad_clipping, source_token_cnt=processor.source_vocab_size, target_token_cnt=processor.target_vocab_size, l_enc_feat=l_source, l_enc_mask=l_source_mask_inputs, W_emb=self.W2, resetgate=l_t.resetgate, updategate=l_t.updategate, hidden_update=l_t.hidden_update, #hid_init=l_source_last, unk_index=processor.get_char_index('UNK', False), start_index=processor.get_char_index('START', False), W_gen=l_t.W_gen, gen_len=config.target_len) self.l = l_target_outputs py = lasagne.layers.get_output(l_target_outputs) loss = (py * T.extra_ops.to_one_hot(target_outputs.flatten(), processor.target_vocab_size)).sum( axis=1) # (batch * dec_len) loss = -(loss * 
target_mask_inputs.flatten()).mean() params = lasagne.layers.get_all_params(self.l, trainable=True) updates = lasagne.updates.adam(loss, params, learning_rate=config.learning_rate) gen_y = lasagne.layers.get_output(l_gen) gen_att = lasagne.layers.get_output(l_att) self.train_fn = theano.function([ source_inputs, target_inputs, target_outputs, source_mask_inputs, target_mask_inputs ], None, updates=updates, on_unused_input='ignore', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) self.loss_fn = theano.function([ source_inputs, target_inputs, target_outputs, source_mask_inputs, target_mask_inputs ], loss, on_unused_input='ignore', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)) self.test_fn = theano.function([source_inputs, source_mask_inputs], gen_y, on_unused_input='ignore') self.att_fn = theano.function([source_inputs, source_mask_inputs], gen_att, on_unused_input='ignore') l_samp = layers.GRUCopyPureSampleLayer( config.dec_units, grad_clipping=config.grad_clipping, source_token_cnt=processor.source_vocab_size, target_token_cnt=processor.target_vocab_size, l_enc_feat=l_source, l_enc_mask=l_source_mask_inputs, W_emb=self.W2, resetgate=l_t.resetgate, updategate=l_t.updategate, hidden_update=l_t.hidden_update, #hid_init=l_source_last, unk_index=processor.get_char_index('UNK', False), start_index=processor.get_char_index('START', False), gen_len=config.target_len, W_gen=l_t.W_gen, MRG_stream=self.MRG_stream) # (batch, dec_len) samp_y = lasagne.layers.get_output(l_samp) self.sample_fn = theano.function([source_inputs, source_mask_inputs], samp_y, updates=l_samp.updates, on_unused_input='ignore') reward_inputs = T.matrix() # (batch, dec_len) reinforce_loss = (py * T.extra_ops.to_one_hot( target_outputs.flatten(), processor.target_vocab_size)).sum( axis=1) # (batch * dec_len) reinforce_loss = -(reinforce_loss * target_mask_inputs.flatten() * reward_inputs.flatten()).mean() reinforce_updates = lasagne.updates.adam( reinforce_loss, params, learning_rate=config.reinforce_learning_rate) self.reinforce_fn = theano.function([ source_inputs, target_inputs, target_outputs, source_mask_inputs, target_mask_inputs, reward_inputs ], None, updates=reinforce_updates, on_unused_input='ignore') print('params', lasagne.layers.count_params(self.l, trainable=True))
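# A small standalone sketch (toy shapes, plain Theano; assumes `py` holds
# log-probabilities, which is how the loss above is normally meant) of the masked
# likelihood used in `build`: multiplying the (batch * dec_len, vocab) output by a
# one-hot of the gold token and summing over the vocab axis selects the gold
# log-probability, and the flattened target mask zeroes out padding before averaging.
import numpy as np
import theano
import theano.tensor as T

log_py = T.matrix('log_py')     # (batch * dec_len, vocab)
targets = T.ivector('targets')  # (batch * dec_len,)
mask = T.vector('mask')         # (batch * dec_len,): 1 for real tokens, 0 for padding
vocab = 5

gold_ll = (log_py * T.extra_ops.to_one_hot(targets, vocab)).sum(axis=1)
loss = -(gold_ll * mask).mean()
f = theano.function([log_py, targets, mask], loss, allow_input_downcast=True)

probs = np.full((2, vocab), 0.2, dtype='float32')
print(f(np.log(probs), [1, 3], [1.0, 0.0]))   # ~= -log(0.2) / 2 ~= 0.805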
def main(num_epochs=500, mode="run", batchsize=96):
    # Debug
    #theano.config.profile=True
    #theano.config.optimizer_profile=True
    #theano.config.warn_float64='warn'

    # Loading all preprocessed data
    global Ws, bs
    Xtr, Ytr, Xva, Yva, imgMean_vals, Ws, bs = data_prep()

    # Sanity check: try to overfit a tiny (e.g. 40 instances) subset of the data
    if mode == "toy":
        batchsize = 10
        np.random.seed(11)  # make the toy subset reproducible
        idx = np.random.randint(0, Xtr.shape[0] / 10, batchsize * 4)
        Xtr = Xtr[idx, :, :, :]
        Ytr = Ytr[idx, :]

    """ COMPILING THEANO function """
    start_time = time.time()

    # Prepare Theano variables for inputs and targets
    input_var = T.ftensor4('inputs')
    target_var = T.imatrix('targets')

    # Center the input images
    imgMean = T.TensorType(dtype='float32',
                           broadcastable=(True, False, False, False))('imgMean')
    z = (input_var - imgMean)
    center_fn = theano.function([input_var, imgMean], z,
                                mode=NanGuardMode(nan_is_error=True,
                                                  inf_is_error=True,
                                                  big_is_error=True))

    print("\nbuilding model... ")
    net0 = build_model(input_var)

    print("\ncompiling functions... ")
    '''
    # Build loss function
    prediction = lasagne.layers.get_output(net0)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean(axis=0)

    # Create update expression for training
    # using RMSprop
    params = lasagne.layers.get_all_params(net0, trainable=True)
    updates = lasagne.updates.rmsprop(loss, params, learning_rate=0.01,
                                      rho=0.9, epsilon=1e-06)

    train_fn = theano.function([input_var, target_var], loss, updates=updates,
                               mode=NanGuardMode(nan_is_error=True,
                                                 inf_is_error=True,
                                                 big_is_error=True))
    '''

    ## Building loss evaluation for validation set
    va_prediction = lasagne.layers.get_output(net0, deterministic=True)
    va_loss = lasagne.objectives.categorical_crossentropy(va_prediction,
                                                          target_var)
    va_loss = va_loss.mean(axis=0)

    va_fn = theano.function(
        [input_var, target_var],
        #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
        va_loss)

    print("compilation finished in {:.2f}".format(time.time() - start_time))

    """ TRAINING - HAVEN'T SUBTRACTED THE IMAGE MEAN YET!!! """
    print("Starting training with batchsize of %d ..." % batchsize)

    for epoch in range(num_epochs):
        '''
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in iterate_minibatches(Xtr, Ytr, batchsize, shuffle=True):
            inputs = center_fn(inputs, imgMean_vals)
            train_err += train_fn(inputs, targets)
            train_batches += 1
        '''
        # And a full pass over the validation data:
        if mode != "toy":
            va_err = 0
            va_batches = 0
            for inputs, targets in iterate_minibatches(Xtr, Ytr, batchsize,
                                                       shuffle=True):
                inputs = center_fn(inputs, imgMean_vals)
                va_err += va_fn(inputs, targets)
                va_batches += 1

        # Then we print the results for this epoch:
        '''
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}\t{:d}".format(train_err / train_batches,
                                                       train_batches))
        '''
        if mode != "toy":
            print(" validation loss:\t\t{:.6f}".format(va_err / va_batches))

        # Save the model after every 5 epochs
        '''
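# A hedged, minimal sketch of the centering step above: declaring the image mean with
# broadcastable=(True, False, False, False) lets a single (1, C, H, W) mean be
# subtracted from a whole (N, C, H, W) batch inside a NanGuardMode-checked function.
# Shapes and variable names here are toy values, not the script's data.
import numpy as np
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

imgs = T.ftensor4('imgs')
img_mean = T.TensorType(dtype='float32',
                        broadcastable=(True, False, False, False))('img_mean')
center_fn = theano.function([imgs, img_mean], imgs - img_mean,
                            mode=NanGuardMode(nan_is_error=True,
                                              inf_is_error=True,
                                              big_is_error=True))

batch = np.random.rand(4, 3, 8, 8).astype('float32')
mean = batch.mean(axis=0, keepdims=True)       # shape (1, 3, 8, 8)
print(center_fn(batch, mean).mean())           # ~0 after centering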
def setup_train(self):
    # dimensions: (batch, time, 12)
    chord_types = T.btensor3()
    # dimensions: (batch, time)
    chord_roots = T.imatrix()
    # dimensions: (batch, time)
    relative_posns = [T.imatrix() for _ in self.encodings]
    # dimensions: (batch, time, output_data)
    encoded_melodies = [T.btensor3() for _ in self.encodings]
    # dimensions: (batch, time)
    correct_notes = T.imatrix()
    n_batch, n_time = chord_roots.shape

    def _build(det_dropout):
        all_activations = []
        for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
                self.encodings, self.enc_lstmstacks, encoded_melodies,
                relative_posns):
            activations = enc_lstmstack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=relative_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                cur_input=encoded_melody,
                deterministic_dropout=det_dropout)
            all_activations.append(activations)
        reduced_activations = functools.reduce((lambda x, y: x + y),
                                               all_activations)
        queue_loss, feat_strengths, feat_vects, queue_info = self.qman.process(
            reduced_activations, extra_info=True)
        features = QueueManager.queue_transform(feat_strengths, feat_vects)

        all_out_probs = []
        for encoding, dec_lstmstack, encoded_melody, relative_pos in zip(
                self.encodings, self.dec_lstmstacks, encoded_melodies,
                relative_posns):
            activations = dec_lstmstack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=relative_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                cur_feature=features,
                last_output=T.concatenate([
                    T.tile(encoding.initial_encoded_form(), (n_batch, 1, 1)),
                    encoded_melody[:, :-1, :]
                ], 1),
                deterministic_dropout=det_dropout)
            out_probs = encoding.decode_to_probs(activations, relative_pos,
                                                 self.bounds.lowbound,
                                                 self.bounds.highbound)
            all_out_probs.append(out_probs)
        reduced_out_probs = functools.reduce((lambda x, y: x * y),
                                             all_out_probs)
        normsum = T.sum(reduced_out_probs, 2, keepdims=True)
        normsum = T.maximum(normsum, constants.EPSILON)
        norm_out_probs = reduced_out_probs / normsum
        reconstruction_loss, reconstruction_info = Encoding.compute_loss(
            norm_out_probs, correct_notes, extra_info=True)
        queue_surrogate_loss_parts = self.qman.surrogate_loss(
            reconstruction_loss, queue_info)

        updates = []
        full_info = queue_info.copy()
        full_info.update(reconstruction_info)
        full_info["queue_loss"] = queue_loss
        full_info["reconstruction_loss"] = reconstruction_loss

        float_n_batch = T.cast(n_batch, 'float32')
        if self.loss_mode == "add":
            full_loss = queue_loss + reconstruction_loss
        elif self.loss_mode == "priority":
            curviness = np.array(self.loss_mode_params[0],
                                 np.float32) * float_n_batch
            # ln( e^x + e^y - 1 )
            # ln( C(e^x + e^y - 1) ) - ln(C)
            # ln( e^c(e^x + e^y - 1) ) - c
            # ln( e^(x+c) + e^(y+c) - e^c ) - c
            # ln( e^(x-c) + e^(y-c) - e^(-c) ) + c
            # Now let c = maximum(x,y), d = minimum(x,y). WLOG replace x=c, y=d
            # ln( e^(c-c) + e^(d-c) - e^(-c) ) + c
            # ln( 1 + e^(d-c) - e^(-c) ) + c
            x = reconstruction_loss / curviness
            y = queue_loss / curviness
            c = T.maximum(x, y)
            d = T.minimum(x, y)
            full_loss = (T.log(1 + T.exp(d - c) - T.exp(-c)) + c) * curviness
        elif self.loss_mode == "cutoff":
            cutoff_val = np.array(self.loss_mode_params[0], np.float32)
            full_loss = T.switch(
                reconstruction_loss < cutoff_val * float_n_batch,
                reconstruction_loss + queue_loss, reconstruction_loss)
        elif self.loss_mode == "trigger":
            trigger_val = np.array(self.loss_mode_params[0], np.float32)
            trigger_speed = np.array(1.0 / self.loss_mode_params[1], np.float32)
            trigger_is_on = theano.shared(np.array(0, np.int8))
            trigger_scale = theano.shared(np.array(0.0, np.float32))
            full_loss = reconstruction_loss + trigger_scale * queue_loss
            updates.append(
                (trigger_is_on,
                 T.or_(trigger_is_on,
                       reconstruction_loss < trigger_val * float_n_batch)))
            updates.append((trigger_scale,
                            T.switch(
                                trigger_is_on,
                                T.minimum(trigger_scale + trigger_speed,
                                          np.array(1.0, np.float32)),
                                np.array(0.0, np.float32))))
            full_info["trigger_scale"] = trigger_scale

        if queue_surrogate_loss_parts is not None:
            surrogate_loss, addtl_updates = queue_surrogate_loss_parts
            full_loss = full_loss + surrogate_loss
            updates.extend(addtl_updates)
            full_info["surrogate_loss"] = surrogate_loss

        return full_loss, full_info, updates

    train_loss, train_info, train_updates = _build(False)
    if self.train_decoder_only:
        params = list(
            itertools.chain(*(lstmstack.params
                              for lstmstack in self.dec_lstmstacks)))
    else:
        params = self.params
    adam_updates = Adam(train_loss, params, lr=self.learning_rate_var)

    eval_loss, eval_info, _ = _build(True)

    self.loss_info_keys = list(train_info.keys())

    self.update_fun = theano.function(
        inputs=[chord_types, chord_roots, correct_notes] + relative_posns +
        encoded_melodies,
        outputs=[train_loss] + list(train_info.values()),
        updates=train_updates + adam_updates,
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True,
                           inf_is_error=True,
                           big_is_error=True) if self.nanguard else None))

    self.eval_fun = theano.function(
        inputs=[chord_types, chord_roots, correct_notes] + relative_posns +
        encoded_melodies,
        outputs=[eval_loss] + list(eval_info.values()),
        allow_input_downcast=True,
        mode=(NanGuardMode(nan_is_error=True,
                           inf_is_error=True,
                           big_is_error=True) if self.nanguard else None))
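# A quick numerical check (plain NumPy; not part of the model) of the "priority" loss
# identity derived in the comments above:
#   log(e^x + e^y - 1) = log(1 + e^(d - c) - e^(-c)) + c,  c = max(x, y), d = min(x, y)
# The right-hand side stays finite when x or y is large, which is why setup_train
# divides both losses by `curviness` first and multiplies the result back afterwards.
import numpy as np


def priority_combine(x, y):
    c, d = max(x, y), min(x, y)
    return np.log(1.0 + np.exp(d - c) - np.exp(-c)) + c


x, y = 700.0, 3.0                 # naive form overflows: np.exp(700) -> inf
print(priority_combine(x, y))     # ~700.0, finite
x, y = 2.0, 1.5                   # small values: agrees with the naive form
print(np.isclose(priority_combine(x, y), np.log(np.exp(x) + np.exp(y) - 1.0)))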