def rescore_model(source_file, target_file, output_file, scorer_settings, options):

    trng = RandomStreams(1234)

    def _score(pairs, alignweights=False):
        # score each sentence pair with every model in the ensemble
        scores = []
        alignments = []
        for i, model in enumerate(scorer_settings.models):
            f_log_probs = load_scorer(model, options[i],
                                      alignweights=alignweights)
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=scorer_settings.normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)
        return scores, alignments

    pairs = TextIterator(source_file.name,
                         target_file.name,
                         options[0]['dictionaries'][:-1],
                         options[0]['dictionaries'][-1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=scorer_settings.b,
                         maxlen=float('inf'),
                         use_factor=(options[0]['factors'] > 1),
                         sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to
    # resort after

    scores, alignments = _score(pairs, scorer_settings.alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if scorer_settings.verbose:
            output_file.write('{0} '.format(line.strip()))
        output_file.write('{0}\n'.format(score_str))

    # optionally save attention weights
    if scorer_settings.alignweights:
        temp_name = output_file.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                if type(line) == list:
                    for l in line:
                        align_OUT.write(l + "\n")
                else:
                    align_OUT.write(line + "\n")
            # combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            output_file.name, align_OUT)
        attinp_h2, attgate_h2 = att_to_h2.proj(w_t)
        attinp_h3, attgate_h3 = att_to_h3.proj(w_t)
        h2_t = cell2.step(xinp_h2_t + h1inp_h2 + attinp_h2,
                          xgate_h2_t + h1gate_h2 + attgate_h2, h2_tm1)
        h2inp_h3, h2gate_h3 = h2_to_h3.proj(h2_t)
        h3_t = cell3.step(xinp_h3_t + h1inp_h3 + h2inp_h3 + attinp_h3,
                          xgate_h3_t + h1gate_h3 + h2gate_h3 + attgate_h3,
                          h3_tm1)
        return h1_t, h2_t, h3_t, k_t, w_t

    init_x = as_shared(np_zeros((minibatch_size, n_out)))
    srng = RandomStreams(1999)

    # Used to calculate the stopping heuristic from section 5.3
    u_max = 0. * tensor.arange(c_sym.shape[0]) + c_sym.shape[0]
    u_max = u_max.dimshuffle('x', 'x', 0)
    u_max = tensor.cast(u_max, theano.config.floatX)

    def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
        xinp_h1_t, xgate_h1_t = inp_to_h1.proj(x_tm1)
        xinp_h2_t, xgate_h2_t = inp_to_h2.proj(x_tm1)
        xinp_h3_t, xgate_h3_t = inp_to_h3.proj(x_tm1)
        attinp_h1, attgate_h1 = att_to_h1.proj(w_tm1)
        h1_t = cell1.step(xinp_h1_t + attinp_h1, xgate_h1_t + attgate_h1,
                          h1_tm1)
def translate_model(queue, rqueue, pid, models, options, k, normalize,
                    verbose, nbest, return_alignment, suppress_unk):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        params = numpy.load(model)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment = gen_sample(
            fs_init, fs_next,
            numpy.array(seq).T.reshape([len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
    def __init__(self, sigma):
        super(GaussainNoise, self).__init__()
        self.sigma = sigma
        self.srng = RandomStreams(seed=np.random.randint(10e6))
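A minimal sketch (not in the source) of how such a noise layer's forward pass typically consumes the stream: additive zero-mean Gaussian noise applied only at training time. The method name `get_output` and the `train` flag are assumptions.

    def get_output(self, X, train=False):
        # hypothetical forward pass: perturb activations only while training
        if train and self.sigma > 0:
            X = X + self.srng.normal(size=X.shape, avg=0.0, std=self.sigma,
                                     dtype=theano.config.floatX)
        return X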
def translate_model(queue, rqueue, pid, models, options, k,
                    normalization_alpha, verbose, nbest, return_alignment,
                    suppress_unk, return_hyp_graph, deviceid):

    # if the --device-list argument is set
    if deviceid != '':
        import os
        theano_flags = os.environ['THEANO_FLAGS'].split(',')
        exist = False
        for i in xrange(len(theano_flags)):
            if theano_flags[i].strip().startswith('device'):
                exist = True
                theano_flags[i] = '%s=%s' % ('device', deviceid)
                break
        if exist == False:
            theano_flags.append('%s=%s' % ('device', deviceid))
        os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=return_alignment)
        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = gen_sample(
            fs_init, fs_next,
            numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
            trng=trng, k=k, maxlen=200,
            stochastic=False, argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk,
            return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalization_alpha:
            adjusted_lengths = numpy.array(
                [len(s) ** normalization_alpha for s in sample])
            score = score / adjusted_lengths
        if nbest:
            return sample, score, word_probs, alignment, hyp_graph
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], \
                alignment[sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
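Note the ordering above: THEANO_FLAGS is edited before any theano import runs, which is why the imports live inside the worker function. A minimal launch sketch (assumed; the device names and normalization value are illustrative, one worker process per device):

    from multiprocessing import Process, Queue
    queue, rqueue = Queue(), Queue()
    for pid, dev in enumerate(['cuda0', 'cuda1']):  # hypothetical device list
        Process(target=translate_model,
                args=(queue, rqueue, pid, models, options, k, 0.6,
                      False, False, False, False, False, dev)).start()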
    def __init__(self, vocabulary, architecture, mode=Mode.minibatch,
                 profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type mode: Network.Mode
        :param mode: constructs a variation of the network; if not in
                     mini-batch mode, creates a network that produces the
                     probability distribution for the next word (instead of
                     target probabilities for a mini-batch)

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.mode = mode

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)
        ]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.word_input = tensor.matrix('network/word_input', dtype='int64')
        self.class_input = tensor.matrix('network/class_input', dtype='int64')
        if self.mode.is_minibatch():
            self.word_input.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_words())
            self.class_input.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_classes())
        else:
            self.word_input.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_words())
            self.class_input.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_classes())

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step
        # at a time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            input = NetworkInput(input_options, self)
            self.layers[input.name] = input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a
        # time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # When the mode is target_words, this input variable specifies the
        # words whose probabilities will be computed.
        self.target_class_ids = tensor.matrix('network/target_class_ids',
                                              dtype='int64')
        self.target_class_ids.tag.test_value = test_value(
            size=(1, 16), max_value=vocabulary.num_classes())

        # Create initial parameter values.
        logging.debug("Initializing parameters.")
        self.param_init_values = OrderedDict()
        num_params = 0
        for layer in self.layers.values():
            for name, value in layer.param_init_values.items():
                logging.debug("- %s size=%d", name, value.size)
                num_params += value.size
            self.param_init_values.update(layer.param_init_values)
        logging.debug("Total number of parameters: %d", num_params)

        # Create Theano shared variables.
        self.params = {name: theano.shared(value, name)
                       for name, value in self.param_init_values.items()}
        for layer in self.layers.values():
            layer.set_params(self.params)

        # mask is used to mask out the rest of the input matrix, when a
        # sequence is shorter than the maximum sequence length. The mask is
        # kept as int8 data type, which is how Theano stores booleans.
        if self.mode.is_minibatch():
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(size=(100, 16),
                                                  max_value=True)
        else:
            self.mask = tensor.ones(self.word_input.shape, dtype='int8')

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        for layer in self.layers.values():
            layer.create_structure()
    def __init__(
        self,
        nvisible,
        nhidden,
        hbias=None,
        vbias=None,
        W_real=None,
        W_imag=None,
        input=None,
        np_rng=None,
        theano_rng=None,
    ):
        """
        RBM constructor.

        :param nvisible: number of visible nodes
        :param nhidden: number of hidden nodes
        :param hbias: "magnetic" field in the hidden layer; if the value is
                      None, it is initialized with the random number
                      generator, otherwise the given value is used.
        :param vbias: "magnetic" field in the visible layer; if the value is
                      None, it is initialized with the random number
                      generator, otherwise the given value is used.
        :param W_real: real part of the weight matrix connecting the visible
                       layer and the hidden layer
        :param W_imag: imaginary part of the weight matrix connecting the
                       visible layer and the hidden layer
        :param input: the initial sample for the visible layer (or spin
                      configuration); if the value is None, it is initialized
                      with the random number generator
        :param np_rng: random number generator seed
        :param theano_rng: random number generator seed of Theano
        """

        self.nvisible = nvisible
        self.nhidden = nhidden

        if np_rng is None:
            # create a number generator
            np_rng = np.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(np_rng.randint(2**30))

        if W_real is None:
            # W_real is initialized with `initial_Wreal`, which is uniformly
            # sampled from -2*sqrt(6./(nvisible+nhidden)) and
            # 2*sqrt(6./(nhidden+nvisible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so
            # that the code is runnable on GPU
            initial_Wreal = np.asarray(np_rng.uniform(
                low=-2 * np.sqrt(6. / (nhidden + nvisible)),
                high=2 * np.sqrt(6. / (nhidden + nvisible)),
                size=(nvisible, nhidden)),
                dtype=theano.config.floatX)
            # theano shared variable for the real part of the weights
            W_real = theano.shared(value=initial_Wreal, name='Wreal',
                                   borrow=True)

        if W_imag is None:
            # W_imag is initialized with `initial_Wimag`, sampled the same
            # way as the real part
            initial_Wimag = np.asarray(np_rng.uniform(
                low=-2 * np.sqrt(6. / (nhidden + nvisible)),
                high=2 * np.sqrt(6. / (nhidden + nvisible)),
                size=(nvisible, nhidden)),
                dtype=theano.config.floatX)
            # theano shared variable for the imaginary part of the weights
            W_imag = theano.shared(value=initial_Wimag, name='Wimag',
                                   borrow=True)

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=np.zeros(nhidden,
                                                 dtype=theano.config.floatX),
                                  name='hbias', borrow=True)

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=np.zeros(nvisible,
                                                 dtype=theano.config.floatX),
                                  name='vbias', borrow=True)

        # initialize input layer for standalone RBM or layer0 of DBN
        # self.input = input
        # if not input:
        #     self.input = T.matrix('input')
        # self.input = input

        self.W_real = W_real
        self.W_imag = W_imag
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        self.input = input
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        # self.params = [self.W_real, self.W_imag, self.hbias, self.vbias]
        self.params = [self.W_real, self.hbias, self.vbias]
    def __init__(self, input=None, n_visible=784, n_hidden=500,
                 W=None, hbias=None, vbias=None,
                 numpy_rng=None, theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of a MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so
            # that the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)),
                dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(
                n_hidden, dtype=theano.config.floatX),
                name='hbias', borrow=True)

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(
                n_visible, dtype=theano.config.floatX),
                name='vbias', borrow=True)

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input
        if not input:
            self.input = T.matrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
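The `theano_rng` created here is what makes the Gibbs steps stochastic. For reference, a minimal sketch of the usual companion methods from the standard Theano RBM tutorial (not part of the snippet above):

    def propup(self, vis):
        # mean-field activation of the hidden units given the visibles
        pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias
        return [pre_sigmoid_activation,
                T.nnet.sigmoid(pre_sigmoid_activation)]

    def sample_h_given_v(self, v0_sample):
        # theano_rng.binomial is what consumes the RandomStreams from __init__
        pre_sigmoid_h1, h1_mean = self.propup(v0_sample)
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1,
                                             p=h1_mean,
                                             dtype=theano.config.floatX)
        return [pre_sigmoid_h1, h1_mean, h1_sample]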
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
                `options` is expected to contain the following keys:
                    `cbs` -> int
                        Number of samples to consider at a time when computing
                        some property of the model
                    `gbs` -> int
                        Number of samples over which to compute the gradients
                    `mbs` -> int
                        Number of samples over which to compute the metric
                    `ebs` -> int
                        Number of samples over which to evaluate the training
                        error
                    `mreg` -> float
                        Regularization added to the metric
                    `mrtol` -> float
                        Relative tolerance for inverting the metric
                    `miters` -> int
                        Number of iterations
                    `seed` -> int
                        Random number generator seed
                    `profile` -> bool
                        Flag, if profiling should be on or not
                    `verbose` -> int
                        Verbosity level
                    `lr` -> float
                        Learning rate
            channel: jobman channel or None
            data: dictionary-like object returned by numpy.load containing
                the data
            model: model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']],
                               name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store Euclidean gradients
            self.gs = [TT._shared(numpy.zeros(shp,
                                              dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [TT._shared(numpy.zeros(shp,
                                              dtype=theano.config.floatX))
                       for shp in model.params_shape]
        else:
            # Store Euclidean gradients
            self.gs = [theano.shared(numpy.zeros(shp,
                                                 dtype=theano.config.floatX))
                       for shp in model.params_shape]
            # Store Riemannian gradients
            self.rs = [theano.shared(numpy.zeros(shp,
                                                 dtype=theano.config.floatX))
                       for shp in model.params_shape]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print 'Constructing grad function'
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + nw_gs

        ig = [TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
              for shp in model.params_shape]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x, y[gbdx * options['gbs']:
                               (gbdx + 1) * options['gbs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'Compiling grad function'
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']:
                                 (idx + 1) * options['cbs']]
                               for x in loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [
                            x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(
                            nw_out, loc_params,
                            TT.Rop(nw_out, loc_params, loc_args) / factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:],
                                                   model.params)]
                    return [gv_args[0] + const(1)] + Gvs

                    # NOTE: the block below is unreachable (it follows the
                    # return above); it is an alternative Gauss-Newton
                    # formulation left over in the original source.
                    nw_cost, nw_preactiv_out = safe_clone(
                        [model.train_cost, model.preactiv_out], replace)
                    nw_gvs = TT.Lop(
                        nw_preactiv_out, model.params,
                        TT.Rop(TT.grad(nw_cost, nw_preactiv_out),
                               model.params, args))
                    Gvs = [ogv + ngv
                           for (ogv, ngv) in zip(gv_args[1:], nw_gvs)]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

            def compute_Gv(*args):
                cgv = [theano.shared(numpy.zeros(shp,
                                                 dtype=theano.config.floatX),
                                     name='cgv%d' % idx)
                       for idx, shp in enumerate(model.params_shape)]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [TT.alloc(const(0), 1, *shp)
                      for shp in model.params_shape]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']:
                                 (idx + 1) * options['cbs']]
                               for x in loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])]
                        loc_args = [
                            x for x, y in zip(cgv, model.params)
                            if y in theano.gof.graph.inputs([nw_out])]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(
                            nw_out, loc_params,
                            TT.Rop(nw_out, loc_params, loc_args) / factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [ogv + final_results[param]
                           for (ogv, param) in zip(gv_args[1:],
                                                   model.params)]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [TT.as_tensor_variable(x[0]) / const(n_steps)
                             for x in rvals[1:]]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([], final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))
                return fake_op(*args), {}

        print 'Constructing riemannian gradient function'
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        rvals = minres.minres(compute_Gv,
                              [x / norm_grads for x in self.gs],
                              rtol=options['mrtol'],
                              shift=-options['mreg'],
                              maxit=options['miters'],
                              mode=mode,
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:
                           (rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs[:1], shared_data[:1])]
        print 'Compiling riemannian gradient function'
        self.compute_riemannian_gradients = theano.function(
            [rbdx],
            [flag, niters, rel_residual, rel_Aresidual, Anorm, Acond,
             xnorm, Axnorm, norm_grads, norm_ord0],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print 'constructing evaluation function'
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params,
                               nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])
        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x, y[ebdx * options['ebs']:
                               (ebdx + 1) * options['ebs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print 'compiling evaluation function'
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       mode=gpu_mode,
                                       profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             mode=mode,
                                             profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']:(idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [TT.constant(numpy.float32([0])),
                  TT.constant(numpy.float32([0]))]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
def build_model(tparams, options):
    """
    Builds the entire computational graph used for training
    """
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x1 = tensor.matrix('x1', dtype='int64')
    x1_mask = tensor.matrix('x1_mask', dtype='float32')
    x1_left_mask = tensor.tensor3('x1_left_mask', dtype='float32')
    x1_right_mask = tensor.tensor3('x1_right_mask', dtype='float32')
    x2 = tensor.matrix('x2', dtype='int64')
    x2_mask = tensor.matrix('x2_mask', dtype='float32')
    x2_left_mask = tensor.tensor3('x2_left_mask', dtype='float32')
    x2_right_mask = tensor.tensor3('x2_right_mask', dtype='float32')
    y = tensor.vector('y', dtype='int64')

    xr1_mask = x1_mask[::-1]
    xr2_mask = x2_mask[::-1]

    n_timesteps_x1 = x1.shape[0]
    n_timesteps_x2 = x2.shape[0]
    n_samples = x1.shape[1]

    # word embedding
    emb1 = tparams['Wemb'][x1.flatten()].reshape(
        [n_timesteps_x1, n_samples, options['dim_word']])
    if options['use_dropout']:
        emb1 = dropout_layer(emb1, use_noise, trng)
    emb2 = tparams['Wemb'][x2.flatten()].reshape(
        [n_timesteps_x2, n_samples, options['dim_word']])
    if options['use_dropout']:
        emb2 = dropout_layer(emb2, use_noise, trng)

    inputs1 = (emb1, x1_mask, x1_left_mask, x1_right_mask)
    inputs2 = (emb2, x2_mask, x2_left_mask, x2_right_mask)
    proj1 = get_layer(options['encoder'])[1](tparams, inputs1, options,
                                             prefix='encoder', mask=x1_mask)
    proj2 = get_layer(options['encoder'])[1](tparams, inputs2, options,
                                             prefix='encoder', mask=x2_mask)

    ctx1 = proj1[0][-1, :, :, :].dimshuffle(1, 0, 2)
    ctx2 = proj2[0][-1, :, :, :].dimshuffle(1, 0, 2)
    # ctx1: #step1 x #sample x #dimctx
    # ctx2: #step2 x #sample x #dimctx
    ctx1 = ctx1 * x1_mask[:, :, None]
    ctx2 = ctx2 * x2_mask[:, :, None]

    # weight_matrix: #sample x #step1 x #step2
    weight_matrix = tensor.batched_dot(ctx1.dimshuffle(1, 0, 2),
                                       ctx2.dimshuffle(1, 2, 0))
    weight_matrix_1 = tensor.exp(
        weight_matrix -
        weight_matrix.max(1, keepdims=True)).dimshuffle(1, 2, 0)
    weight_matrix_2 = tensor.exp(
        weight_matrix -
        weight_matrix.max(2, keepdims=True)).dimshuffle(1, 2, 0)
    # weight_matrix_1: #step1 x #step2 x #sample
    weight_matrix_1 = weight_matrix_1 * x1_mask[:, None, :]
    weight_matrix_2 = weight_matrix_2 * x2_mask[None, :, :]

    alpha = weight_matrix_1 / weight_matrix_1.sum(0, keepdims=True)
    beta = weight_matrix_2 / weight_matrix_2.sum(1, keepdims=True)

    # ctx1: #step1 x #sample x #dimctx
    # ctx2: #step2 x #sample x #dimctx
    ctx2_ = (ctx1.dimshuffle(0, 'x', 1, 2) *
             alpha.dimshuffle(0, 1, 2, 'x')).sum(0)
    ctx1_ = (ctx2.dimshuffle('x', 0, 1, 2) *
             beta.dimshuffle(0, 1, 2, 'x')).sum(1)

    inp1 = concatenate([ctx1, ctx1_, ctx1 * ctx1_, ctx1 - ctx1_], axis=2)
    inp2 = concatenate([ctx2, ctx2_, ctx2 * ctx2_, ctx2 - ctx2_], axis=2)
    inp1 = get_layer('ff')[1](tparams, inp1, options, prefix='projection',
                              activ='relu')
    inp2 = get_layer('ff')[1](tparams, inp2, options, prefix='projection',
                              activ='relu')

    inputs3 = (inp1, x1_mask, x1_left_mask, x1_right_mask)
    inputs4 = (inp2, x2_mask, x2_left_mask, x2_right_mask)
    proj3 = get_layer(options['decoder'])[1](tparams, inputs3, options,
                                             prefix='decoder', mask=x1_mask)
    proj4 = get_layer(options['decoder'])[1](tparams, inputs4, options,
                                             prefix='decoder', mask=x2_mask)

    logit0 = concatenate([proj3[0][-1, :, -1, :], proj4[0][-1, :, -1, :]],
                         axis=1)
    logit1 = (proj3[0][-1, :, :, :] *
              x1_mask.dimshuffle(1, 0, 'x')).sum(1) / x1_mask.sum(0)[:, None]
    logit2 = (proj3[0][-1, :, :, :] * x1_mask.dimshuffle(1, 0, 'x')).max(1)
    logit3 = (proj4[0][-1, :, :, :] *
              x2_mask.dimshuffle(1, 0, 'x')).sum(1) / x2_mask.sum(0)[:, None]
    logit4 = (proj4[0][-1, :, :, :] * x2_mask.dimshuffle(1, 0, 'x')).max(1)
    logit = concatenate([logit0, logit1, logit2, logit3, logit4], axis=1)

    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_layer_1',
                               activ='tanh')
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_layer_output', activ='linear')

    probs = tensor.nnet.softmax(logit)
    cost = tensor.nnet.categorical_crossentropy(probs, y)

    f_pred = theano.function(
        [x1, x1_mask, x1_left_mask, x1_right_mask,
         x2, x2_mask, x2_left_mask, x2_right_mask],
        probs.argmax(axis=1), name='f_pred', profile=profile)
    f_prods = theano.function(
        [x1, x1_mask, x1_left_mask, x1_right_mask,
         x2, x2_mask, x2_left_mask, x2_right_mask],
        probs, name='f_prods', profile=profile)

    return (trng, use_noise, x1, x1_mask, x1_left_mask, x1_right_mask,
            x2, x2_mask, x2_left_mask, x2_right_mask, y, opt_ret, cost,
            f_pred, f_prods)
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # T x batch_size
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=np.zeros((1, options['n_emb']),
                                              dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']],
                                 axis=0)
    input_emb = w_emb_extend[input_idx]

    # get the transformed image feature
    h_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    c_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))

    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    h_from_lstm, c_encode = lstm_layer(shared_params, input_emb, input_mask,
                                       h_0, c_0, options, prefix='sent_lstm')
    # pick the last one as encoder

    Y = fflayer(shared_params, image_feat, options, prefix='image_mlp',
                act_func=options.get('image_mlp_act', 'tanh'))

    r_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    r = wbw_attention_layer(shared_params, Y, h_from_lstm, input_mask, r_0,
                            options, return_final=True)

    h_star = T.tanh(T.dot(r, shared_params['W_p_w']) +
                    T.dot(h_from_lstm[-1], shared_params['W_x_w']))

    combined_hidden = fflayer(shared_params, h_star, options,
                              prefix='scale_to_softmax', act_func='linear')

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu
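A short usage sketch (assumed, not from the source) of how the symbolic outputs returned above are typically compiled and driven; the `dropout` shared variable acts as the train/eval switch inside `dropout_layer`:

    image_feat, input_idx, input_mask, label, dropout, cost, accu = \
        build_model(shared_params, options)
    f_eval = theano.function([image_feat, input_idx, input_mask, label],
                             [cost, accu], on_unused_input='warn')
    dropout.set_value(numpy.float32(1.))  # enable dropout for training passes
    dropout.set_value(numpy.float32(0.))  # disable it for evaluation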
def translate_model(queue, rqueue, pid, model, options, k, normalize, kp,
                    sigma):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]
    f_log_probs = theano.function(inps, cost)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(idx, seq):
        all_samples = []
        all_scores = []
        for kidx in xrange(kp):
            if kidx == 0:
                ss = -1.
            else:
                ss = sigma
            # sample given an input sequence and obtain scores
            sample, score = gen_sample(tparams, f_init, f_next,
                                       numpy.array(seq).reshape([len(seq),
                                                                 1]),
                                       options, trng=trng, k=k, maxlen=200,
                                       stochastic=False, argmax=False,
                                       sigma=ss)

            # normalize scores according to sequence lengths
            if normalize:
                lengths = numpy.array([len(s) for s in sample])
                score = score / lengths
            # print idx, score
            sidx = numpy.argmin(score)
            all_samples.append(sample[sidx])
            all_scores.append(score[sidx])

        source_list = [seq] * kp
        x, x_mask, y, y_mask = prepare_data(source_list, all_samples,
                                            maxlen=None)
        all_scores = f_log_probs(x, x_mask, y, y_mask)
        if normalize:
            lengths = numpy.array([len(s) for s in all_samples])
            all_scores = all_scores / lengths
        print idx, all_scores
        sidx = numpy.argmin(all_scores)
        return all_samples[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print pid, '-', idx
        seq = _translate(idx, x)

        rqueue.put((idx, seq))

    return
def evaluate_gpu(gru, test_data, items=None, session_key='SessionId',
                 item_key='ItemId', time_key='Time', cut_off=20,
                 batch_size=100, mode='conservative', output_path=None):
    if gru.error_during_train:
        raise Exception
    print('Measuring Recall@{} and MRR@{}'.format(cut_off, cut_off))
    srng = RandomStreams()
    X = T.ivector()
    Y = T.ivector()
    M = T.iscalar()
    C = []
    yhat, H, updatesH = gru.symbolic_predict(X, Y, M, items, batch_size)
    if mode == 'tiebreaking':
        yhat += srng.uniform(size=yhat.shape) * 1e-10
    if items is None:
        targets = T.diag(yhat.T[Y])
        others = yhat.T
    else:
        targets = T.diag(yhat.T[:M])
        others = yhat.T[M:]
    if mode == 'standard':
        ranks = (others > targets).sum(axis=0) + 1
    elif mode == 'conservative':
        ranks = (others >= targets).sum(axis=0)
    elif mode == 'median':
        ranks = (others > targets).sum(axis=0) + 0.5 * (
            (others == targets).sum(axis=0) - 1) + 1
    elif mode == 'tiebreaking':
        ranks = (others > targets).sum(axis=0) + 1
    else:
        raise NotImplementedError
    REC = (ranks <= cut_off).sum()
    MRR = ((ranks <= cut_off) / ranks).sum()
    evaluate = theano.function(inputs=[X, Y, M] + C,
                               outputs=[REC, MRR, yhat],
                               updates=updatesH,
                               allow_input_downcast=True,
                               on_unused_input='ignore')
    test_data = pd.merge(test_data,
                         pd.DataFrame({'ItemIdx': gru.itemidmap.values,
                                       item_key: gru.itemidmap.index}),
                         on=item_key, how='inner')
    test_data.sort_values([session_key, time_key, item_key], inplace=True)
    test_data_items = test_data.ItemIdx.values
    if items is not None:
        item_idxs = gru.itemidmap[items]
    recall, mrr, n = 0, 0, 0
    iters = np.arange(batch_size)
    maxiter = iters.max()
    session_lengths = test_data.groupby(session_key).size()
    items_session_lengths = np.array([
        session_length for session_length in session_lengths
        for _ in range(session_length)
    ])
    items_session_ids = np.array([
        i for i, session_length in enumerate(session_lengths)
        for _ in range(session_length)
    ])
    items_pos = np.array([
        i for session_length in session_lengths
        for i in range(session_length)
    ])
    offset_sessions = np.zeros(test_data[session_key].nunique() + 1,
                               dtype=np.int32)
    offset_sessions[1:] = test_data.groupby(session_key).size().cumsum()
    start = offset_sessions[iters]
    end = offset_sessions[iters + 1]
    finished = False
    cidxs = []
    lim_preds = 500
    headers = ['seq_id', 'length', 'event_id', 'event_in', 'event_out']
    headers += ['top_pred_idx_' + str(i) for i in range(lim_preds)]
    headers += ['top_pred_' + str(i) for i in range(lim_preds)]
    header = ';'.join(headers)
    lines = [header]
    while not finished:
        minlen = (end - start).min()
        out_idx = test_data_items[start]
        for i in range(minlen - 1):
            lengths = items_session_lengths[start + i]
            positions = items_pos[start + i]
            session_ids = items_session_ids[start + i]
            in_idx = out_idx
            out_idx = test_data_items[start + i + 1]
            if items is not None:
                y = np.hstack([out_idx, item_idxs])
            else:
                y = out_idx
            rec, m, preds = evaluate(in_idx, y, len(iters), *cidxs)
            for seq_id, in_item, out_item, pred, length, position in zip(
                    session_ids, in_idx, y, preds, lengths, positions):
                top_k_idx = pred.argsort()[-lim_preds:][::-1]
                top_k_values = pred[top_k_idx]
                lines.append(';'.join(
                    [str(int(seq_id)),
                     str(int(length - 1)),
                     str(int(position)),
                     str(int(in_item)),
                     str(int(out_item))] +
                    [str(int(idx)) for idx in top_k_idx] +
                    [str(pred) for pred in top_k_values]))
            recall += rec
            mrr += m
            n += len(iters)
        start = start + minlen - 1
        finished_mask = (end - start <= 1)
        n_finished = finished_mask.sum()
        iters[finished_mask] = maxiter + np.arange(1, n_finished + 1)
        maxiter += n_finished
        valid_mask = (iters < len(offset_sessions) - 1)
        n_valid = valid_mask.sum()
        if n_valid == 0:
            finished = True
            break
        mask = finished_mask & valid_mask
        sessions = iters[mask]
        start[mask] = offset_sessions[sessions]
        end[mask] = offset_sessions[sessions + 1]
        iters = iters[valid_mask]
        start = start[valid_mask]
        end = end[valid_mask]
        if valid_mask.any():
            for i in range(len(H)):
                tmp = H[i].get_value(borrow=True)
                tmp[mask] = 0
                tmp = tmp[valid_mask]
                H[i].set_value(tmp, borrow=True)
    if output_path is not None:
        with open(output_path, 'w') as out_file:
            out_file.write("\n".join(lines))
    return recall / n, mrr / n
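A toy numpy check (illustration only) of how the tie-handling modes above differ; as in the `items is None` branch, `others` includes the target's own score:

    import numpy as np
    target = 0.5
    others = np.array([0.9, target, 0.5, 0.1])  # the target itself plus one tie
    standard = (others > target).sum() + 1               # 2: ties resolved in the target's favour
    conservative = (others >= target).sum()              # 3: ties resolved against the target
    median = (others > target).sum() + 0.5 * ((others == target).sum() - 1) + 1  # 2.5
    # 'tiebreaking' instead perturbs yhat with tiny uniform noise before ranking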
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)

    datasets = [source_file.name, target_file.name]
    dictionaries = [options[0]['dictionaries'][0],
                    options[0]['dictionaries'][-1]]
    n_words = [options[0]['n_words'][0], options[0]['n_words'][-1]]

    def _score(pairs, alignweights=False):
        # score each sentence pair with every model in the ensemble
        scores = []
        alignments = []
        for i, model in enumerate(models):
            f_log_probs = load_scorer(model, options[i],
                                      alignweights=alignweights)
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)
        return scores, alignments

    pairs = TextIterator(datasets,
                         dictionaries,
                         n_words_dicts=n_words,
                         batch_size=b,
                         maxlen=float('inf'),
                         factors=options[0]['factors'],
                         outputs=1,
                         sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to
    # resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    ### optional save weights mode: writing out the alignments.
    if alignweights:
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            # was `all_alignments`, which is undefined in this scope
            for line in alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
    def __init__(self):
        theano.config.floatX = "float32"
        self.srng = RandomStreams()
        self.X = T.ftensor4()
        self.Y = T.fmatrix()
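A minimal sketch (assumed; modeled on common Theano tutorials, not taken from this source) of the kind of dropout helper such a class typically builds on its `srng`:

    def dropout(self, X, p=0.5):
        # inverted dropout: scale at train time so no rescaling is needed
        # at test time
        if p > 0:
            retain_prob = 1 - p
            X *= self.srng.binomial(X.shape, p=retain_prob,
                                    dtype=theano.config.floatX)
            X /= retain_prob
        return X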
    def __init__(self, t=0.1, eps=1e-20):
        assert t != 0
        self.temperature = t
        self.eps = eps
        self._srng = RandomStreams(get_rng().randint(1, 2147462579))
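`temperature` and `eps` are the standard ingredients of Gumbel-softmax sampling; a minimal sketch of how they are typically used (the method name and `logits` argument are assumptions):

    def sample(self, logits):
        # Gumbel(0, 1) noise via the inverse-CDF trick; eps guards the logs
        u = self._srng.uniform(logits.shape, low=0.0, high=1.0)
        gumbel = -T.log(-T.log(u + self.eps) + self.eps)
        # lower temperature brings samples closer to one-hot
        return T.nnet.softmax((logits + gumbel) / self.temperature)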
import numpy as np
import theano
import theano.tensor as T
import ipdb
import cPickle

from keras.preprocessing import sequence
from keras import activations, initializations
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense
from keras.utils.theano_utils import shared_scalar, shared_zeros, sharedX, alloc_zeros_matrix
from theano import config
# assumed import: RandomStreams was used below without being imported
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

trng = RandomStreams(1234)


def dropout(X):
    # `train` is a module-level flag; inverted dropout with p=0.5
    if train:
        X *= trng.binomial(X.shape, p=0.5, dtype=theano.config.floatX)
        X /= 0.5
    return X


def ortho_weight(ndim):
    W = np.random.randn(ndim, ndim)
    u, _, _ = np.linalg.svd(W)
    return u.astype('float32')

############# Building Models ################
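A brief usage sketch (assumed): `train` is expected to be set before the graph is built, so the same helper yields a stochastic graph for training and an identity for testing:

    train = True  # build the training graph; rebuild with False for testing
    emb = T.ftensor3('emb')
    f = theano.function([emb], dropout(emb))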
def get_gate_weights(model_name, dictionary, dictionary_target, source_file,
                     args, k=5, normalize=False, chr_level=False):
    options = load_options(model_name)

    word_dict, word_idict, word_idict_trg = load_translate_data(
        dictionary, dictionary_target, source_file,
        batch_mode=False, chr_level=chr_level, load_input=False)

    inputs = []
    lines = []
    print 'Loading input...',
    with open(source_file, 'r') as f:
        for idx, line in enumerate(f):
            if idx >= args.test_number:
                break
            lines.append(line)
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = [word_dict[w] if w in word_dict else 1 for w in words]
            x = [ii if ii < options['n_words_src'] else 1 for ii in x]
            x.append(0)
            inputs.append(x)
    print 'Done'

    print 'Building model...',
    model, _ = build_and_init_model(model_name, options, build=False)
    print 'Done'

    if args.encoder:
        return get_encoder_gate_weights(args, model, options, inputs, lines)

    print 'Building sampler...'
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))

    f_init, f_next = model.build_sampler(
        trng=trng,
        use_noise=use_noise,
        batch_mode=False,
        get_gates=True,
    )
    build_result = model, f_init, f_next, trng
    print 'Done'

    results = []

    for i, src_seq in enumerate(inputs):
        results.append({
            'index': i,
            'input': lines[i].strip(),
            'dim': options['dim'],
            'encoder': False,
        })

        tgt_seq, kw_ret = translate_sentence(src_seq, build_result, k,
                                             normalize)

        results[-1]['output'] = seq2words(tgt_seq, word_idict_trg)
        results[-1]['kw_ret'] = kw_ret
        results[-1]['n_layers'] = len(kw_ret['input_gates_list'][0])

        print 'Input:', lines[i]
        print 'Output:', results[-1]['output']
        print '=============================='

    return results
import numpy

try:
    import pylab
except ImportError:
    print("pylab isn't available. If you use its functionality, it will "
          "crash.")
    print("It can be installed with 'pip install -q Pillow'")

from midi.utils import midiread, midiwrite
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# Don't use a python long as this doesn't work on 32-bit computers.
numpy.random.seed(0xbeef)
rng = RandomStreams(seed=numpy.random.randint(1 << 30))
theano.config.warn.subtensor_merge_bug = False


def build_rbm(v, W, bv, bh, k):
    '''Construct a k-step Gibbs chain starting at v for an RBM.

    v : Theano vector or matrix
        If a matrix, multiple chains will be run in parallel (batch).
    W : Theano matrix
        Weight matrix of the RBM.
    bv : Theano vector
        Visible bias vector of the RBM.
    bh : Theano vector
        Hidden bias vector of the RBM.
    k : scalar or Theano scalar
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # batch_size x T
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=np.zeros((1, options['n_emb']),
                                              dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']],
                                 axis=0)
    input_emb = w_emb_extend[input_idx]

    # a trick here, set the maxpool_h/w to be large
    # maxpool_shape = (options['maxpool_h'], options['maxpool_w'])

    # turn those appending words into zeros
    # batch_size x T x n_emb
    input_emb = input_emb * input_mask[:, :, None]
    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    if options['use_unigram_conv']:
        unigram_conv_feat = fflayer(shared_params, input_emb, options,
                                    prefix='conv_unigram',
                                    act_func=options.get('sent_conv_act',
                                                         'tanh'))
        unigram_pool_feat = unigram_conv_feat.max(axis=1)
    if options['use_bigram_conv']:
        idx = T.concatenate(
            [T.arange(input_emb.shape[1])[:-1],
             T.arange(input_emb.shape[1])[1:]]).reshape(
                 (2, input_emb.shape[1] - 1)).transpose().flatten()
        bigram_emb = T.reshape(input_emb[:, idx, :],
                               (input_emb.shape[0],
                                input_emb.shape[1] - 1,
                                2 * input_emb.shape[2]))
        bigram_conv_feat = fflayer(shared_params, bigram_emb, options,
                                   prefix='conv_bigram',
                                   act_func=options.get('sent_conv_act',
                                                        'tanh'))
        bigram_pool_feat = bigram_conv_feat.max(axis=1)
    if options['use_trigram_conv']:
        idx = T.concatenate(
            [T.arange(input_emb.shape[1])[:-2],
             T.arange(input_emb.shape[1])[1:-1],
             T.arange(input_emb.shape[1])[2:]]).reshape(
                 (3, input_emb.shape[1] - 2)).transpose().flatten()
        trigram_emb = T.reshape(input_emb[:, idx, :],
                                (input_emb.shape[0],
                                 input_emb.shape[1] - 2,
                                 3 * input_emb.shape[2]))
        trigram_conv_feat = fflayer(shared_params, trigram_emb, options,
                                    prefix='conv_trigram',
                                    act_func=options.get('sent_conv_act',
                                                         'tanh'))
        trigram_pool_feat = trigram_conv_feat.max(axis=1)

    # pool_feat is used below, so this concatenation must be active
    # (it was commented out in the original listing)
    pool_feat = T.concatenate([unigram_pool_feat, bigram_pool_feat,
                               trigram_pool_feat], axis=1)

    image_feat_down = fflayer(shared_params, image_feat, options,
                              prefix='image_mlp',
                              act_func=options.get('image_mlp_act', 'tanh'))
    if options.get('use_before_attention_drop', False):
        image_feat_down = dropout_layer(image_feat_down, dropout, trng,
                                        drop_ratio)
        pool_feat = dropout_layer(pool_feat, dropout, trng, drop_ratio)

    # attention model begins here
    # first layer attention model
    image_feat_attention_1 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_1',
                                     act_func=options.get(
                                         'image_att_mlp_act', 'tanh'))
    pool_feat_attention_1 = fflayer(shared_params, pool_feat, options,
                                    prefix='sent_att_mlp_1',
                                    act_func=options.get(
                                        'sent_att_mlp_act', 'tanh'))
    combined_feat_attention_1 = image_feat_attention_1 + \
        pool_feat_attention_1[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_1 = dropout_layer(combined_feat_attention_1,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_1 = fflayer(shared_params,
                                        combined_feat_attention_1, options,
                                        prefix='combined_att_mlp_1',
                                        act_func=options.get(
                                            'combined_att_mlp_act', 'tanh'))
    prob_attention_1 = T.nnet.softmax(combined_feat_attention_1[:, :, 0])

    image_feat_ave_1 = (prob_attention_1[:, :, None] *
                        image_feat_down).sum(axis=1)

    combined_hidden_1 = image_feat_ave_1 + pool_feat

    # second layer attention model
    image_feat_attention_2 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_2',
                                     act_func=options.get(
                                         'image_att_mlp_act', 'tanh'))
    pool_feat_attention_2 = fflayer(shared_params, combined_hidden_1,
                                    options,
                                    prefix='sent_att_mlp_2',
                                    act_func=options.get(
                                        'sent_att_mlp_act', 'tanh'))
    combined_feat_attention_2 = image_feat_attention_2 + \
        pool_feat_attention_2[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_2 = dropout_layer(combined_feat_attention_2,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_2 = fflayer(shared_params,
                                        combined_feat_attention_2, options,
                                        prefix='combined_att_mlp_2',
                                        act_func=options.get(
                                            'combined_att_mlp_act', 'tanh'))
    prob_attention_2 = T.nnet.softmax(combined_feat_attention_2[:, :, 0])

    image_feat_ave_2 = (prob_attention_2[:, :, None] *
                        image_feat_down).sum(axis=1)

    if options.get('use_final_image_feat_only', False):
        combined_hidden = image_feat_ave_2 + pool_feat
    else:
        combined_hidden = image_feat_ave_2 + combined_hidden_1

    for i in range(options['combined_num_mlp']):
        if options.get('combined_mlp_drop_%d' % (i), False):
            combined_hidden = dropout_layer(combined_hidden, dropout, trng,
                                            drop_ratio)
        if i == options['combined_num_mlp'] - 1:
            combined_hidden = fflayer(shared_params, combined_hidden,
                                      options,
                                      prefix='combined_mlp_%d' % (i),
                                      act_func='linear')
        else:
            combined_hidden = fflayer(shared_params, combined_hidden,
                                      options,
                                      prefix='combined_mlp_%d' % (i),
                                      act_func=options.get(
                                          'combined_mlp_act_%d' % (i),
                                          'tanh'))

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    # return image_feat, input_idx, input_mask, \
    #     label, dropout, cost, accu
    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu, pred_label, \
        prob_attention_1, prob_attention_2
    def __init__(self, mu, logsigma, rng=None, **kwargs):
        self.rng = rng if rng else RandomStreams(
            lasagne.random.get_rng().randint(1, 2147462579))
        super(GaussianSampleLayer, self).__init__([mu, logsigma], **kwargs)
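A minimal sketch (names follow the common Lasagne VAE example, not the snippet above) of the forward pass in which `self.rng` is consumed via the reparameterization trick:

    def get_output_shape_for(self, input_shapes):
        return input_shapes[0]

    def get_output_for(self, inputs, deterministic=False, **kwargs):
        mu, logsigma = inputs
        if deterministic:
            return mu  # use the mean at evaluation time
        # z = mu + sigma * eps, with eps ~ N(0, 1)
        return mu + T.exp(logsigma) * self.rng.normal(mu.shape)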
# -*- coding: utf-8 -*-
import theano
from theano import tensor as T
from theano import function
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
import pandas as pd
from theano import pp
from collections import OrderedDict

srng = RandomStreams()


class GRU4Rec:
    '''
    GRU4Rec(loss, final_act, hidden_act, layers,
            n_epochs=10, batch_size=50, dropout_p_hidden=0.5,
            dropout_p_embed=0.0, learning_rate=0.05, momentum=0.0, lmbd=0.0,
            embedding=0, n_sample=0, sample_alpha=0.75, smoothing=0,
            adapt='adagrad', decay=0.9, grad_cap=0, sigma=0,
            init_as_normal=False, reset_after_session=True,
            train_random_order=False, time_sort=True,
            session_key='SessionId', item_key='ItemId', time_key='Time')

    Initializes the network.

    Parameters
    -----------
    loss : 'top1', 'bpr', 'cross-entropy', 'xe_logit', top1-max, bpr-max-<X>
        selects the loss function, <X> is the parameter of the loss
    final_act : 'softmax', 'linear', 'relu', 'tanh', 'softmax_logit', 'leaky-<X>', elu-<X>
        selects the activation function of the final layer, <X> is the parameter of the activation function
    hidden_act : 'tanh', 'relu' or 'linear'
        selects the activation function on the hidden states
    layers : 1D array
    def __init__(self, p):
        super(GaussainDropout, self).__init__()
        self.p = p
        self.srng = RandomStreams(seed=np.random.randint(10e6))
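For reference, Gaussian dropout multiplies activations by noise drawn from N(1, p/(1-p)) at train time, matching the variance of standard dropout with drop probability p. A minimal sketch of the forward pass (the method name and `train` flag are assumptions):

    def get_output(self, X, train=False):
        if train and self.p > 0:
            # multiplicative noise centred at 1
            std = np.sqrt(self.p / (1.0 - self.p))
            X = X * self.srng.normal(size=X.shape, avg=1.0, std=std,
                                     dtype=theano.config.floatX)
        return X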
def search_model_adam(state, channel, reload_model=False):

    pp.pprint(state)

    def get_inps(vgen=None, debug=False, output_map=None):
        X, y = TT.matrix("X", dtype="uint32"), TT.vector("y", dtype="uint8")
        mask = TT.matrix("mask", dtype="float32")
        if debug:
            theano.config.compute_test_value = "warn"
            batch = next(vgen)
            X.tag.test_value = batch[0].reshape((batch[0].shape[0], -1))
            y.tag.test_value = batch[2].flatten()
            mask.tag.test_value = batch[1].reshape((batch[1].shape[0], -1))
        return [X, y, mask]

    lr = state['lr']
    batch_size = state['batch_size']

    # No of els in the cols of the content for the memory
    mem_size = state['mem_size']

    # No of rows in M
    mem_nel = state['mem_nel']
    std = state['std']
    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', False)

    inp_size = 41300

    # No of hids for controller
    n_hids = state['n_hids']

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state['use_ff_controller']

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)
    permute_order = state.get('permute_order', False)
    use_layer_norm = state.get('use_layer_norm', False)
    recurrent_dropout_prob = state.get("recurrent_dropout_prob", -1)

    seed = 7
    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # size of the address in the memory:
    address_size = state["address_size"]
    w2v_embed_scale = 0.05
    n_out = 3

    learn_embeds = state.get('learn_embeds', False)
    glove_emb_path = state.get('glove_emb_path', None)

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # NOTE: as originally written, this lambda shadowed the NRect activation
    # it wraps and would recurse; binding the original first avoids that.
    _NRect = NRect
    NRect = lambda x, use_noise=False: _NRect(
        x, rng=trng, use_noise=use_noise, std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)

    mode = state.get('theano_function_mode', None)

    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']
    use_reinforce = state.get('use_reinforce', False)

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)

    data_path = state.get('data_path', None)
    idxs = None

    use_batch_norm = state.get("use_batch_norm", False)
    anticorr = state.get('anticorr', None)

    prfx = (
        "ntm_on_fb_copy_task_all_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f_use_bn_%(use_batch_norm)d_hard2"
    ) % locals()

    random_flip_order = False
    train_datagen = SNLI(batch_size=batch_size,
                         random_flip_order=random_flip_order,
                         datapath=data_path,
                         mode="train")

    valid_datagen = SNLI(batch_size=batch_size,
                         random_flip_order=random_flip_order,
                         datapath=data_path,
                         mode="valid")

    test_datagen = SNLI(batch_size=batch_size,
                        random_flip_order=random_flip_order,
                        datapath=data_path,
                        mode="test")

    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=valid_datagen, debug=debug, output_map=True)

    max_len = inps[0].shape[0]

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)
    bi = BiasInitializer(sparsity=-1,
                         scale=1e-3,
                         rng=rng,
                         init_method=BiasInitMethods.Random,
                         center=0.0)

    ntm = NTMModel(n_in=inp_size,
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=n_out,
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   use_batch_norm=use_batch_norm,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=True,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   use_layer_norm=use_layer_norm,
                   recurrent_dropout_prob=recurrent_dropout_prob,
                   use_inp_content=False,
                   use_mask=True,
                   anticorr=anticorr,
                   glove_embed_path=glove_emb_path,
                   learn_embeds=learn_embeds,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=False,
                   use_noise=use_noise,
                   rnd_indxs=idxs,
                   permute_order=permute_order,
                   max_fact_len=max_len,
                   softmax=True,
                   batch_size=None)

    save_freq = state.get("save_freq", 1000)

    main_loop = SNLIMainLoop(ntm,
                             print_every=50,
                             checkpoint_every=save_freq,
                             validate_every=500,
                             train_data_gen=train_datagen,
                             valid_data_gen=valid_datagen,
                             test_data_gen=test_datagen,
                             learning_rate=lr,
                             reload_model=reload_model,
                             num_epochs=250,
                             state=state,
                             prefix=prfx)

    main_loop.run()
hidden_obs = model.inference_procedure.infer(sharedX(init_examples))

from theano import function
outputs = [hidden_obs['H_hat']]
for G_hat in hidden_obs['G_hat']:
    outputs.append(G_hat)
init_chain_hid = function([], outputs)()
model.dbm.V_chains = sharedX(init_chain_hid[0])
model.dbm.H_chains = [
    sharedX(init_chain_elem) for init_chain_elem in init_chain_hid[1:]
]

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
theano_rng = RandomStreams(42)

assert hasattr(model.dbm, 'V_chains') and model.dbm.V_chains is not None

design_examples_var = model.s3c.random_design_matrix(
    batch_size=rows * cols, theano_rng=theano_rng,
    H_sample=model.dbm.V_chains)
print 'compiling sampling function'
f = function([], design_examples_var,
             updates=model.dbm.get_sampling_updates())

print 'init_examples later', (init_examples.min(), init_examples.mean(),
                              init_examples.max())
examples = dataset.get_topological_view(init_examples)
print 'examples ', (examples.min(), examples.mean(), examples.max())
assert len(examples.shape) == 4
is_color = examples.shape[3] == 3

pv = patch_viewer.PatchViewer((rows, cols), examples.shape[1:3],
                              is_color=is_color)
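# Hedged sketch of the sampling loop that would typically follow: each call to
# f() runs one round of Gibbs updates and returns a design matrix, which is
# reshaped into patches for display. PatchViewer.add_patch/show are the
# pylearn2 calls assumed here; the prompt-driven loop is illustrative.
while True:
    design_matrix = f()
    topo = dataset.get_topological_view(design_matrix)
    for i in xrange(topo.shape[0]):
        pv.add_patch(topo[i, :, :, :], rescale=True)
    pv.show()
    if raw_input('sample more? (y/n) ') == 'n':
        break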
def test_dA(learning_rate=0.1, training_epochs=15,
            dataset='mnist.pkl.gz',
            batch_size=20, output_folder='dA_plots'):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the Denoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    """
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # start-snippet-2
    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x = T.matrix('x')      # the data is presented as rasterized images
    # end-snippet-2

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = timeit.default_timer()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in range(training_epochs):
        # go through training set
        c = []
        for batch_index in range(n_train_batches):
            c.append(train_da(batch_index))

        print('Training epoch %d, cost ' % epoch,
              numpy.mean(c, dtype='float64'))

    end_time = timeit.default_timer()

    training_time = (end_time - start_time)

    print(('The no corruption code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % (training_time / 60.)), file=sys.stderr)
    image = Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    # start-snippet-3
    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng, theano_rng=theano_rng, input=x,
            n_visible=28 * 28, n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index], cost, updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = timeit.default_timer()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in range(training_epochs):
        # go through training set
        c = []
        for batch_index in range(n_train_batches):
            c.append(train_da(batch_index))

        print('Training epoch %d, cost ' % epoch,
              numpy.mean(c, dtype='float64'))

    end_time = timeit.default_timer()

    training_time = (end_time - start_time)

    print(('The 30% corruption code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % (training_time / 60.)), file=sys.stderr)
    # end-snippet-3

    # start-snippet-4
    image = Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')
    # end-snippet-4

    os.chdir('../')
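# Usage sketch: the tutorial-style entry point is just the call below, run
# from the directory containing mnist.pkl.gz. The values shown are the
# defaults; shrink training_epochs for a quick smoke test.
if __name__ == '__main__':
    test_dA(learning_rate=0.1, training_epochs=15,
            dataset='mnist.pkl.gz', batch_size=20,
            output_folder='dA_plots')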
def build_model(tparams, options):
    # MIKE: why is this not a shared variable as in
    # trng = theano.tensor.shared_randomstreams.RandomStreams(1234)
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    xt = tensor.matrix('xt', dtype=config.floatX)
    y = tensor.matrix('y', dtype='int64')
    # the original tagged this variable 'xt', shadowing the input times;
    # it should be tagged 'yt'
    yt = tensor.matrix('yt', dtype=config.floatX)

    n_timesteps = x.shape[0]
    n_examples = x.shape[1]

    if (options['arch_remap_input']):
        emb = tparams['Wemb'][x.flatten()].reshape(
            [n_timesteps, n_examples, options['n_hid']])
    else:
        Wemb = theano.shared(numpy.concatenate(
            (numpy.zeros((1, options['n_hid']), dtype=config.floatX),
             numpy.identity(options['n_hid'], dtype=config.floatX)),
            axis=0), name='Wemb')
        emb = Wemb[x.flatten()].reshape(
            [n_timesteps, n_examples, options['n_hid']])

    # this is the call to either lstm_layer or hpm_layer
    proj = get_layer(options['encoder'])[1](tparams, emb, xt, yt, options,
                                            prefix=options['encoder'],
                                            mask=mask)
    # proj has dim n_timesteps X n_examples X n_hid

    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    def _step(proj_step):
        if (options['arch_output_fn'] == 'softmax'):
            pred_prob_step = tensor.nnet.softmax(
                tensor.dot(proj_step, tparams['U']) + tparams['b'])
        elif (options['arch_output_fn'] == 'logistic'):
            pred_prob_step = tensor.nnet.sigmoid(
                tensor.dot(proj_step, tparams['U']) + tparams['b'])
        else:  # '1-1'
            pred_prob_step = proj_step / tensor.sum(
                proj_step, axis=1, keepdims=True)
        pred_prob_step = tensor.concatenate(
            [tensor.alloc(0, n_examples, 1), pred_prob_step], axis=1)
        return pred_prob_step

    # pred_prob_step should have dim n_examples X n_outputs
    # pred_prob has dim n_timesteps x n_examples x n_outputs
    # pred_ix_step has dim n_examples
    pred_prob, updates = theano.scan(_step,
                                     sequences=proj,
                                     outputs_info=None,
                                     non_sequences=None,
                                     n_steps=n_timesteps)

    # tgt_prob_step should have dim n_examples
    def _cost_step_norm(pred_prob_step, y_step):
        tgt_prob_step = tensor.switch(
            tensor.eq(y_step, 0), 1.0,
            pred_prob_step[tensor.arange(n_examples), y_step] /
            (1.0 - pred_prob_step[tensor.arange(n_examples), 0]))
        # need to add 1 to pass by index 0 which we removed in computing max
        pred_ix_step = tensor.argmax(pred_prob_step[:, 1:], axis=1) + 1
        if (options['type_token_sim']):  # DEBUG
            corr_step = tensor.switch(
                tensor.eq(y_step, 0), 0,
                tensor.switch(
                    tensor.eq((y_step - 1) // 5, (pred_ix_step - 1) // 5),
                    1, -1))
        else:
            corr_step = tensor.switch(
                tensor.eq(y_step, 0), 0,
                tensor.switch(tensor.eq(y_step, pred_ix_step), 1, -1))
        return tgt_prob_step, corr_step

    # cost function for predicting target value of a specific event
    # tgt_prob_step should have dim n_examples
    def _cost_step_tgt(pred_prob_step, y_step):
        tgt_prob_step = tensor.switch(
            tensor.eq(y_step, 0), 1.0,
            tensor.switch(
                tensor.gt(y_step, 0),
                pred_prob_step[tensor.arange(n_examples), y_step],
                1.0 - pred_prob_step[tensor.arange(n_examples), -y_step]))
        corr_step = tensor.switch(
            tensor.eq(y_step, 0), 0,
            tensor.switch(tensor.gt(tgt_prob_step, 0.5), 1, -1))
        return tgt_prob_step, corr_step

    if (options['signed_out']):
        cost_fn = _cost_step_tgt
    else:
        cost_fn = _cost_step_norm

    (tgt_prob, corr), updates = theano.scan(cost_fn,
                                            sequences=[pred_prob, y],
                                            outputs_info=None,
                                            non_sequences=None,
                                            n_steps=n_timesteps)

    off = 1e-8
    if tgt_prob.dtype == 'float16':
        off = 1e-6

    # tgt_prob: probability correct (dimensions n_timesteps X n_examples)
    # Note: not dividing by count ( / tensor.sum(tensor.gt(y, 0)) ) because
    # it would reweight each minibatch by its size
    cost = -tensor.sum(tensor.log(tgt_prob.clip(off, 1.0)))

    return use_noise, x, xt, y, yt, mask, pred_prob, corr, cost
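# Hedged sketch of how the symbols returned by build_model might be wired into
# training functions. `tparams` is the usual OrderedDict of shared parameters;
# the plain SGD step is illustrative, not the optimizer from the original runs.
(use_noise, x, xt, y, yt, mask,
 pred_prob, corr, cost) = build_model(tparams, options)

grads = tensor.grad(cost, wrt=list(tparams.values()))
lr = 0.01  # illustrative learning rate
updates = [(p, p - lr * g) for p, g in zip(tparams.values(), grads)]

f_cost = theano.function([x, xt, y, yt, mask], cost)
f_update = theano.function([x, xt, y, yt, mask], cost, updates=updates)

use_noise.set_value(1.)  # enable dropout while training
use_noise.set_value(0.)  # disable it again for evaluation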
def __init__(self, numpy_rng, theano_rng=None, input=None,
             n_visible=784, n_hidden=500,
             W=None, bhid=None, bvis=None):
    """
    Initialize the dA class by specifying the number of visible units (the
    dimension d of the input), the number of hidden units (the dimension d'
    of the latent or hidden space) and the corruption level. The constructor
    also receives symbolic variables for the input, weights and bias. Such
    symbolic variables are useful when, for example, the input is the result
    of some computations, or when weights are shared between the dA and an
    MLP layer. When dealing with SdAs this always happens: the dA on layer 2
    gets as input the output of the dA on layer 1, and the weights of the dA
    are used in the second stage of training to construct an MLP.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to generate weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `numpy_rng`

    :type input: theano.tensor.TensorType
    :param input: a symbolic description of the input or None for
                  standalone dA

    :type n_visible: int
    :param n_visible: number of visible units

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type W: theano.tensor.TensorType
    :param W: Theano variable pointing to a set of weights that should be
              shared between the dA and another architecture; if dA should
              be standalone set this to None

    :type bhid: theano.tensor.TensorType
    :param bhid: Theano variable pointing to a set of bias values (for
                 hidden units) that should be shared between the dA and
                 another architecture; if dA should be standalone set this
                 to None

    :type bvis: theano.tensor.TensorType
    :param bvis: Theano variable pointing to a set of bias values (for
                 visible units) that should be shared between the dA and
                 another architecture; if dA should be standalone set this
                 to None

    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden

    # create a Theano random generator that gives symbolic random values
    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2**30))

    # note : W' was written as `W_prime` and b' as `b_prime`
    if not W:
        # W is initialized with `initial_W`, which is uniformly sampled
        # from -4*sqrt(6./(n_visible+n_hidden)) and
        # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
        # converted using asarray to dtype theano.config.floatX so that
        # the code is runnable on GPU
        initial_W = numpy.asarray(numpy_rng.uniform(
            low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
            high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
            size=(n_visible, n_hidden)), dtype=theano.config.floatX)
        W = theano.shared(value=initial_W, name='W', borrow=True)

    if not bvis:
        bvis = theano.shared(value=numpy.zeros(n_visible,
                                               dtype=theano.config.floatX),
                             borrow=True)

    if not bhid:
        bhid = theano.shared(value=numpy.zeros(n_hidden,
                                               dtype=theano.config.floatX),
                             name='b', borrow=True)

    self.W = W
    # b corresponds to the bias of the hidden
    self.b = bhid
    # b_prime corresponds to the bias of the visible
    self.b_prime = bvis
    # tied weights, therefore W_prime is W transpose
    self.W_prime = self.W.T
    self.theano_rng = theano_rng

    # if no input is given, generate a variable representing the input
    if input is None:
        # we use a matrix because we expect a minibatch of several
        # examples, each example being a row
        self.x = T.dmatrix(name='input')
    else:
        self.x = input

    self.params = [self.W, self.b, self.b_prime]
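# Minimal sketch of standalone construction, using only the signature above:
# with input=None the class allocates its own T.dmatrix, and with W/bhid/bvis
# left as None the parameters are initialized internally as the docstring
# describes. The seed is arbitrary.
numpy_rng = numpy.random.RandomState(89677)
da = dA(numpy_rng=numpy_rng, n_visible=28 * 28, n_hidden=500)
assert da.x is not None and len(da.params) == 3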
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    # for the backward rnn, we just need to invert x and x_mask
    xr = x[::-1]
    xr_mask = x_mask[::-1]

    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    # word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix='encoder',
                                            mask=x_mask)

    # word embedding for backward rnn (source)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    projr = get_layer(options['encoder'])[1](tparams, embr, options,
                                             prefix='encoder_r',
                                             mask=xr_mask)

    # context will be the concatenation of forward and backward rnns
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    # mean of the context (across time) will be used to initialize decoder rnn
    ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # or you can use the last state of forward + backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)

    # initial decoder state
    init_state = get_layer('ff')[1](tparams, ctx_mean, options,
                                    prefix='ff_state', activ='tanh')

    # word embedding (target), we will shift the target sequence one time step
    # to the right. This is done because of the bi-gram connections in the
    # readout and decoder rnn. The first target will be all zeros and we will
    # not condition on the last output.
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # decoder - pass through the decoder conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams, emb, options,
                                            prefix='decoder',
                                            mask=y_mask, context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state)
    # hidden states of the decoder gru
    proj_h = proj[0]

    # weighted averages of context, generated by attention module
    ctxs = proj[1]

    # weights (alignment matrix)
    opt_ret['dec_alphas'] = proj[2]

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
                                    prefix='ff_logit_lstm', activ='linear')
    logit_prev = get_layer('ff')[1](tparams, emb, options,
                                    prefix='ff_logit_prev', activ='linear')
    logit_ctx = get_layer('ff')[1](tparams, ctxs, options,
                                   prefix='ff_logit_ctx', activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
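# Hedged sketch of compiling the graph above into a per-sentence cost and its
# gradients, in the style of dl4mt-style training scripts. Only the symbols
# returned by build_model are used; the mean-over-batch reduction is one
# common choice, not necessarily the original one.
trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost = build_model(tparams,
                                                                   options)

inps = [x, x_mask, y, y_mask]
f_cost = theano.function(inps, cost)            # per-sentence negative log-lik
grads = tensor.grad(cost.mean(), wrt=list(tparams.values()))
f_grads = theano.function(inps, grads)          # gradients for an optimizer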
import theano
import theano.tensor as T
# from theano.tensor.shared_randomstreams import RandomStreams
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams  # much faster
import numpy as np

numpy_rng = np.random.RandomState(123)
theano_rng = RandomStreams(numpy_rng.randint(2**30))


## samplers

def bernoulli(a):
    # a is the bernoulli parameter
    return theano_rng.binomial(size=a.shape, n=1, p=a,
                               dtype=theano.config.floatX)


def gaussian(a, var=1.0):
    # a is the mean, var is the variance (not std or precision!)
    std = T.sqrt(var)
    return theano_rng.normal(size=a.shape, avg=a, std=std,
                             dtype=theano.config.floatX)


def multinomial(a):
    # a is a matrix of multinomial parameters, one distribution per row.
    # The original body is truncated here; a single draw per row via
    # theano_rng.multinomial is the natural completion.
    return theano_rng.multinomial(pvals=a, dtype=theano.config.floatX)
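# Usage sketch: the samplers above return symbolic expressions, so they must
# be compiled before they produce numbers. Shapes and probabilities here are
# illustrative.
p = T.matrix('p')
sample_bernoulli = theano.function([p], bernoulli(p))
sample_gaussian = theano.function([p], gaussian(p, var=0.5))

probs = np.full((2, 3), 0.5, dtype=theano.config.floatX)
print(sample_bernoulli(probs))  # 0/1 draws, each with probability 0.5
print(sample_gaussian(probs))   # noisy versions of the 0.5 entries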