Example No. 1
def rescore_model(source_file, target_file, output_file, scorer_settings,
                  options):

    trng = RandomStreams(1234)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, model in enumerate(scorer_settings.models):
            f_log_probs = load_scorer(model,
                                      options[i],
                                      alignweights=alignweights)
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=scorer_settings.normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        source_file.name,
        target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][-1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=scorer_settings.b,
        maxlen=float('inf'),
        use_factor=(options[0]['factors'] > 1),
        sort_by_length=False
    )  #TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, scorer_settings.alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if scorer_settings.verbose:
            output_file.write('{0} '.format(line.strip()))
        output_file.write('{0}\n'.format(score_str))

    # optionally save attention weights
    if scorer_settings.alignweights:
        temp_name = output_file.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                if type(line) == list:
                    for l in line:
                        align_OUT.write(l + "\n")
                else:
                    align_OUT.write(line + "\n")
            # combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            output_file.name, align_OUT)
Example No. 2
        attinp_h2, attgate_h2 = att_to_h2.proj(w_t)
        attinp_h3, attgate_h3 = att_to_h3.proj(w_t)

        h2_t = cell2.step(xinp_h2_t + h1inp_h2 + attinp_h2,
                          xgate_h2_t + h1gate_h2 + attgate_h2, h2_tm1)

        h2inp_h3, h2gate_h3 = h2_to_h3.proj(h2_t)

        h3_t = cell3.step(xinp_h3_t + h1inp_h3 + h2inp_h3 + attinp_h3,
                          xgate_h3_t + h1gate_h3 + h2gate_h3 + attgate_h3,
                          h3_tm1)
        return h1_t, h2_t, h3_t, k_t, w_t

    init_x = as_shared(np_zeros((minibatch_size, n_out)))
    srng = RandomStreams(1999)

    # Used to calculate stopping heuristic from sections 5.3
    u_max = 0. * tensor.arange(c_sym.shape[0]) + c_sym.shape[0]
    u_max = u_max.dimshuffle('x', 'x', 0)
    u_max = tensor.cast(u_max, theano.config.floatX)

    def sample_step(x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, ctx):
        xinp_h1_t, xgate_h1_t = inp_to_h1.proj(x_tm1)
        xinp_h2_t, xgate_h2_t = inp_to_h2.proj(x_tm1)
        xinp_h3_t, xgate_h3_t = inp_to_h3.proj(x_tm1)

        attinp_h1, attgate_h1 = att_to_h1.proj(w_tm1)

        h1_t = cell1.step(xinp_h1_t + attinp_h1, xgate_h1_t + attgate_h1,
                          h1_tm1)
Example No. 3
def translate_model(queue, rqueue, pid, models, options, k, normalize, verbose,
                    nbest, return_alignment, suppress_unk):

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):

        # allocate model parameters, then load the saved values and set
        # the theano shared variables
        params = init_params(option)
        params = load_params(model, params)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams,
                                       option,
                                       use_noise,
                                       trng,
                                       return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment = gen_sample(
            fs_init,
            fs_next,
            numpy.array(seq).T.reshape([len(seq), 1]),
            trng=trng,
            k=k,
            maxlen=200,
            stochastic=False,
            argmax=False,
            return_alignment=return_alignment,
            suppress_unk=suppress_unk)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        if nbest:
            return sample, score, word_probs, alignment
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
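
For context, a minimal driver for this worker might look like the sketch below; the model list, options, and `sequences` are assumed for illustration, and the argument values (k=5, normalize=True, the remaining flags False) are arbitrary. The worker loop above exits when it reads the `None` sentinel from its input queue.

# Assumed driver sketch (not part of the original snippet):
from multiprocessing import Process, Queue

input_queue, result_queue = Queue(), Queue()
worker = Process(target=translate_model,
                 args=(input_queue, result_queue, 0, models, options,
                       5, True, False, False, False, False))
worker.start()
for idx, seq in enumerate(sequences):       # sequences: tokenized inputs
    input_queue.put((idx, seq))
input_queue.put(None)                       # sentinel: ends the worker loop
translations = dict(result_queue.get() for _ in sequences)
worker.join()
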
Example No. 4
    def __init__(self, sigma):
        super(GaussainNoise, self).__init__()
        self.sigma = sigma
        self.srng = RandomStreams(seed=np.random.randint(10e6))
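
The snippet only shows the constructor. A plausible forward pass for this layer, sketched under the assumption that it follows the usual Theano pattern of drawing noise from the stored stream and adding it to the input (the `__call__` name and signature are illustrative, not from the original source):

    # Hypothetical companion method, not part of the original snippet;
    # assumes `import theano` at module level.
    def __call__(self, x):
        # corrupt the input with additive Gaussian noise drawn from self.srng
        noise = self.srng.normal(size=x.shape, avg=0.0, std=self.sigma,
                                 dtype=theano.config.floatX)
        return x + noise
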
Example No. 5
def translate_model(queue, rqueue, pid, models, options, k,
                    normalization_alpha, verbose, nbest, return_alignment,
                    suppress_unk, return_hyp_graph, deviceid):

    # if the --device-list argument is set
    if deviceid != '':
        import os
        theano_flags = os.environ['THEANO_FLAGS'].split(',')
        exist = False
        for i in range(len(theano_flags)):
            if theano_flags[i].strip().startswith('device'):
                exist = True
                theano_flags[i] = '%s=%s' % ('device', deviceid)
                break
        if not exist:
            theano_flags.append('%s=%s' % ('device', deviceid))
        os.environ['THEANO_FLAGS'] = ','.join(theano_flags)

    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler, gen_sample, init_params)

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared
    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys([key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng, return_alignment=return_alignment)

        fs_init.append(f_init)
        fs_next.append(f_next)

    def _translate(seq):
        # sample given an input sequence and obtain scores
        sample, score, word_probs, alignment, hyp_graph = gen_sample(fs_init, fs_next,
                                   numpy.array(seq).T.reshape([len(seq[0]), len(seq), 1]),
                                   trng=trng, k=k, maxlen=200,
                                   stochastic=False, argmax=False, return_alignment=return_alignment,
                                   suppress_unk=suppress_unk, return_hyp_graph=return_hyp_graph)

        # normalize scores according to sequence lengths
        if normalization_alpha:
            adjusted_lengths = numpy.array([len(s) ** normalization_alpha for s in sample])
            score = score / adjusted_lengths
        if nbest:
            return sample, score, word_probs, alignment, hyp_graph
        else:
            sidx = numpy.argmin(score)
            return sample[sidx], score[sidx], word_probs[sidx], alignment[sidx], hyp_graph

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        if verbose:
            sys.stderr.write('{0} - {1}\n'.format(pid, idx))
        seq = _translate(x)

        rqueue.put((idx, seq))

    return
Example No. 6
    def __init__(self,
                 vocabulary,
                 architecture,
                 mode=Mode.minibatch,
                 profile=False):
        """Initializes the neural network parameters for all layers, and
        creates Theano shared variables from them.

        :type vocabulary: Vocabulary
        :param vocabulary: mapping between word IDs and word classes

        :type architecture: Architecture
        :param architecture: an object that describes the network architecture

        :type mode: Network.Mode
        :param mode: selects the network variation; if not set to mini-batch
            mode, creates a network that produces the probability distribution
            for the next word (instead of target probabilities for a
            mini-batch)

        :type profile: bool
        :param profile: if set to True, creates a Theano profile object
        """

        self.vocabulary = vocabulary
        self.architecture = architecture
        self.mode = mode

        M1 = 2147483647
        M2 = 2147462579
        random_seed = [
            numpy.random.randint(0, M1),
            numpy.random.randint(0, M1),
            numpy.random.randint(1, M1),
            numpy.random.randint(0, M2),
            numpy.random.randint(0, M2),
            numpy.random.randint(1, M2)
        ]
        self.random = RandomStreams(random_seed)

        # Word and class inputs will be available to NetworkInput layers.
        self.word_input = tensor.matrix('network/word_input', dtype='int64')
        self.class_input = tensor.matrix('network/class_input', dtype='int64')
        if self.mode.is_minibatch():
            self.word_input.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_words())
            self.class_input.tag.test_value = test_value(
                size=(100, 16), max_value=vocabulary.num_classes())
        else:
            self.word_input.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_words())
            self.class_input.tag.test_value = test_value(
                size=(1, 16), max_value=vocabulary.num_classes())

        # Recurrent layers will create these lists, used to initialize state
        # variables of appropriate sizes, for doing forward passes one step at a
        # time.
        self.recurrent_state_input = []
        self.recurrent_state_size = []

        # Create the layers.
        logging.debug("Creating layers.")
        self.layers = OrderedDict()
        for input_options in architecture.inputs:
            input = NetworkInput(input_options, self)
            self.layers[input.name] = input
        for layer_description in architecture.layers:
            layer_options = self._layer_options_from_description(
                layer_description)
            if layer_options['name'] == architecture.output_layer:
                layer_options['size'] = vocabulary.num_classes()
            layer = create_layer(layer_options, self, profile=profile)
            self.layers[layer.name] = layer
        self.output_layer = self.layers[architecture.output_layer]

        # This list will be filled by the recurrent layers to contain the
        # recurrent state outputs, for doing forward passes one step at a time.
        self.recurrent_state_output = [None] * len(self.recurrent_state_size)

        # When the mode is target_words, this input variable specifies the words
        # whose probabilities will be computed.
        self.target_class_ids = tensor.matrix('network/target_class_ids',
                                              dtype='int64')
        self.target_class_ids.tag.test_value = test_value(
            size=(1, 16), max_value=vocabulary.num_classes())

        # Create initial parameter values.
        logging.debug("Initializing parameters.")
        self.param_init_values = OrderedDict()
        num_params = 0
        for layer in self.layers.values():
            for name, value in layer.param_init_values.items():
                logging.debug("- %s size=%d", name, value.size)
                num_params += value.size
            self.param_init_values.update(layer.param_init_values)
        logging.debug("Total number of parameters: %d", num_params)

        # Create Theano shared variables.
        self.params = {
            name: theano.shared(value, name)
            for name, value in self.param_init_values.items()
        }
        for layer in self.layers.values():
            layer.set_params(self.params)

        # The mask is used to mask out the rest of the input matrix when a
        # sequence is shorter than the maximum sequence length. The mask is
        # kept as int8 data type, which is how Theano stores booleans.
        if self.mode.is_minibatch():
            self.mask = tensor.matrix('network/mask', dtype='int8')
            self.mask.tag.test_value = test_value(size=(100, 16),
                                                  max_value=True)
        else:
            self.mask = tensor.ones(self.word_input.shape, dtype='int8')

        # Dropout layer needs to know whether we are training or evaluating.
        self.is_training = tensor.scalar('network/is_training', dtype='int8')
        self.is_training.tag.test_value = 1

        for layer in self.layers.values():
            layer.create_structure()
Example No. 7
    def __init__(
        self,
        nvisible,
        nhidden,
        hbias=None,
        vbias=None,
        W_real=None,
        W_imag=None,
        input=None,
        np_rng=None,
        theano_rng=None,
    ):
        """
        RBM constructor.
        :param nvisible: number of visible nodes
        :param nhidden: number of hidden nodes
        :param hbias: "magnetic" field in the hidden layer;
                      if the value is None, initialize it with the random
                      number generator, otherwise use the given value.
        :param vbias: "magnetic" field in the visible layer;
                      if the value is None, initialize it with the random
                      number generator, otherwise use the given value.

        :param W_real: real part of the weight matrix connecting the visible
                      and hidden layers

        :param W_imag: imaginary part of the weight matrix connecting the
                      visible and hidden layers

        :param input: the initial sample for the visible layer (or spin
                      configuration); if the value is None, initialize it
                      with the random number generator
        :param np_rng: numpy random number generator
        :param theano_rng: Theano random number generator

        """
        self.nvisible = nvisible
        self.nhidden = nhidden
        if np_rng is None:
            # create a number generator
            np_rng = np.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(np_rng.randint(2**30))

        if W_real is None:
            # W_real is initialized with `initial_Wreal`, which is uniformly
            # sampled from -2*sqrt(6./(nvisible+nhidden)) and
            # 2*sqrt(6./(nhidden+nvisible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so
            # that the code is runnable on GPU
            initial_Wreal = np.asarray(np_rng.uniform(
                low=-2 * np.sqrt(6. / (nhidden + nvisible)),
                high=2 * np.sqrt(6. / (nhidden + nvisible)),
                size=(nvisible, nhidden)),
                                       dtype=theano.config.floatX)
            # theano shared variables for weights real part
            W_real = theano.shared(value=initial_Wreal,
                                   name='Wreal',
                                   borrow=True)
        if W_imag is None:
            # W_imag is initialized with `initial_Wimag`, which is uniformly
            # sampled from -2*sqrt(6./(nvisible+nhidden)) and
            # 2*sqrt(6./(nhidden+nvisible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so
            # that the code is runnable on GPU
            initial_Wimag = np.asarray(np_rng.uniform(
                low=-2 * np.sqrt(6. / (nhidden + nvisible)),
                high=2 * np.sqrt(6. / (nhidden + nvisible)),
                size=(nvisible, nhidden)),
                                       dtype=theano.config.floatX)
            # theano shared variables for weights imaginary part
            W_imag = theano.shared(value=initial_Wimag,
                                   name='Wimag',
                                   borrow=True)

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=np.zeros(nhidden,
                                                 dtype=theano.config.floatX),
                                  name='hbias',
                                  borrow=True)

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=np.zeros(nvisible,
                                                 dtype=theano.config.floatX),
                                  name='vbias',
                                  borrow=True)

        # initialize input layer for standalone RBM or layer0 of DBN
        # self.input = input
        # if not input:
        #     self.input = T.matrix('input')
        # self.input=input

        self.W_real = W_real
        self.W_imag = W_imag
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        self.input = input
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        #self.params = [self.W_real,self.W_imag,self.hbias, self.vbias]
        self.params = [self.W_real, self.hbias, self.vbias]
Example No. 8
    def __init__(self,
                 input=None,
                 n_visible=784,
                 n_hidden=500,
                 W=None,
                 hbias=None,
                 vbias=None,
                 numpy_rng=None,
                 theano_rng=None):
        """
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of a MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """

        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None:
            # W is initialized with `initial_W`, which is uniformly
            # sampled from -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so
            # that the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)),
                                      dtype=theano.config.floatX)
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if hbias is None:
            # create shared variable for hidden units bias
            hbias = theano.shared(value=numpy.zeros(
                n_hidden, dtype=theano.config.floatX),
                                  name='hbias',
                                  borrow=True)

        if vbias is None:
            # create shared variable for visible units bias
            vbias = theano.shared(value=numpy.zeros(
                n_visible, dtype=theano.config.floatX),
                                  name='vbias',
                                  borrow=True)

        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input
        if not input:
            self.input = T.matrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list
        # other than shared variables created in this function.
        self.params = [self.W, self.hbias, self.vbias]
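
This is the RBM constructor in the style of the deeplearning.net tutorial. As a reminder of how the shared variables created here get used, a minimal sketch of sampling the hidden layer given a visible configuration, assuming the tutorial's usual `T = theano.tensor` alias:

    # Sketch in the style of the tutorial this constructor comes from;
    # not part of the snippet above.
    def sample_h_given_v(self, v0_sample):
        # mean-field activation of the hidden units
        pre_sigmoid_h1 = T.dot(v0_sample, self.W) + self.hbias
        h1_mean = T.nnet.sigmoid(pre_sigmoid_h1)
        # binomial with n=1 draws one Bernoulli sample per hidden unit
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1,
                                             p=h1_mean,
                                             dtype=theano.config.floatX)
        return [pre_sigmoid_h1, h1_mean, h1_sample]
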
Example No. 9
    def __init__(self, options, channel, data, model):
        """
        Parameters:
            options: Dictionary
            `options` is expected to contain the following keys:
                `cbs` -> int
                    Number of samples to consider at a time when computing
                    some property of the model
                `gbs` -> int
                    Number of samples over which to compute the gradients
                `mbs` -> int
                    Number of samples over which to compute the metric
                `ebs` -> int
                    Number of samples over which to evaluate the training
                    error
                `mreg` -> float
                    Regularization added to the metric
                `mrtol` -> float
                    Relative tolerance for inverting the metric
                `miters` -> int
                    Number of iterations
                `seed` -> int
                    Random number generator seed
                `profile` -> bool
                    Flag, if profiling should be on or not
                `verbose` -> int
                    Verbosity level
                `lr` -> float
                    Learning rate
            channel: jobman channel or None
            data: dictionary-like object return by numpy.load containing the
                data
            model : model
        """
        n_params = len(model.params)
        self.data = data

        if options['device'] != 'gpu':
            xdata = theano.shared(data['train_x'][:options['gbs']],
                                  name='xdata')
            ydata = TT._shared(data['train_y'][:options['gbs']], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]
        else:
            self.cpu_shared_data = []
            xdata = theano.shared(data['train_x'], name='xdata')
            ydata = TT._shared(data['train_y'], name='ydata')
            self.xdata = xdata
            self.ydata = ydata
            shared_data = [xdata, ydata]

        self.rng = numpy.random.RandomState(options['seed'])
        n_samples = data['train_x'].shape[0]
        self.grad_batches = n_samples // options['gbs']
        self.metric_batches = n_samples // options['mbs']
        self.eval_batches = n_samples // options['ebs']

        self.verbose = options['verbose']
        if options['device'] != 'gpu':
            # Store Euclidean gradients
            self.gs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store Riemannian gradients
            self.rs = [
                TT._shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
        else:
            # Store Euclidean gradients
            self.gs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]
            # Store Riemannian gradients
            self.rs = [
                theano.shared(numpy.zeros(shp, dtype=theano.config.floatX))
                for shp in model.params_shape
            ]

        self.permg = self.rng.permutation(self.grad_batches)
        self.permr = self.rng.permutation(self.metric_batches)
        self.perme = self.rng.permutation(self.eval_batches)
        self.k = 0
        self.posg = 0
        self.posr = 0
        self.pose = 0

        # Step 1. Compile function for computing Euclidean gradients

        # inputs
        gbdx = TT.iscalar('grad_batch_idx')
        print('Constructing grad function')
        srng = RandomStreams(numpy.random.randint(1e5))
        loc_inputs = [x.type() for x in model.inputs]

        def grad_step(*args):
            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[1:1 + n_params], gs)]
            return [args[0] + const(1)] + \
                    nw_gs

        ig = [
            TT.unbroadcast(TT.alloc(const(0), 1, *shp), 0)
            for shp in model.params_shape
        ]
        idx0 = TT.unbroadcast(const([0]), 0)
        n_steps = options['gbs'] // options['cbs']
        rvals, updates = scan(grad_step,
                              states=[idx0] + ig,
                              n_steps=n_steps,
                              name='grad_loop',
                              profile=options['profile'])

        nw_gs = [x[0] / const(n_steps) for x in rvals[1:1 + n_params]]

        # updates
        updates.update(dict(zip(self.gs, nw_gs)))
        # givens
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[gbdx * options['gbs']:(gbdx + 1) * options['gbs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print('Compiling grad function')
        self.compute_eucledian_gradients = theano.function(
            [gbdx], [],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_eucledian_gradients',
            mode=gpu_mode,
            on_unused_input='warn',
            profile=options['profile'])

        # Step 2. Compile function for Computing Riemannian gradients
        rbdx = TT.iscalar('riemannian_batch_idx')
        rbpos = rbdx * options['mbs']

        if options['device'] == 'gpu':
            mode = gpu_mode

            def compute_Gv(*args):
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(args, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs


                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=theano.Mode(linker='cvm'),
                                      name='Gv_step',
                                      profile=options['profile'])

                final_Gvs = [x[0] / const(n_steps) for x in rvals[1:]]
                return final_Gvs, updates
        else:
            mode = cpu_mode

            def compute_Gv(*args):
                cgv = [
                    theano.shared(numpy.zeros(shp, dtype=theano.config.floatX),
                                  name='cgv%d' % idx)
                    for idx, shp in enumerate(model.params_shape)
                ]
                print_mem('allocated mem for cgv')
                idx0 = const([0])
                ep = [
                    TT.alloc(const(0), 1, *shp) for shp in model.params_shape
                ]

                def Gv_step(*gv_args):
                    idx = TT.cast(gv_args[0], 'int32')
                    nw_inps = [x[idx * options['cbs']: \
                                 (idx + 1) * options['cbs']] for x in
                               loc_inputs]
                    replace = dict(zip(model.inputs, nw_inps))
                    nw_outs = safe_clone(model.outs, replace)
                    final_results = dict(
                        zip(model.params, [None] * len(model.params)))
                    for nw_out, out_operator in zip(nw_outs,
                                                    model.outs_operator):
                        loc_params = [
                            x for x in model.params
                            if x in theano.gof.graph.inputs([nw_out])
                        ]
                        loc_args = [
                            x for x, y in zip(cgv, model.params)
                            if y in theano.gof.graph.inputs([nw_out])
                        ]
                        if out_operator == 'softmax':
                            factor = const(options['cbs']) * nw_out
                        elif out_operator == 'sigmoid':
                            factor = const(
                                options['cbs']) * nw_out * (1 - nw_out)
                        else:
                            factor = const(options['cbs'])

                        loc_Gvs = TT.Lop(nw_out, loc_params,
                                         TT.Rop(nw_out, loc_params, loc_args) /\
                                         factor)

                        for lp, lgv in zip(loc_params, loc_Gvs):
                            if final_results[lp] is None:
                                final_results[lp] = lgv
                            else:
                                final_results[lp] += lgv

                    Gvs = [
                        ogv + final_results[param]
                        for (ogv, param) in zip(gv_args[1:], model.params)
                    ]
                    return [gv_args[0] + const(1)] + Gvs

                states = [idx0] + ep
                n_steps = options['mbs'] // options['cbs']
                rvals, updates = scan(Gv_step,
                                      states=states,
                                      n_steps=n_steps,
                                      mode=gpu_mode,
                                      name='Gv_step',
                                      profile=options['profile'])
                final_Gvs = [
                    TT.as_tensor_variable(x[0]) / const(n_steps)
                    for x in rvals[1:]
                ]
                grad_inps = zip(loc_inputs, shared_data)
                loc_fn = theano.function([],
                                         final_Gvs,
                                         updates=updates,
                                         givens=dict(grad_inps),
                                         on_unused_input='warn',
                                         mode=gpu_mode,
                                         name='loc_fn',
                                         profile=options['profile'])
                fake_op = FakeGPUShell(cgv, loc_fn, len(cgv))

                return fake_op(*args), {}

        print('Constructing Riemannian gradient function')
        norm_grads = TT.sqrt(sum(TT.sum(x**2) for x in self.gs))
        rvals = minres.minres(compute_Gv, [x / norm_grads for x in self.gs],
                              rtol=options['mrtol'],
                              shift=-options['mreg'],
                              maxit=options['miters'],
                              mode=mode,
                              profile=options['profile'])
        nw_rs = [x * norm_grads for x in rvals[0]]
        flag = rvals[1]
        niters = rvals[2]
        rel_residual = rvals[3]
        rel_Aresidual = rvals[4]
        Anorm = rvals[5]
        Acond = rvals[6]
        xnorm = rvals[7]
        Axnorm = rvals[8]
        updates = rvals[9]

        norm_ord0 = TT.max(abs(nw_rs[0]))
        for r in nw_rs[1:]:
            norm_ord0 = TT.maximum(norm_ord0, TT.max(abs(r)))

        updates.update(dict(zip(self.rs, nw_rs)))
        grad_inps = [(x, y[rbdx * options['mbs']:(rbdx + 1) * options['mbs']])
                     for x, y in zip(loc_inputs[:1], shared_data[:1])]
        print('Compiling Riemannian gradient function')
        self.compute_riemannian_gradients = theano.function(
            [rbdx], [
                flag, niters, rel_residual, rel_Aresidual, Anorm, Acond, xnorm,
                Axnorm, norm_grads, norm_ord0
            ],
            updates=updates,
            givens=dict(grad_inps),
            name='compute_riemannian_gradients',
            on_unused_input='warn',
            mode=mode,
            profile=options['profile'])

        # Step 3. Compile function for evaluating cost and updating
        # parameters
        print('Constructing evaluation function')
        lr = TT.scalar('lr')
        self.lr = numpy.float32(options['lr'])
        ebdx = TT.iscalar('eval_batch_idx')
        nw_ps = [p - lr * r for p, r in zip(model.params, self.rs)]

        def cost_step(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs + model.params, nw_inps + nw_ps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            return [_idx + const(1), acc + nw_cost]

        acc0 = const([0])
        idx0 = const([0])
        n_steps = options['ebs'] // options['cbs']
        rvals, updates = scan(cost_step,
                              states=[idx0, acc0],
                              n_steps=n_steps,
                              name='cost_loop',
                              mode=gpu_mode,
                              profile=options['profile'])

        final_cost = rvals[1] / const(n_steps)
        if options['device'] == 'gpu':
            grad_inps = [(x,
                          y[ebdx * options['ebs']:(ebdx + 1) * options['ebs']])
                         for x, y in zip(loc_inputs, shared_data)]
        else:
            grad_inps = zip(loc_inputs, shared_data)

        print('Compiling evaluation function')
        self.eval_fn = theano.function([ebdx, lr],
                                       final_cost,
                                       givens=dict(grad_inps),
                                       on_unused_input='warn',
                                       updates=updates,
                                       name='eval_fn',
                                       mode=gpu_mode,
                                       profile=options['profile'])

        update_dict = dict(zip(model.params, nw_ps))
        if options['device'] != 'gpu':
            update_dict.update(dict(zip(model.cparams, nw_ps)))
        self.update_params = theano.function([lr], [],
                                             updates=update_dict,
                                             name='update_params',
                                             on_unused_input='warn',
                                             mode=mode,
                                             profile=options['profile'])
        self.options = options
        self.old_cost = 1e6
        self.device = options['device']
        n_steps = options['ebs'] // options['cbs']

        def ls_error(_idx, acc):
            idx = TT.cast(_idx, 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']] for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = TT.cast(safe_clone(model.err, replace=replace),
                              'float32')
            return [_idx + const(1), acc + nw_cost]

        states = [
            TT.constant(numpy.float32([0])),
            TT.constant(numpy.float32([0]))
        ]
        rvals, _ = scan(ls_error,
                        states=states,
                        n_steps=n_steps,
                        name='ls_err_step',
                        mode=cpu_mode,
                        profile=options['profile'])
        ferr = rvals[1][0] / const(n_steps)
        self.compute_error = theano.function([ebdx],
                                             ferr,
                                             givens=dict(grad_inps),
                                             name='compute_err',
                                             mode=gpu_mode,
                                             on_unused_input='warn',
                                             profile=options['profile'])
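
The grad_step loop above accumulates per-chunk gradients inside a scan so that only `cbs` samples are processed at a time (via a states-based scan interface). The same accumulate-inside-scan idiom, stripped to a toy running sum with vanilla theano.scan; all names and shapes here are illustrative:

# Toy illustration of the accumulate-with-scan idiom (assumed, not from
# the original source): average the first n_steps entries of a vector.
import numpy
import theano
import theano.tensor as TT

xs = TT.dvector('xs')
n_steps = 4
result, _ = theano.scan(fn=lambda i, acc: acc + xs[i],
                        sequences=TT.arange(n_steps),
                        outputs_info=numpy.float64(0.0))
mean_chunk = result[-1] / n_steps       # divide once, after accumulation
f = theano.function([xs], mean_chunk)
print(f(numpy.arange(8.0)))             # (0 + 1 + 2 + 3) / 4 = 1.5
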
Example No. 10
def build_model(tparams, options):
    """ Builds the entire computational graph used for training
    """
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x1 = tensor.matrix('x1', dtype='int64')
    x1_mask = tensor.matrix('x1_mask', dtype='float32')
    x1_left_mask = tensor.tensor3('x1_left_mask', dtype='float32')
    x1_right_mask = tensor.tensor3('x1_right_mask', dtype='float32')
    x2 = tensor.matrix('x2', dtype='int64')
    x2_mask = tensor.matrix('x2_mask', dtype='float32')
    x2_left_mask = tensor.tensor3('x2_left_mask', dtype='float32')
    x2_right_mask = tensor.tensor3('x2_right_mask', dtype='float32')

    y = tensor.vector('y', dtype='int64')

    xr1_mask = x1_mask[::-1]
    xr2_mask = x2_mask[::-1]

    n_timesteps_x1 = x1.shape[0]
    n_timesteps_x2 = x2.shape[0]
    n_samples = x1.shape[1]

    # word embedding
    emb1 = tparams['Wemb'][x1.flatten()].reshape(
        [n_timesteps_x1, n_samples, options['dim_word']])
    if options['use_dropout']:
        emb1 = dropout_layer(emb1, use_noise, trng)

    emb2 = tparams['Wemb'][x2.flatten()].reshape(
        [n_timesteps_x2, n_samples, options['dim_word']])
    if options['use_dropout']:
        emb2 = dropout_layer(emb2, use_noise, trng)

    inputs1 = (emb1, x1_mask, x1_left_mask, x1_right_mask)
    inputs2 = (emb2, x2_mask, x2_left_mask, x2_right_mask)

    proj1 = get_layer(options['encoder'])[1](tparams,
                                             inputs1,
                                             options,
                                             prefix='encoder',
                                             mask=x1_mask)
    proj2 = get_layer(options['encoder'])[1](tparams,
                                             inputs2,
                                             options,
                                             prefix='encoder',
                                             mask=x2_mask)

    ctx1 = proj1[0][-1, :, :, :].dimshuffle(1, 0, 2)
    ctx2 = proj2[0][-1, :, :, :].dimshuffle(1, 0, 2)

    # ctx1: #step1 x #sample x #dimctx
    # ctx2: #step2 x #sample x #dimctx

    ctx1 = ctx1 * x1_mask[:, :, None]
    ctx2 = ctx2 * x2_mask[:, :, None]

    # weight_matrix: #sample x #step1 x #step2
    weight_matrix = tensor.batched_dot(ctx1.dimshuffle(1, 0, 2),
                                       ctx2.dimshuffle(1, 2, 0))
    weight_matrix_1 = tensor.exp(
        weight_matrix - weight_matrix.max(1, keepdims=True)).dimshuffle(
            1, 2, 0)
    weight_matrix_2 = tensor.exp(
        weight_matrix - weight_matrix.max(2, keepdims=True)).dimshuffle(
            1, 2, 0)

    # weight_matrix_1: #step1 x #step2 x #sample
    weight_matrix_1 = weight_matrix_1 * x1_mask[:, None, :]
    weight_matrix_2 = weight_matrix_2 * x2_mask[None, :, :]

    alpha = weight_matrix_1 / weight_matrix_1.sum(0, keepdims=True)
    beta = weight_matrix_2 / weight_matrix_2.sum(1, keepdims=True)

    #ctx1: #step1 x #sample x #dimctx
    #ctx2: #step2 x #sample x #dimctx
    ctx2_ = (ctx1.dimshuffle(0, 'x', 1, 2) *
             alpha.dimshuffle(0, 1, 2, 'x')).sum(0)
    ctx1_ = (ctx2.dimshuffle('x', 0, 1, 2) *
             beta.dimshuffle(0, 1, 2, 'x')).sum(1)

    inp1 = concatenate([ctx1, ctx1_, ctx1 * ctx1_, ctx1 - ctx1_], axis=2)
    inp2 = concatenate([ctx2, ctx2_, ctx2 * ctx2_, ctx2 - ctx2_], axis=2)

    inp1 = get_layer('ff')[1](tparams,
                              inp1,
                              options,
                              prefix='projection',
                              activ='relu')
    inp2 = get_layer('ff')[1](tparams,
                              inp2,
                              options,
                              prefix='projection',
                              activ='relu')

    inputs3 = (inp1, x1_mask, x1_left_mask, x1_right_mask)
    inputs4 = (inp2, x2_mask, x2_left_mask, x2_right_mask)

    proj3 = get_layer(options['decoder'])[1](tparams,
                                             inputs3,
                                             options,
                                             prefix='decoder',
                                             mask=x1_mask)
    proj4 = get_layer(options['decoder'])[1](tparams,
                                             inputs4,
                                             options,
                                             prefix='decoder',
                                             mask=x2_mask)

    logit0 = concatenate([proj3[0][-1, :, -1, :], proj4[0][-1, :, -1, :]],
                         axis=1)
    logit1 = (proj3[0][-1, :, :, :] *
              x1_mask.dimshuffle(1, 0, 'x')).sum(1) / x1_mask.sum(0)[:, None]
    logit2 = (proj3[0][-1, :, :, :] * x1_mask.dimshuffle(1, 0, 'x')).max(1)
    logit3 = (proj4[0][-1, :, :, :] *
              x2_mask.dimshuffle(1, 0, 'x')).sum(1) / x2_mask.sum(0)[:, None]
    logit4 = (proj4[0][-1, :, :, :] * x2_mask.dimshuffle(1, 0, 'x')).max(1)
    logit = concatenate([logit0, logit1, logit2, logit3, logit4], axis=1)

    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)

    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_layer_1',
                               activ='tanh')
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_layer_output',
                               activ='linear')
    probs = tensor.nnet.softmax(logit)
    cost = tensor.nnet.categorical_crossentropy(probs, y)

    f_pred = theano.function([
        x1, x1_mask, x1_left_mask, x1_right_mask, x2, x2_mask, x2_left_mask,
        x2_right_mask
    ],
                             probs.argmax(axis=1),
                             name='f_pred',
                             profile=profile)
    f_prods = theano.function([
        x1, x1_mask, x1_left_mask, x1_right_mask, x2, x2_mask, x2_left_mask,
        x2_right_mask
    ],
                              probs,
                              name='f_prods',
                              profile=profile)

    return trng, use_noise, x1, x1_mask, x1_left_mask, x1_right_mask, x2, x2_mask, x2_left_mask, x2_right_mask, y, opt_ret, cost, f_pred, f_prods
Example No. 11
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # T x batch_size
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=np.zeros((1, options['n_emb']),
                                              dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']], axis=0)
    input_emb = w_emb_extend[input_idx]

    # get the transformed image feature
    h_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    c_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))

    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    h_from_lstm, c_encode = lstm_layer(shared_params,
                                       input_emb,
                                       input_mask,
                                       h_0,
                                       c_0,
                                       options,
                                       prefix='sent_lstm')
    # pick the last one as encoder

    Y = fflayer(shared_params,
                image_feat,
                options,
                prefix='image_mlp',
                act_func=options.get('image_mlp_act', 'tanh'))
    r_0 = theano.shared(numpy.zeros((batch_size, n_dim), dtype='float32'))
    r = wbw_attention_layer(shared_params,
                            Y,
                            h_from_lstm,
                            input_mask,
                            r_0,
                            options,
                            return_final=True)

    h_star = T.tanh(
        T.dot(r, shared_params['W_p_w']) +
        T.dot(h_from_lstm[-1], shared_params['W_x_w']))
    combined_hidden = fflayer(shared_params,
                              h_star,
                              options,
                              prefix='scale_to_softmax',
                              act_func='linear')

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu
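
A hedged sketch of how the graph returned here might be wired into a training function. The plain-SGD update rule, the `lr` option, and treating every entry of shared_params as a trainable shared variable are assumptions for illustration, not part of the original source:

# Assumed usage sketch (not from the original source):
image_feat, input_idx, input_mask, label, dropout, cost, accu = \
    build_model(shared_params, options)
params = list(shared_params.values())        # Theano shared variables
grads = T.grad(cost, wrt=params)
lr = numpy.float32(options.get('lr', 0.01))  # 'lr' option is assumed
updates = [(p, p - lr * g) for p, g in zip(params, grads)]
f_train = theano.function([image_feat, input_idx, input_mask, label],
                          [cost, accu], updates=updates)
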
Example No. 12
def translate_model(queue, rqueue, pid, model, options, k, normalize, kp,
                    sigma):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    f_log_probs = theano.function(inps, cost)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _translate(idx, seq):
        all_samples = []
        all_scores = []

        for kidx in range(kp):
            if kidx == 0:
                ss = -1.
            else:
                ss = sigma
            # sample given an input sequence and obtain scores
            sample, score = gen_sample(tparams,
                                       f_init,
                                       f_next,
                                       numpy.array(seq).reshape([len(seq), 1]),
                                       options,
                                       trng=trng,
                                       k=k,
                                       maxlen=200,
                                       stochastic=False,
                                       argmax=False,
                                       sigma=ss)

            # normalize scores according to sequence lengths
            if normalize:
                lengths = numpy.array([len(s) for s in sample])
                score = score / lengths
            #print idx, score
            sidx = numpy.argmin(score)
            all_samples.append(sample[sidx])
            all_scores.append(score[sidx])

        source_list = [seq] * kp
        x, x_mask, y, y_mask = prepare_data(source_list,
                                            all_samples,
                                            maxlen=None)
        all_scores = f_log_probs(x, x_mask, y, y_mask)
        if normalize:
            lengths = numpy.array([len(s) for s in all_samples])
            all_scores = all_scores / lengths

        print(idx, all_scores)
        sidx = numpy.argmin(all_scores)
        return all_samples[sidx]

    while True:
        req = queue.get()
        if req is None:
            break

        idx, x = req[0], req[1]
        print(pid, '-', idx)
        seq = _translate(idx, x)

        rqueue.put((idx, seq))

    return
Example No. 13
def evaluate_gpu(gru,
                 test_data,
                 items=None,
                 session_key='SessionId',
                 item_key='ItemId',
                 time_key='Time',
                 cut_off=20,
                 batch_size=100,
                 mode='conservative',
                 output_path=None):
    if gru.error_during_train: raise Exception
    print('Measuring Recall@{} and MRR@{}'.format(cut_off, cut_off))
    srng = RandomStreams()
    X = T.ivector()
    Y = T.ivector()
    M = T.iscalar()
    C = []
    yhat, H, updatesH = gru.symbolic_predict(X, Y, M, items, batch_size)
    if mode == 'tiebreaking': yhat += srng.uniform(size=yhat.shape) * 1e-10
    if items is None:
        targets = T.diag(yhat.T[Y])
        others = yhat.T
    else:
        targets = T.diag(yhat.T[:M])
        others = yhat.T[M:]
    if mode == 'standard':
        ranks = (others > targets).sum(axis=0) + 1
    elif mode == 'conservative':
        ranks = (others >= targets).sum(axis=0)
    elif mode == 'median':
        ranks = (others > targets).sum(axis=0) + 0.5 * (
            (others == targets).sum(axis=0) - 1) + 1
    elif mode == 'tiebreaking':
        ranks = (others > targets).sum(axis=0) + 1
    else:
        raise NotImplementedError
    REC = (ranks <= cut_off).sum()
    MRR = ((ranks <= cut_off) / ranks).sum()
    evaluate = theano.function(inputs=[X, Y, M] + C,
                               outputs=[REC, MRR, yhat],
                               updates=updatesH,
                               allow_input_downcast=True,
                               on_unused_input='ignore')
    test_data = pd.merge(test_data,
                         pd.DataFrame({
                             'ItemIdx': gru.itemidmap.values,
                             item_key: gru.itemidmap.index
                         }),
                         on=item_key,
                         how='inner')
    test_data.sort_values([session_key, time_key, item_key], inplace=True)
    test_data_items = test_data.ItemIdx.values
    if items is not None:
        item_idxs = gru.itemidmap[items]
    recall, mrr, n = 0, 0, 0
    iters = np.arange(batch_size)
    maxiter = iters.max()
    session_lengths = test_data.groupby(session_key).size()
    items_session_lengths = np.array([
        session_length for session_length in session_lengths
        for _ in range(session_length)
    ])
    items_session_ids = np.array([
        i for i, session_length in enumerate(session_lengths)
        for _ in range(session_length)
    ])

    items_pos = np.array([
        i for session_length in session_lengths for i in range(session_length)
    ])

    offset_sessions = np.zeros(test_data[session_key].nunique() + 1,
                               dtype=np.int32)
    offset_sessions[1:] = test_data.groupby(session_key).size().cumsum()
    start = offset_sessions[iters]
    end = offset_sessions[iters + 1]
    finished = False
    cidxs = []
    lim_preds = 500
    headers = ['seq_id', 'length', 'event_id', 'event_in', 'event_out']
    headers += ['top_pred_idx_' + str(i) for i in range(lim_preds)]
    headers += ['top_pred_' + str(i) for i in range(lim_preds)]
    header = ';'.join(headers)
    lines = [header]
    while not finished:
        minlen = (end - start).min()
        out_idx = test_data_items[start]
        for i in range(minlen - 1):
            lengths = items_session_lengths[start + i]
            positions = items_pos[start + i]
            session_ids = items_session_ids[start + i]
            in_idx = out_idx
            out_idx = test_data_items[start + i + 1]
            if items is not None:
                y = np.hstack([out_idx, item_idxs])
            else:
                y = out_idx
            rec, m, preds = evaluate(in_idx, y, len(iters), *cidxs)
            for seq_id, in_item, out_item, pred, length, position in zip(
                    session_ids, in_idx, y, preds, lengths, positions):
                top_k_idx = pred.argsort()[-lim_preds:][::-1]
                top_k_values = pred[top_k_idx]
                lines.append(';'.join([
                    str(int(seq_id)),
                    str(int(length - 1)),
                    str(int(position)),
                    str(int(in_item)),
                    str(int(out_item))
                ] + [str(int(idx)) for idx in top_k_idx] +
                    [str(score) for score in top_k_values]))
            recall += rec
            mrr += m
            n += len(iters)
        start = start + minlen - 1
        finished_mask = (end - start <= 1)
        n_finished = finished_mask.sum()
        iters[finished_mask] = maxiter + np.arange(1, n_finished + 1)
        maxiter += n_finished
        valid_mask = (iters < len(offset_sessions) - 1)
        n_valid = valid_mask.sum()
        if n_valid == 0:
            finished = True
            break
        mask = finished_mask & valid_mask
        sessions = iters[mask]
        start[mask] = offset_sessions[sessions]
        end[mask] = offset_sessions[sessions + 1]
        iters = iters[valid_mask]
        start = start[valid_mask]
        end = end[valid_mask]
        if valid_mask.any():
            for i in range(len(H)):
                tmp = H[i].get_value(borrow=True)
                tmp[mask] = 0
                tmp = tmp[valid_mask]
                H[i].set_value(tmp, borrow=True)

    if output_path is not None:
        with open(output_path, 'w') as out_file:
            out_file.write("\n".join(lines))
    return recall / n, mrr / n
Example #14
0
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)
    datasets = [source_file.name, target_file.name]
    dictionaries = [
        options[0]['dictionaries'][0], options[0]['dictionaries'][-1]
    ]
    n_words = [options[0]['n_words'][0], options[0]['n_words'][-1]]

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, model in enumerate(models):
            f_log_probs = load_scorer(model,
                                      options[i],
                                      alignweights=alignweights)
            score, alignment = pred_probs(
                f_log_probs,
                prepare_data,
                options[i],
                pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        datasets,
        dictionaries,
        n_words_dicts=n_words,
        batch_size=b,
        maxlen=float('inf'),
        factors=options[0]['factors'],
        outputs=1,
        sort_by_length=False
    )  #TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    # optionally save the attention weights
    if alignweights:
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                align_OUT.write(line + "\n")
            # combine the actual source and target words
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
Example #15
0
 def __init__(self):
     theano.config.floatX = "float32"
     self.srng = RandomStreams()
     self.X = T.ftensor4()
     self.Y = T.fmatrix()
Example #16
0
 def __init__(self, t=0.1, eps=1e-20):
     assert t != 0
     self.temperature = t
     self.eps = eps
     self._srng = RandomStreams(get_rng().randint(1, 2147462579))
Example #17
0
import numpy as np
import theano
import theano.tensor as T
import ipdb
import cPickle

from keras.preprocessing import sequence
from keras import activations, initializations
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense
from keras.utils.theano_utils import shared_scalar, shared_zeros, sharedX, alloc_zeros_matrix

from theano import config
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

trng = RandomStreams(1234)
def dropout(X):
    # `train` is a module-level flag set elsewhere; dropout is applied
    # only at training time
    if train:
        X *= trng.binomial(X.shape, p=0.5, dtype=theano.config.floatX)
        X /= 0.5

    return X
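
# For reference: dividing by the keep probability (0.5 above) makes this
# "inverted" dropout, so activations keep the same expected value and no
# rescaling is needed at test time. A minimal numpy sketch of the same idea
# (the function below is illustrative, not part of the original code):
def inverted_dropout_np(x, p_keep=0.5, rng=np.random):
    # zero each unit with probability 1 - p_keep, then scale up the survivors
    mask = rng.binomial(1, p_keep, size=x.shape)
    return x * mask / p_keep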

def ortho_weight(ndim):

    W = np.random.randn(ndim, ndim)
    u, _, _ = np.linalg.svd(W)
    return u.astype('float32')
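
# quick check (illustrative): the left singular vectors of a square Gaussian
# matrix form an orthonormal basis, so the result times its transpose is I
_w = ortho_weight(4)
assert np.allclose(_w.dot(_w.T), np.eye(4), atol=1e-5)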


############# Building Models ################
Example #18
0
def get_gate_weights(model_name,
                     dictionary,
                     dictionary_target,
                     source_file,
                     args,
                     k=5,
                     normalize=False,
                     chr_level=False):
    options = load_options(model_name)

    word_dict, word_idict, word_idict_trg = load_translate_data(
        dictionary,
        dictionary_target,
        source_file,
        batch_mode=False,
        chr_level=chr_level,
        load_input=False)

    inputs = []
    lines = []

    print 'Loading input...',
    with open(source_file, 'r') as f:
        for idx, line in enumerate(f):
            if idx >= args.test_number:
                break

            lines.append(line)
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = [word_dict[w] if w in word_dict else 1 for w in words]
            x = [ii if ii < options['n_words_src'] else 1 for ii in x]
            x.append(0)
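            # out-of-vocabulary words and indices beyond n_words_src both map
            # to index 1 (conventionally UNK); the appended 0 marks end-of-sentence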

            inputs.append(x)
    print 'Done'

    print 'Building model...',
    model, _ = build_and_init_model(model_name, options, build=False)
    print 'Done'

    if args.encoder:
        return get_encoder_gate_weights(args, model, options, inputs, lines)

    print 'Building sampler...'
    trng = RandomStreams(1234)
    use_noise = theano.shared(np.float32(0.))
    f_init, f_next = model.build_sampler(
        trng=trng,
        use_noise=use_noise,
        batch_mode=False,
        get_gates=True,
    )
    build_result = model, f_init, f_next, trng
    print 'Done'

    results = []

    for i, src_seq in enumerate(inputs):
        results.append({
            'index': i,
            'input': lines[i].strip(),
            'dim': options['dim'],
            'encoder': False,
        })

        tgt_seq, kw_ret = translate_sentence(src_seq, build_result, k,
                                             normalize)

        results[-1]['output'] = seq2words(tgt_seq, word_idict_trg)
        results[-1]['kw_ret'] = kw_ret
        results[-1]['n_layers'] = len(kw_ret['input_gates_list'][0])

        print 'Input:', lines[i]
        print 'Output:', results[-1]['output']
        print '=============================='

    return results
Example #19
0
import numpy
try:
    import pylab
except ImportError:
    print(
        "pylab isn't available. If you use its functionality, it will crash.")
    print("It can be installed with 'pip install -q Pillow'")

from midi.utils import midiread, midiwrite
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# Don't use a Python long, as it doesn't work on 32-bit machines.
numpy.random.seed(0xbeef)
rng = RandomStreams(seed=numpy.random.randint(1 << 30))
theano.config.warn.subtensor_merge_bug = False


def build_rbm(v, W, bv, bh, k):
    '''Construct a k-step Gibbs chain starting at v for an RBM.

    v : Theano vector or matrix
        If a matrix, multiple chains will be run in parallel (batch).
    W : Theano matrix
        Weight matrix of the RBM.
    bv : Theano vector
        Visible bias vector of the RBM.
    bh : Theano vector
        Hidden bias vector of the RBM.
    k : scalar or Theano scalar
        Length of the Gibbs chain.
    '''
Example #20
0
def build_model(shared_params, options):
    trng = RandomStreams(1234)
    drop_ratio = options['drop_ratio']
    batch_size = options['batch_size']
    n_dim = options['n_dim']

    w_emb = shared_params['w_emb']

    dropout = theano.shared(numpy.float32(0.))
    image_feat = T.ftensor3('image_feat')
    # batch_size x T
    input_idx = T.imatrix('input_idx')
    input_mask = T.matrix('input_mask')
    # label is the TRUE label
    label = T.ivector('label')

    empty_word = theano.shared(value=numpy.zeros((1, options['n_emb']),
                                                 dtype='float32'),
                               name='empty_word')
    w_emb_extend = T.concatenate([empty_word, shared_params['w_emb']],
                                 axis=0)
    input_emb = w_emb_extend[input_idx]
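    # index 0 now selects the all-zero "empty word" row, so padded positions
    # contribute nothing once combined with the mask below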

    # a trick here, set the maxpool_h/w to be large
    # maxpool_shape = (options['maxpool_h'], options['maxpool_w'])

    # turn those appending words into zeros
    # batch_size x T x n_emb
    input_emb = input_emb * input_mask[:, :, None]
    if options['sent_drop']:
        input_emb = dropout_layer(input_emb, dropout, trng, drop_ratio)

    if options['use_unigram_conv']:
        unigram_conv_feat = fflayer(shared_params, input_emb, options,
                                    prefix='conv_unigram',
                                    act_func=options.get('sent_conv_act', 'tanh'))
        unigram_pool_feat = unigram_conv_feat.max(axis=1)
    if options['use_bigram_conv']:
        idx = T.concatenate([T.arange(input_emb.shape[1])[:-1],
                             T.arange(input_emb.shape[1])[1:]]).reshape((2, input_emb.shape[1] - 1)).transpose().flatten()
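        # e.g. for sequence length 4, idx = [0, 1, 1, 2, 2, 3]; the reshape
        # below pairs each position with its right neighbour: (0,1), (1,2), (2,3)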
        bigram_emb = T.reshape(input_emb[:, idx, :], (input_emb.shape[0],
                                                      input_emb.shape[1] - 1,
                                                      2 * input_emb.shape[2]))
        bigram_conv_feat = fflayer(shared_params, bigram_emb,
                                   options, prefix='conv_bigram',
                                   act_func=options.get('sent_conv_act', 'tanh'))
        bigram_pool_feat = bigram_conv_feat.max(axis=1)
    if options['use_trigram_conv']:
        idx = T.concatenate([T.arange(input_emb.shape[1])[:-2],
                             T.arange(input_emb.shape[1])[1:-1],
                             T.arange(input_emb.shape[1])[2:]]).reshape((3, input_emb.shape[1] - 2)).transpose().flatten()
        trigram_emb = T.reshape(input_emb[:, idx, :], (input_emb.shape[0],
                                                      input_emb.shape[1] - 2,
                                                      3 * input_emb.shape[2]))
        trigram_conv_feat = fflayer(shared_params, trigram_emb,
                                    options, prefix='conv_trigram',
                                    act_func=options.get('sent_conv_act', 'tanh'))
        trigram_pool_feat = trigram_conv_feat.max(axis=1)

    pool_feat = T.concatenate([unigram_pool_feat,
                               bigram_pool_feat,
                               trigram_pool_feat], axis=1)

    image_feat_down = fflayer(shared_params, image_feat, options,
                              prefix='image_mlp',
                              act_func=options.get('image_mlp_act',
                                                   'tanh'))
    if options.get('use_before_attention_drop', False):
        image_feat_down = dropout_layer(image_feat_down, dropout, trng, drop_ratio)
        pool_feat = dropout_layer(pool_feat, dropout, trng, drop_ratio)

    # attention model begins here
    # first layer attention model
    image_feat_attention_1 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_1',
                                     act_func=options.get('image_att_mlp_act',
                                                          'tanh'))
    pool_feat_attention_1 = fflayer(shared_params, pool_feat, options,
                                    prefix='sent_att_mlp_1',
                                    act_func=options.get('sent_att_mlp_act',
                                                         'tanh'))
    combined_feat_attention_1 = image_feat_attention_1 + \
                                pool_feat_attention_1[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_1 = dropout_layer(combined_feat_attention_1,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_1 = fflayer(shared_params,
                                        combined_feat_attention_1, options,
                                        prefix='combined_att_mlp_1',
                                        act_func=options.get(
                                            'combined_att_mlp_act',
                                            'tanh'))
    prob_attention_1 = T.nnet.softmax(combined_feat_attention_1[:, :, 0])

    image_feat_ave_1 = (prob_attention_1[:, :, None] * image_feat_down).sum(axis=1)

    combined_hidden_1 = image_feat_ave_1 + pool_feat
    # second layer attention model

    image_feat_attention_2 = fflayer(shared_params, image_feat_down, options,
                                     prefix='image_att_mlp_2',
                                     act_func=options.get('image_att_mlp_act',
                                                          'tanh'))
    pool_feat_attention_2 = fflayer(shared_params, combined_hidden_1, options,
                                    prefix='sent_att_mlp_2',
                                    act_func=options.get('sent_att_mlp_act',
                                                         'tanh'))
    combined_feat_attention_2 = image_feat_attention_2 + \
                                pool_feat_attention_2[:, None, :]
    if options['use_attention_drop']:
        combined_feat_attention_2 = dropout_layer(combined_feat_attention_2,
                                                  dropout, trng, drop_ratio)

    combined_feat_attention_2 = fflayer(shared_params,
                                        combined_feat_attention_2, options,
                                        prefix='combined_att_mlp_2',
                                        act_func=options.get(
                                            'combined_att_mlp_act', 'tanh'))
    prob_attention_2 = T.nnet.softmax(combined_feat_attention_2[:, :, 0])

    image_feat_ave_2 = (prob_attention_2[:, :, None] * image_feat_down).sum(axis=1)

    if options.get('use_final_image_feat_only', False):
        combined_hidden = image_feat_ave_2 + pool_feat
    else:
        combined_hidden = image_feat_ave_2 + combined_hidden_1


    for i in range(options['combined_num_mlp']):
        if options.get('combined_mlp_drop_%d'%(i), False):
            combined_hidden = dropout_layer(combined_hidden, dropout, trng,
                                            drop_ratio)
        if i == options['combined_num_mlp'] - 1:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func='linear')
        else:
            combined_hidden = fflayer(shared_params, combined_hidden, options,
                                      prefix='combined_mlp_%d'%(i),
                                      act_func=options.get('combined_mlp_act_%d'%(i),
                                                           'tanh'))

    # drop the image output
    prob = T.nnet.softmax(combined_hidden)
    prob_y = prob[T.arange(prob.shape[0]), label]
    pred_label = T.argmax(prob, axis=1)
    # sum or mean?
    cost = -T.mean(T.log(prob_y))
    accu = T.mean(T.eq(pred_label, label))

    # return image_feat, input_idx, input_mask, \
        # label, dropout, cost, accu
    return image_feat, input_idx, input_mask, \
        label, dropout, cost, accu, pred_label, \
        prob_attention_1, prob_attention_2
Example #21
0
 def __init__(self, mu, logsigma, rng=None, **kwargs):
     self.rng = rng if rng else RandomStreams(
         lasagne.random.get_rng().randint(1, 2147462579))
     super(GaussianSampleLayer, self).__init__([mu, logsigma], **kwargs)
Example #22
0
# -*- coding: utf-8 -*-

import theano
from theano import tensor as T
from theano import function
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
import pandas as pd
from theano import pp
from collections import OrderedDict
srng = RandomStreams()


class GRU4Rec:
    '''
    GRU4Rec(loss, final_act, hidden_act, layers,
                 n_epochs=10, batch_size=50, dropout_p_hidden=0.5, dropout_p_embed=0.0, learning_rate=0.05, momentum=0.0, lmbd=0.0, embedding=0, n_sample=0, sample_alpha=0.75, smoothing=0,
                 adapt='adagrad', decay=0.9, grad_cap=0,
                 sigma=0, init_as_normal=False, reset_after_session=True, train_random_order=False, time_sort=True,
                 session_key='SessionId', item_key='ItemId', time_key='Time')
    Initializes the network.

    Parameters
    -----------
    loss : 'top1', 'bpr', 'cross-entropy', 'xe_logit', top1-max, bpr-max-<X>
        selects the loss function, <X> is the parameter of the loss
    final_act : 'softmax', 'linear', 'relu', 'tanh', 'softmax_logit', 'leaky-<X>', elu-<X>
        selects the activation function of the final layer, <X> is the parameter of the activation function
    hidden_act : 'tanh', 'relu' or 'linear'
        selects the activation function on the hidden states
    layers : 1D array
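# A rough usage sketch (not part of the original snippet), assuming the
# standard GRU4Rec interface with a fit(data) entry point and a pandas
# DataFrame of SessionId/ItemId/Time events:
#
#     train_df = pd.read_csv('train_sessions.csv')
#     gru = GRU4Rec(loss='bpr-max-0.5', final_act='linear', hidden_act='tanh',
#                   layers=[100], batch_size=50)
#     gru.fit(train_df)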
Example #23
0
 def __init__(self, p):
     super(GaussainDropout, self).__init__()
     self.p = p
     self.srng = RandomStreams(seed=np.random.randint(10e6))
Example #24
0
def search_model_adam(state, channel, reload_model=False):

    pp.pprint(state)

    def get_inps(vgen=None, debug=False, output_map=None):
        X, y = TT.matrix("X", dtype="uint32"), TT.vector("y", dtype="uint8")
        mask = TT.matrix("mask", dtype="float32")

        if debug:
            theano.config.compute_test_value = "warn"
            batch = next(vgen)
            X.tag.test_value = batch[0].reshape((batch[0].shape[0], -1))
            y.tag.test_value = batch[2].flatten()
            mask.tag.test_value = batch[1].reshape((batch[1].shape[0], -1))
        return [X, y, mask]

    lr = state['lr']
    batch_size = state['batch_size']

    # No of els in the cols of the content for the memory
    mem_size = state['mem_size']

    # No of rows in M
    mem_nel = state['mem_nel']
    std = state['std']

    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', False)

    inp_size = 41300

    # No of hids for controller
    n_hids = state['n_hids']

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state['use_ff_controller']

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)
    permute_order = state.get('permute_order', False)

    use_layer_norm = state.get('use_layer_norm', False)
    recurrent_dropout_prob = state.get("recurrent_dropout_prob", -1)

    seed = 7
    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    #size of the address in the memory:
    address_size = state["address_size"]
    w2v_embed_scale = 0.05
    n_out = 3
    learn_embeds = state.get('learn_embeds', False)
    glove_emb_path = state.get('glove_emb_path', None)

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)
    # rebinding under a new name: assigning the lambda back to `NRect` would
    # shadow the wrapped activation and make the call recurse forever
    nrect = lambda x, use_noise=False: NRect(
        x, rng=trng, use_noise=use_noise, std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)

    mode = state.get('theano_function_mode', None)
    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']

    use_reinforce = state.get('use_reinforce', False)
    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)
    data_path = state.get('data_path', None)
    idxs = None

    use_batch_norm = state.get("use_batch_norm", False)
    anticorr = state.get('anticorr', None)
    prfx = (
        "ntm_on_fb_copy_task_all_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f_use_bn_%(use_batch_norm)d_hard2"
    ) % locals()

    random_flip_order = False
    train_datagen = SNLI(batch_size=batch_size,
                         random_flip_order=random_flip_order,
                         datapath=data_path,
                         mode="train")

    valid_datagen = SNLI(batch_size=batch_size,
                         random_flip_order=random_flip_order,
                         datapath=data_path,
                         mode="valid")

    test_datagen = SNLI(batch_size=batch_size,
                        random_flip_order=random_flip_order,
                        datapath=data_path,
                        mode="test")
    n_layers = state.get('n_layers', 1)

    inps = get_inps(vgen=valid_datagen, debug=debug, output_map=True)

    max_len = inps[0].shape[0]

    wi = WeightInitializer(sparsity=-1,
                           scale=std,
                           rng=rng,
                           init_method=InitMethods.Adaptive,
                           center=0.0)
    bi = BiasInitializer(sparsity=-1,
                         scale=1e-3,
                         rng=rng,
                         init_method=BiasInitMethods.Random,
                         center=0.0)

    ntm = NTMModel(n_in=inp_size,
                   n_hids=n_hids,
                   bow_size=bow_size,
                   n_out=n_out,
                   predict_bow_out=bowout,
                   mem_size=mem_size,
                   mem_nel=mem_nel,
                   use_ff_controller=use_ff_controller,
                   sub_mb_size=sub_mb_size,
                   deep_out_size=deep_out_size,
                   inps=inps,
                   n_layers=n_layers,
                   hybrid_att=hybrid_att,
                   smoothed_diff_weights=smoothed_diff_weights,
                   baseline_reg=base_reg,
                   w2v_embed_path=w2v_embed_path,
                   renormalization_scale=renormalization_scale,
                   use_batch_norm=use_batch_norm,
                   w2v_embed_scale=w2v_embed_scale,
                   n_read_heads=n_read_heads,
                   n_write_heads=n_write_heads,
                   use_last_hidden_state=True,
                   use_loc_based_addressing=use_loc_based_addressing,
                   use_simple_rnn_inp_rep=False,
                   use_gru_inp_rep=use_gru_inp,
                   use_bow_input=use_bow_inp,
                   use_layer_norm=use_layer_norm,
                   recurrent_dropout_prob=recurrent_dropout_prob,
                   use_inp_content=False,
                   use_mask=True,
                   anticorr=anticorr,
                   glove_embed_path=glove_emb_path,
                   learn_embeds=learn_embeds,
                   erase_activ=erase_activ,
                   use_gate_quad_interactions=use_quad_interactions,
                   content_activ=content_activ,
                   use_multiscale_shifts=True,
                   correlation_ws=correlation_ws,
                   learning_rule=learning_rule,
                   lambda1_rein=lambda1_rein,
                   lambda2_rein=lambda2_rein,
                   n_reading_steps=n_reading_steps,
                   use_deepout=False,
                   use_reinforce=use_reinforce,
                   use_nogru_mem2q=use_nogru_mem2q,
                   use_reinforce_baseline=use_reinforce_baseline,
                   controller_activ=cont_act,
                   use_adv_indexing=False,
                   use_out_mem=False,
                   unroll_recurrence=False,
                   address_size=address_size,
                   reinforce_decay=0.9,
                   learn_h0=learn_h0,
                   theano_function_mode=mode,
                   l1_pen=l1_pen,
                   debug=debug,
                   mem_gater_activ=mem_gater_activ,
                   tie_read_write_gates=False,
                   weight_initializer=wi,
                   bias_initializer=bi,
                   use_cost_mask=False,
                   use_noise=use_noise,
                   rnd_indxs=idxs,
                   permute_order=permute_order,
                   max_fact_len=max_len,
                   softmax=True,
                   batch_size=None)

    save_freq = state.get("save_freq", 1000)
    main_loop = SNLIMainLoop(ntm,
                             print_every=50,
                             checkpoint_every=save_freq,
                             validate_every=500,
                             train_data_gen=train_datagen,
                             valid_data_gen=valid_datagen,
                             test_data_gen=test_datagen,
                             learning_rate=lr,
                             reload_model=reload_model,
                             num_epochs=250,
                             state=state,
                             prefix=prfx)

    main_loop.run()
Example #25
0
hidden_obs = model.inference_procedure.infer(sharedX(init_examples))

from theano import function
outputs = [hidden_obs['H_hat']]
for G_hat in hidden_obs['G_hat']:
    outputs.append(G_hat)
init_chain_hid = function([], outputs)()

model.dbm.V_chains = sharedX(init_chain_hid[0])
model.dbm.H_chains = [
    sharedX(init_chain_elem) for init_chain_elem in init_chain_hid[1:]
]

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
theano_rng = RandomStreams(42)
assert hasattr(model.dbm, 'V_chains') and model.dbm.V_chains is not None
design_examples_var = model.s3c.random_design_matrix(
    batch_size=rows * cols, theano_rng=theano_rng, H_sample=model.dbm.V_chains)
print 'compiling sampling function'
f = function([], design_examples_var, updates=model.dbm.get_sampling_updates())

print 'init_examples later', (init_examples.min(), init_examples.mean(),
                              init_examples.max())
examples = dataset.get_topological_view(init_examples)
print 'examples ', (examples.min(), examples.mean(), examples.max())
assert len(examples.shape) == 4
is_color = examples.shape[3] == 3
pv = patch_viewer.PatchViewer((rows, cols),
                              examples.shape[1:3],
                              is_color=is_color)
Example #26
0
def test_dA(learning_rate=0.1,
            training_epochs=15,
            dataset='mnist.pkl.gz',
            batch_size=20,
            output_folder='dA_plots'):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the Denoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset

    """
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # start-snippet-2
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    # end-snippet-2

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=28 * 28,
            n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = timeit.default_timer()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in range(training_epochs):
        # go through the training set
        c = []
        for batch_index in range(n_train_batches):
            c.append(train_da(batch_index))

        print('Training epoch %d, cost ' % epoch, numpy.mean(c,
                                                             dtype='float64'))

    end_time = timeit.default_timer()

    training_time = (end_time - start_time)

    print(('The no corruption code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((training_time) / 60.)),
          file=sys.stderr)
    image = Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    # start-snippet-3
    #####################################
    # BUILDING THE MODEL CORRUPTION 30% #
    #####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    da = dA(numpy_rng=rng,
            theano_rng=theano_rng,
            input=x,
            n_visible=28 * 28,
            n_hidden=500)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)

    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={x: train_set_x[index * batch_size:(index + 1) * batch_size]})

    start_time = timeit.default_timer()

    ############
    # TRAINING #
    ############

    # go through training epochs
    for epoch in range(training_epochs):
        # go through the training set
        c = []
        for batch_index in range(n_train_batches):
            c.append(train_da(batch_index))

        print('Training epoch %d, cost ' % epoch, numpy.mean(c,
                                                             dtype='float64'))

    end_time = timeit.default_timer()

    training_time = (end_time - start_time)

    print(('The 30% corruption code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % (training_time / 60.)),
          file=sys.stderr)
    # end-snippet-3

    # start-snippet-4
    image = Image.fromarray(
        tile_raster_images(X=da.W.get_value(borrow=True).T,
                           img_shape=(28, 28),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_30.png')
    # end-snippet-4

    os.chdir('../')
Example #27
0
def build_model(tparams, options):
    # MIKE: why is this not a shared variable as in
    # trng = theano.tensor.shared_randomstreams.RandomStreams(1234)
    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    xt = tensor.matrix('xt', dtype=config.floatX)
    y = tensor.matrix('y', dtype='int64')
    yt = tensor.matrix('yt', dtype=config.floatX)

    n_timesteps = x.shape[0]
    n_examples = x.shape[1]

    if (options['arch_remap_input']):
        emb = tparams['Wemb'][x.flatten()].reshape(
            [n_timesteps, n_examples, options['n_hid']])
    else:
        Wemb = theano.shared(numpy.concatenate(
            (numpy.zeros((1, options['n_hid']), dtype=config.floatX),
             numpy.identity(options['n_hid'], dtype=config.floatX)),
            axis=0),
                             name='Wemb')
        emb = Wemb[x.flatten()].reshape(
            [n_timesteps, n_examples, options['n_hid']])

    # this is the call to either lstm_layer or hpm_layer
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            xt,
                                            yt,
                                            options,
                                            prefix=options['encoder'],
                                            mask=mask)

    # proj has dim n_timesteps X n_examples X n_hid
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    def _step(proj_step):
        if (options['arch_output_fn'] == 'softmax'):
            pred_prob_step = tensor.nnet.softmax(
                tensor.dot(proj_step, tparams['U']) + tparams['b'])
        elif (options['arch_output_fn'] == 'logistic'):
            pred_prob_step = tensor.nnet.sigmoid(
                tensor.dot(proj_step, tparams['U']) + tparams['b'])
        else:  # '1-1'
            pred_prob_step = proj_step / tensor.sum(
                proj_step, axis=1, keepdims=True)
            pred_prob_step = tensor.concatenate(
                [tensor.alloc(0, n_examples, 1), pred_prob_step], axis=1)
        return pred_prob_step
        # pred_prob_step should have dim n_examples X n_outputs
        # pred_prob has dim n_timesteps x n_examples x n_outputs
        # pred_step has have dim n_examples

    pred_prob, updates = theano.scan(_step,
                                     sequences=proj,
                                     outputs_info=None,
                                     non_sequences=None,
                                     n_steps=n_timesteps)

    # tgt_prob_step should have dim n_examples
    def _cost_step_norm(pred_prob_step, y_step):
        tgt_prob_step = tensor.switch(
            tensor.eq(y_step, 0), 1.0,
            pred_prob_step[tensor.arange(n_examples), y_step] /
            (1.0 - pred_prob_step[tensor.arange(n_examples), 0]))

        # need to add 1 to pass by index 0 which we removed in computing max
        pred_ix_step = tensor.argmax(pred_prob_step[:, 1:], axis=1) + 1
        if (options['type_token_sim']):  # DEBUG
            corr_step = tensor.switch(
                tensor.eq(y_step, 0), 0,
                tensor.switch(
                    tensor.eq((y_step - 1) // 5, (pred_ix_step - 1) // 5), 1,
                    -1))
        else:
            corr_step = tensor.switch(
                tensor.eq(y_step, 0), 0,
                tensor.switch(tensor.eq(y_step, pred_ix_step), 1, -1))
        return tgt_prob_step, corr_step

    # cost function for predicting target value of a specific event
    # tgt_prob_step should have dim n_examples
    def _cost_step_tgt(pred_prob_step, y_step):
        tgt_prob_step = tensor.switch(
            tensor.eq(y_step, 0), 1.0,
            tensor.switch(
                tensor.gt(y_step, 0), pred_prob_step[tensor.arange(n_examples),
                                                     y_step],
                1.0 - pred_prob_step[tensor.arange(n_examples), -y_step]))
        corr_step = tensor.switch(
            tensor.eq(y_step, 0), 0,
            tensor.switch(tensor.gt(tgt_prob_step, 0.5), 1, -1))
        return tgt_prob_step, corr_step

    if (options['signed_out']):
        cost_fn = _cost_step_tgt
    else:
        cost_fn = _cost_step_norm

    (tgt_prob, corr), updates = theano.scan(cost_fn,
                                            sequences=[pred_prob, y],
                                            outputs_info=None,
                                            non_sequences=None,
                                            n_steps=n_timesteps)

    off = 1e-8
    if tgt_prob.dtype == 'float16':
        off = 1e-6
    # tgt_prob: probability correct (dimensions n_timesteps X n_examples)
    cost = -tensor.sum(tensor.log(tgt_prob.clip(off, 1.0)))
    # Note: not dividing by count because it will reweight minibatch by size
    #         / tensor.sum(tensor.gt(y,0))

    return use_noise, x, xt, y, yt, mask, pred_prob, corr, cost
Example #28
0
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 input=None,
                 n_visible=784,
                 n_hidden=500,
                 W=None,
                 bhid=None,
                 bvis=None):
        """
        Initialize the dA class by specifying the number of visible units (the
        dimension d of the input), the number of hidden units (the dimension
        d' of the latent or hidden space) and the corruption level. The
        constructor also receives symbolic variables for the input, weights and
        bias. Such symbolic variables are useful when, for example, the input
        is the result of some computation, or when weights are shared between
        the dA and an MLP layer. When dealing with SdAs this always happens:
        the dA on layer 2 gets as input the output of the dA on layer 1,
        and the weights of the dA are used in the second stage of training
        to construct an MLP.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to generate weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                     generated based on a seed drawn from `numpy_rng`

        :type input: theano.tensor.TensorType
        :param input: a symbolic description of the input or None for
                      standalone dA

        :type n_visible: int
        :param n_visible: number of visible units

        :type n_hidden: int
        :param n_hidden:  number of hidden units

        :type W: theano.tensor.TensorType
        :param W: Theano variable pointing to a set of weights that should be
                  shared between the dA and another architecture; if dA should
                  be standalone set this to None

        :type bhid: theano.tensor.TensorType
        :param bhid: Theano variable pointing to a set of bias values (for
                     hidden units) that should be shared between the dA and
                     another architecture; if dA should be standalone set this to None

        :type bvis: theano.tensor.TensorType
        :param bvis: Theano variable pointing to a set of bias values (for
                     visible units) that should be shared between the dA and
                     another architecture; if dA should be standalone set this to None


        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        # create a Theano random generator that gives symbolic random values
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # note : W' was written as `W_prime` and b' as `b_prime`
        if not W:
            # W is initialized with `initial_W`, which is uniformly sampled
            # from -4*sqrt(6./(n_visible+n_hidden)) and
            # 4*sqrt(6./(n_hidden+n_visible)); the output of uniform is
            # converted using asarray to dtype theano.config.floatX so that
            # the code is runnable on GPU
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)),
                                      dtype=theano.config.floatX)
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if not bvis:
            bvis = theano.shared(value=numpy.zeros(n_visible,
                                                   dtype=theano.config.floatX),
                                 borrow=True)

        if not bhid:
            bhid = theano.shared(value=numpy.zeros(n_hidden,
                                                   dtype=theano.config.floatX),
                                 name='b',
                                 borrow=True)

        self.W = W
        # b corresponds to the bias of the hidden
        self.b = bhid
        # b_prime corresponds to the bias of the visible
        self.b_prime = bvis
        # tied weights, therefore W_prime is W transpose
        self.W_prime = self.W.T
        self.theano_rng = theano_rng
        # if no input is given, generate a variable representing the input
        if input is None:
            # we use a matrix because we expect a minibatch of several
            # examples, each example being a row
            self.x = T.dmatrix(name='input')
        else:
            self.x = input

        self.params = [self.W, self.b, self.b_prime]
Example #29
0
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    # for the backward rnn, we just need to invert x and x_mask
    xr = x[::-1]
    xr_mask = x_mask[::-1]

    n_timesteps = x.shape[0]
    n_timesteps_trg = y.shape[0]
    n_samples = x.shape[1]

    # word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder',
                                            mask=x_mask)
    # word embedding for backward rnn (source)
    embr = tparams['Wemb'][xr.flatten()]
    embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
    projr = get_layer(options['encoder'])[1](tparams,
                                             embr,
                                             options,
                                             prefix='encoder_r',
                                             mask=xr_mask)

    # context will be the concatenation of forward and backward rnns
    ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    # mean of the context (across time) will be used to initialize decoder rnn
    ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # or you can use the last state of forward + backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)

    # initial decoder state
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    # word embedding (target), we will shift the target sequence one time step
    # to the right. This is done because of the bi-gram connections in the
    # readout and decoder rnn. The first target will be all zeros and we will
    # not condition on the last output.
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted
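    # e.g. for targets [w1, w2, w3] the decoder input is [0, emb(w1), emb(w2)],
    # so the word at step t is predicted without conditioning on itself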

    # decoder - pass through the decoder conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=y_mask,
                                            context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state)
    # hidden states of the decoder gru
    proj_h = proj[0]

    # weighted averages of context, generated by attention module
    ctxs = proj[1]

    # weights (alignment matrix)
    opt_ret['dec_alphas'] = proj[2]

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams,
                                    proj_h,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # cost
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat
    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)
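    # indexing note: probs has shape (n_timesteps*n_samples, n_words), so the
    # flat position of word y in row r is r * n_words + y; multiplying by
    # y_mask zeroes out the padded time steps before summing over time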

    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost
Example #30
0
import theano
import theano.tensor as T

# from theano.tensor.shared_randomstreams import RandomStreams
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams  # much faster
import numpy as np

numpy_rng = np.random.RandomState(123)
theano_rng = RandomStreams(numpy_rng.randint(2**30))

## samplers


def bernoulli(a):
    # a is the bernoulli parameter
    return theano_rng.binomial(size=a.shape,
                               n=1,
                               p=a,
                               dtype=theano.config.floatX)


def gaussian(a, var=1.0):
    # a is the mean, var is the variance (not std or precision!)
    std = T.sqrt(var)
    return theano_rng.normal(size=a.shape,
                             avg=a,
                             std=std,
                             dtype=theano.config.floatX)
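
# illustrative usage (the values below are made up, not part of the module):
# compile one of the samplers and draw a batch of Bernoulli samples
if __name__ == '__main__':
    a = T.matrix('a')
    sample_bernoulli = theano.function([a], bernoulli(a))
    print(sample_bernoulli(np.full((2, 3), 0.5, dtype=theano.config.floatX)))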


def multinomial(a):