Example #1
    def setup_generate(self):

        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()

        # dimensions: (batch, time)
        chord_roots = T.imatrix()

        n_batch, n_time = chord_roots.shape

        specs = [lstmstack.prepare_sample_scan(
                     start_pos=T.alloc(np.array(encoding.STARTING_POSITION, np.int32), (n_batch)),
                     start_out=T.tile(encoding.initial_encoded_form(), (n_batch, 1)),
                     timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                     cur_chord_type=chord_types,
                     cur_chord_root=chord_roots,
                     deterministic_dropout=True)
                 for lstmstack, encoding in zip(self.lstmstacks, self.encodings)]

        updates, all_chosen, all_probs, indiv_probs = helper_generate_from_spec(specs, self.lstmstacks, self.encodings, self.srng, n_batch, n_time, self.bounds, self.normalize_artic_only)

        self.generate_fun = theano.function(
            inputs=[chord_roots, chord_types],
            updates=updates,
            outputs=all_chosen,
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))

        self.generate_visualize_fun = theano.function(
            inputs=[chord_roots, chord_types],
            updates=updates,
            outputs=[all_chosen, all_probs] + indiv_probs,
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))
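Example #1 compiles with the guard only when `self.nanguard` is set, since NanGuardMode checks every intermediate value and slows execution noticeably. Below is a minimal, self-contained sketch of that toggle pattern; the `nanguard` flag and the tiny graph are illustrative assumptions, not code from the project above.

import numpy as np
import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

nanguard = True  # hypothetical debug flag, e.g. parsed from CLI arguments

x = T.matrix('x')
y = T.exp(x)  # exp overflows to inf for large inputs

# Guard only when debugging; None falls back to Theano's default mode.
f = theano.function(
    [x], y,
    mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
          if nanguard else None))

f(np.zeros((2, 2), dtype=theano.config.floatX))  # fine
# f(np.full((2, 2), 1000.0, dtype=theano.config.floatX))  # would raise under the guard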
Example #2
def test_NanGuardMode():
    # Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
    # intentionally. A working implementation should be able to capture all
    # the abnormalities.
    x = tt.matrix()
    w = theano.shared(np.random.randn(5, 7).astype(theano.config.floatX))
    y = tt.dot(x, w)

    fun = theano.function([x],
                          y,
                          mode=NanGuardMode(nan_is_error=True,
                                            inf_is_error=True))
    a = np.random.randn(3, 5).astype(theano.config.floatX)
    infa = np.tile((np.asarray(100.0)**1000000).astype(theano.config.floatX),
                   (3, 5))
    nana = np.tile(np.asarray(np.nan).astype(theano.config.floatX), (3, 5))
    biga = np.tile(np.asarray(1e20).astype(theano.config.floatX), (3, 5))

    fun(a)  # normal values

    # Temporarily silence logger
    _logger = logging.getLogger("theano.compile.nanguardmode")
    try:
        _logger.propagate = False
        with pytest.raises(AssertionError):
            fun(infa)  # INFs
        with pytest.raises(AssertionError):
            fun(nana)  # NANs
        with pytest.raises(AssertionError):
            fun(biga)  # big values
    finally:
        _logger.propagate = True

    # slices
    a = np.random.randn(3, 4, 5).astype(theano.config.floatX)
    infa = np.tile((np.asarray(100.0)**1000000).astype(theano.config.floatX),
                   (3, 4, 5))
    nana = np.tile(np.asarray(np.nan).astype(theano.config.floatX), (3, 4, 5))
    biga = np.tile(np.asarray(1e20).astype(theano.config.floatX), (3, 4, 5))

    x = tt.tensor3()
    y = x[:, tt.arange(2), tt.arange(2), None]
    fun = theano.function([x],
                          y,
                          mode=NanGuardMode(nan_is_error=True,
                                            inf_is_error=True))
    fun(a)  # normal values
    try:
        _logger.propagate = False
        with pytest.raises(AssertionError):
            fun(infa)  # INFs
        with pytest.raises(AssertionError):
            fun(nana)  # NANs
        with pytest.raises(AssertionError):
            fun(biga)  # big values
    finally:
        _logger.propagate = True
Example #3
def test_NanGuardMode():
    """
    Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
    intentionally. A working implementation should be able to capture all
    the abnormalities.
    """
    x = T.matrix()
    w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
    y = T.dot(x, w)

    fun = theano.function(
        [x], y,
        mode=NanGuardMode(nan_is_error=True, inf_is_error=True)
    )
    a = numpy.random.randn(3, 5).astype(theano.config.floatX)
    infa = numpy.tile(
        (numpy.asarray(100.) ** 1000000).astype(theano.config.floatX), (3, 5))
    nana = numpy.tile(
        numpy.asarray(numpy.nan).astype(theano.config.floatX), (3, 5))
    biga = numpy.tile(
        numpy.asarray(1e20).astype(theano.config.floatX), (3, 5))

    fun(a)  # normal values

    # Temporarily silence logger
    _logger = logging.getLogger("theano.compile.nanguardmode")
    try:
        _logger.propagate = False
        assert_raises(AssertionError, fun, infa)  # INFs
        assert_raises(AssertionError, fun, nana)  # NANs
        assert_raises(AssertionError, fun, biga)  # big values
    finally:
        _logger.propagate = True
Example #4
    def prepare_style(self, scale=1.0):
        """Called in each phase of the optimization: process the style image
        according to the scale, then run it through the model to extract
        intermediate outputs (e.g. sem4_1) and turn them into patches.
        """
        style_image = skimage.transform.rescale(self.style_img_original,
                                                scale) * 255.0
        self.style_image = self.model.prepare_image(style_image)

        style_map = skimage.transform.rescale(
            self.style_map_original * args.semantic_weight, scale) * 255.0
        self.style_map = style_map.transpose(
            (2, 0, 1))[np.newaxis].astype(np.float32)

        # Compile a function to run on the GPU to extract patches for all layers at once.
        extractor = theano.function(
            [self.model.tensor_img, self.model.tensor_map],
            self.extract_patches([
                self.model.tensor_outputs['sem' + l] for l in self.style_layers
            ]),
            mode=NanGuardMode(nan_is_error=True,
                              inf_is_error=True,
                              big_is_error=False))
        result = extractor(self.style_image, self.style_map)

        # For each layer, we now have a set of patches and their magnitude.
        for layer, patches, norms in zip(self.style_layers, result[::2],
                                         result[1::2]):
            l = self.model.network['nn' + layer]
            l.N = theano.shared(norms)
            l.W.set_value(patches)
            l.num_filters = patches.shape[0]
            print('  - Style layer sem{}: {} patches in {:,}kb.'.format(
                layer, patches.shape[0], patches.size // 1000))
Example #5
    def get_fns(self,
                input_dim=123,
                p_learning_rate=0.01,
                d_learning_rate=0.0001,
                p=0.23928176569346055):
        x = T.matrix('X')
        y = T.vector('y')

        mlp, updates, cost, probs = self.primal_step(x, y, p_learning_rate,
                                                     input_dim)
        train_fn = theano.function([x, y], [cost],
                                   updates=updates,
                                   mode=NanGuardMode(nan_is_error=True,
                                                     inf_is_error=True,
                                                     big_is_error=True))

        # Calculate Validation in batch_mode for speedup
        valid_th_fns = theano.function([x], probs)

        def valid_fn(x, y):
            probs = valid_th_fns(x)
            f_beta = self.get_cost(y, probs)
            return f_beta

        return train_fn, valid_fn
Example #6
def test_nan_guard_mode():
    # Also test that abs uint* and bool have c code.
    for dtype in ["uint8", "int64", "bool"]:
        x = tensor.vector(dtype=dtype)
        y = x + 1
        mode = NanGuardMode(nan_is_error=True, optimizer=mode_with_gpu.optimizer)
        f = theano.function([x], y, mode=mode)
        d = np.asarray([23, 7]).astype(dtype)
        assert np.allclose(f(d), d + 1)
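NaN and inf can only arise in floating-point data, so for integer and bool inputs the guard compiles and passes values through unchanged, which is what this test checks. A hedged CPU-only restatement follows; the original uses `mode_with_gpu.optimizer` from the GPU test suite, so relying on the default optimizer here is an assumption.

import numpy as np
import theano
import theano.tensor as tensor
from theano.compile.nanguardmode import NanGuardMode

for dtype in ["uint8", "int64", "bool"]:
    x = tensor.vector(dtype=dtype)
    f = theano.function([x], x + 1, mode=NanGuardMode(nan_is_error=True))
    d = np.asarray([23, 7]).astype(dtype)
    assert np.allclose(f(d), d + 1)  # integer data can never hold a NaN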
Example #7
    def prepare_style(self, scale=1.0):
        """Called in each phase of the optimization: process the style image
        according to the scale, then run it through the model to extract
        intermediate outputs (e.g. sem4_1) and turn them into patches.
        """
        style_image = skimage.transform.rescale(self.style_img_original,
                                                scale) * 255.0
        self.style_image = self.model.prepare_image(style_image)

        style_map = skimage.transform.rescale(self.style_map_original,
                                              scale) * 255.0
        self.style_map = style_map.transpose(
            (2, 0, 1))[np.newaxis].astype(np.float32)

        # Workaround for Issue #8. The cause is unclear; NaN seems to appear in a
        # convolution node on some OS X installations. https://github.com/alexjc/neural-doodle/issues/8
        if args.safe_mode:
            from theano.compile.nanguardmode import NanGuardMode
            flags = {
                'mode':
                NanGuardMode(nan_is_error=True,
                             inf_is_error=True,
                             big_is_error=False)
            }
        else:
            flags = {}

        # Compile a function to run on the GPU to extract patches for all layers at once.
        required_layers = ['conv' + l for l in self.style_layers
                           ] + ['map' + l for l in self.style_layers]
        extractor = theano.function(
            [self.model.tensor_img, self.model.tensor_map],
            self.extract_patches([
                self.model.tensor_outputs[l] for l in required_layers
            ]), **flags)
        result = extractor(self.style_image, self.style_map)

        # For each layer, build it from set of patches and their magnitude.
        def build(layer, prefix, name, patches, norms):
            l = self.model.network[prefix + layer]
            l.N = theano.shared(norms)
            l.W.set_value(patches)
            l.num_filters = patches.shape[0]
            print('  - {} layer {}: {} patches in {:,}kb.'.format(
                name, layer, patches.shape[0], patches.size // 1000))

        if args.style_weight > 0.0:
            result_nn = result[:len(self.style_layers) * 2]
            for layer, *data in zip(self.style_layers, result_nn[::2],
                                    result_nn[1::2]):
                build(layer, 'nn', 'Style', *data)

        if args.semantic_weight > 0.0:
            result_mm = result[len(self.style_layers) * 2:]
            for layer, *data in zip(self.style_layers, result_mm[::2],
                                    result_mm[1::2]):
                build(layer, 'mm', 'Semantic', *data)
Example #8
def rmsprop(lr, tparams, grads, inp, cost, opt_ret=None):
    """
    RMS prop optimizer

    :param lr:
    :param tparams:
    :param grads:
    :param inp:
    :param cost:
    :param opt_ret:
    :return f_grad_shared, f_update:
    """
    zipped_grads = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
        for k, p in iteritems(tparams)
    ]
    running_grads = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad' % k)
        for k, p in iteritems(tparams)
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
        for k, p in iteritems(tparams)
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, [cost],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile,
                                    mode=NanGuardMode(nan_is_error=True,
                                                      inf_is_error=True,
                                                      big_is_error=True))

    updir = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
        for k, p in iteritems(tparams)
    ]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg**2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itervalues(tparams), updir_new)]
    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
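A hedged usage sketch for the optimizer above; the toy least-squares model, the shapes, and the module-level `profile = False` are assumptions made for illustration, since real callers pass in their own `tparams` and cost graphs.

import numpy
import theano
import theano.tensor as tensor
from collections import OrderedDict

profile = False  # rmsprop() above reads this module-level flag when compiling

# Toy least-squares model; tparams maps names to shared variables, as callers do.
tparams = OrderedDict()
tparams['W'] = theano.shared(numpy.zeros((3,), dtype='float32'), name='W')

x = tensor.matrix('x', dtype='float32')
y = tensor.vector('y', dtype='float32')
cost = ((tensor.dot(x, tparams['W']) - y) ** 2).mean()
grads = tensor.grad(cost, wrt=list(tparams.values()))

lr = tensor.scalar(name='lr')
f_grad_shared, f_update = rmsprop(lr, tparams, grads, [x, y], cost)

c = f_grad_shared(numpy.ones((4, 3), dtype='float32'),
                  numpy.ones(4, dtype='float32'))
f_update(0.001)  # this variant takes a fixed 1e-4 step, so lr is accepted but unused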
Example #9
def compile(inputs, outputs, *args, mode=None, **kwargs):
    """
    Use as theano.function().
    TODO: do something useful with non-symbolic output?
    
    Parameters
    ----------
    ...
    mode: In addition to the values accepted by `theano.function`, this also
       accepts a string, to make it easier to use `NanGuardMode`.
       If a string, a `NanGuardMode` object is created; the string should contain
       comma-separated values indicating which values to guard against.
       For example, with the string ``"nan,inf"``, a `NanGuardMode` object is
       created with the options ``NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)``.
    """
    if not any(
            core.is_theano_object(arg) for arg in itertools.chain(
                [inputs, outputs], args, kwargs.values())):
        raise ValueError(
            "`shim.graph.function()` is undefined for non-symbolic outputs")
    if mode:
        from theano.compile.nanguardmode import NanGuardMode
        if isinstance(mode, NanGuardMode):
            kwargs['mode'] = mode
        elif isinstance(mode, str):
            nanguard = 'nan' in mode
            infguard = 'inf' in mode
            bigguard = 'big' in mode
            kwargs['mode'] = NanGuardMode(nan_is_error=nanguard,
                                          inf_is_error=infguard,
                                          big_is_error=bigguard)
    # Replace dict with OrderedDict to silence Theano warnings; since Python 3.7,
    # dicts have guaranteed insertion order.
    if sys.version_info.major >= 3 and sys.version_info.minor >= 7:
        args = tuple(
            collections.OrderedDict(a) if type(a) is dict else a for a in args)
        kwargs = {
            k: collections.OrderedDict(v) if type(v) is dict else v
            for k, v in kwargs.items()
        }
    return core.theano.function(inputs, outputs, *args, **kwargs)
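For illustration, a short sketch of what the string shorthand expands to; the tiny graph is an assumption, and running `compile` itself also requires the surrounding `core`/`shim` helpers.

import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

x = T.matrix('x')
y = x * 2

# By the convention documented above, compile([x], y, mode="nan,inf")
# builds its mode like this ("big" is absent, so big_is_error=False):
f = theano.function([x], y,
                    mode=NanGuardMode(nan_is_error=True,
                                      inf_is_error=True,
                                      big_is_error=False))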
Example #10
    def buildvalidfun(self, model):
        self.tt.tick("compiling validation function")
        inps, out = self.autobuild_model(model,
                                         *self.traindata,
                                         _trainmode=False)
        if issequence(out):
            out = out[0]
        metrics, newinp = self.buildlosses(out, self.validators)
        inputs = newinp if newinp is not None else inps
        ret = None
        if len(metrics) > 0:
            ret = theano.function(inputs=[x.d for x in inputs] + [self.goldvar],
                                  outputs=metrics,
                                  mode=NanGuardMode(nan_is_error=True,
                                                    inf_is_error=False,
                                                    big_is_error=False))
        else:
            self.tt.msg("NO VALIDATION METRICS DEFINED, RETURNS NONE")
        self.tt.tock("validation function compiled")
        return ret
Example #11
def test_NanGuardMode():
    """
    Tests if NanGuardMode is working by feeding in numpy.inf and numpy.nans
    intentionally. A working implementation should be able to capture all
    the abnormalities.
    """
    x = T.matrix()
    w = theano.shared(numpy.random.randn(5, 7).astype(theano.config.floatX))
    y = T.dot(x, w)

    fun = theano.function([x],
                          y,
                          mode=NanGuardMode(nan_is_error=True,
                                            inf_is_error=True))
    a = numpy.random.randn(3, 5).astype(theano.config.floatX)
    infa = numpy.tile(
        (numpy.asarray(100.)**1000000).astype(theano.config.floatX), (3, 5))
    nana = numpy.tile(
        numpy.asarray(numpy.nan).astype(theano.config.floatX), (3, 5))
    biga = numpy.tile(numpy.asarray(1e20).astype(theano.config.floatX), (3, 5))

    work = [False, False, False]

    fun(a)  # normal values
    try:
        fun(infa)  # INFs
    except AssertionError:
        work[0] = True
    try:
        fun(nana)  # NANs
    except AssertionError:
        work[1] = True
    try:
        fun(biga)  # big values
    except AssertionError:
        work[2] = True

    if not (work[0] and work[1] and work[2]):
        raise AssertionError("NanGuardMode not working.")
Example #12
    def setup_encode(self):

        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()
        # dimensions: (batch, time)
        chord_roots = T.imatrix()
        # dimensions: (batch, time)
        relative_posns = [T.imatrix() for _ in self.encodings]
        # dimensions: (batch, time, output_data)
        encoded_melodies = [T.btensor3() for _ in self.encodings]
        n_batch, n_time = chord_roots.shape

        all_activations = []
        for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
                self.encodings, self.enc_lstmstacks, encoded_melodies,
                relative_posns):
            activations = enc_lstmstack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=relative_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                cur_input=encoded_melody,
                deterministic_dropout=True)
            all_activations.append(activations)
        reduced_activations = functools.reduce((lambda x, y: x + y),
                                               all_activations)
        strengths, vects = self.qman.get_strengths_and_vects(
            reduced_activations)

        self.encode_fun = theano.function(
            inputs=[chord_types, chord_roots] + relative_posns +
            encoded_melodies,
            outputs=[strengths, vects],
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True,
                               inf_is_error=True,
                               big_is_error=True) if self.nanguard else None))
Example #13
    def buildvalidfun(self, model, batsize):
        self.tt.tick("validation - autobuilding")
        inps, outps = self.autobuild_model(model,
                                           *self.traindata,
                                           _trainmode=False,
                                           _batsize=batsize)
        assert (len(outps) == 1)
        outp = outps[0]
        self.tt.tock("validation - autobuilt")
        self.tt.tick("compiling validation function")
        metrics, newinp = self.buildlosses(outp, self.validators)
        inputs = newinp if newinp is not None else inps
        ret = None
        if len(metrics) > 0:
            ret = theano.function(inputs=[x.d for x in inputs] + [self.goldvar],
                                  outputs=metrics,
                                  mode=NanGuardMode(nan_is_error=True,
                                                    inf_is_error=False,
                                                    big_is_error=True))
        else:
            self.tt.msg("NO VALIDATION METRICS DEFINED, RETURNS NONE")
        self.tt.tock("validation function compiled")
        return ret
Example #14
    def __init__(
        self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam',
        opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99}
    ):
        """RBM constructor.
        Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing CD updates.
        """
        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = RandomStreams(self.numpy_rng.randint(2 ** 30))
        self.create_mat = lambda x, y: self.numpy_rng.normal(0, 0.01, (x, y)).astype(theano.config.floatX)

        # save config
        n_batch = opt_params.get('nb')
        self.n_hidden = 100
        self.n_visible = n_chan*n_dim*n_dim  # size of visible layer
        self.n_batch = n_batch
        self.n_qk = 10  # number of components in the MoB used for q
        self.n_mc = 30  # number of Monte Carlo samples from each MoB component

        self.n_dim = n_dim
        self.n_out = n_out
        self.n_superbatch = n_superbatch
        self.alg = opt_alg

        # set up general RBM methods
        AbstractRBM.__init__(self, n_dim, n_chan, n_out, n_superbatch, opt_alg, opt_params) 

        # create updates
        alpha = T.scalar(dtype=theano.config.floatX)  # learning rate

        # save config
        self.n_class = 2
        self.n_dim = n_dim
        self.n_out = n_out


        self.n_components = self.n_qk
        self.n_samples = self.n_mc
        self.n_tot_samples = self.n_samples*self.n_components


        # create input variables
        D, idx1, idx2 = self.create_inputs()

        # create model
        self.network = self.create_model()

        # create objectives
        loglik, plik = self.create_objectives(D)

        # create gradients
        dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi = self.create_gradients()
        grads = dL_Theta, dE_Theta, dlogZ_Theta, dL_Phi

        # create updates
        uL_Theta, uL_Phi, avg_updates, avg_Theta_updates \
          = self.create_updates(grads, None, alpha, opt_alg, opt_params)
      
        # logF_avg, Z_avg = self.create_llik_estimate(D)
        
        mode = NanGuardMode(nan_is_error=True, inf_is_error=False, big_is_error=False)
        mode = None  # guard left disabled; remove this line to enable NanGuardMode

        common_update1 = OrderedDict(avg_updates.items() + uL_Phi.items())
        self.train_q = theano.function([idx1, idx2], [loglik, plik], 
          updates=common_update1, mode=mode,
          givens={D: self.train_set_x[idx1:idx2]})

        common_update2 = OrderedDict(avg_Theta_updates.items() + uL_Theta.items())
        self.train_p = theano.function([idx1, idx2], [loglik, plik], 
            updates=common_update2, mode=mode, on_unused_input='warn',
            givens={D: self.train_set_x[idx1:idx2]})
        # self.llik = theano.function([D], logF_avg - T.log(Z_avg), mode=mode)

        common_update3 = OrderedDict(common_update1.items() + common_update2.items())
        self.train = theano.function([idx1, idx2], [loglik, plik], 
            updates=common_update3, mode=mode,
            givens={D: self.train_set_x[idx1:idx2]})
Example #15
    def setup_train(self):

        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()

        # dimensions: (batch, time)
        chord_roots = T.imatrix()

        # dimensions: (batch, time)
        relative_posns = [T.imatrix() for _ in self.encodings]

        # dimensions: (batch, time, output_data)
        encoded_melodies = [T.btensor3() for _ in self.encodings]

        # dimensions: (batch, time)
        correct_notes = T.imatrix()

        n_batch, n_time = chord_roots.shape

        def _build(det_dropout):
            all_out_probs = []
            for encoding, lstmstack, encoded_melody, relative_pos in zip(self.encodings, self.lstmstacks, encoded_melodies, relative_posns):
                activations = lstmstack.do_preprocess_scan( timestep=T.tile(T.arange(n_time), (n_batch,1)) ,
                                                            relative_position=relative_pos,
                                                            cur_chord_type=chord_types,
                                                            cur_chord_root=chord_roots,
                                                            last_output=T.concatenate([T.tile(encoding.initial_encoded_form(), (n_batch,1,1)),
                                                                                encoded_melody[:,:-1,:] ], 1),
                                                            deterministic_dropout=det_dropout)

                out_probs = encoding.decode_to_probs(activations, relative_pos, self.bounds.lowbound, self.bounds.highbound)
                all_out_probs.append(out_probs)
            reduced_out_probs = functools.reduce((lambda x,y: x*y), all_out_probs)
            if self.normalize_artic_only:
                non_artic_probs = reduced_out_probs[:,:,:2]
                artic_probs = reduced_out_probs[:,:,2:]
                non_artic_sum = T.sum(non_artic_probs, 2, keepdims=True)
                artic_sum = T.sum(artic_probs, 2, keepdims=True)
                norm_artic_probs = artic_probs*(1-non_artic_sum)/artic_sum
                norm_out_probs = T.concatenate([non_artic_probs, norm_artic_probs], 2)
            else:
                normsum = T.sum(reduced_out_probs, 2, keepdims=True)
                normsum = T.maximum(normsum, constants.EPSILON)
                norm_out_probs = reduced_out_probs/normsum
            return Encoding.compute_loss(norm_out_probs, correct_notes, True)

        train_loss, train_info = _build(False)
        updates = Adam(train_loss, self.get_optimize_params(), lr=self.learning_rate_var)

        eval_loss, eval_info = _build(True)

        self.loss_info_keys = list(train_info.keys())

        self.update_fun = theano.function(
            inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies,
            outputs=[train_loss]+list(train_info.values()),
            updates=updates,
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))

        self.eval_fun = theano.function(
            inputs=[chord_types, chord_roots, correct_notes] + relative_posns + encoded_melodies,
            outputs=[eval_loss]+list(eval_info.values()),
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=(NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True) if self.nanguard else None))
Example #16
args = parser.parse_args()
if args.save != default_save and not args.overwrite and os.path.isfile(
        'interim/%s_model.pkl' % args.save):
    raise Exception(
        'A model with this name was already saved. Provide the --overwrite flag (-o) when trying to --save over an existing model.'
    )

import numpy as np
import scipy
import theano
from theano import tensor as T
from six.moves import cPickle
import sys
sys.setrecursionlimit(100000)
from theano.compile.nanguardmode import NanGuardMode
ngm = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)
from sklearn.svm import SVC

from vcd import image_iter, models, util

# Configuration
cfg = {

    # General
    'patch_shape': (33, 33),
    'aug_noise_std': 0.05,
    'train_test_split':
    0.8,  # Proportion of dataset to use for the train+validation set if the test set is enabled.
    'train_valid_split':
    0.75,  # Proportion of train+validation set to use for the training set.
    # Note that the final training set size is (label_count * train_test_split * train_valid_split).
Example #17
def main():
    # At startup, the values are read from the configuration file
    trainingSize, validationSize, batchSize, testDataSize, nLayer, num_epochs, getFromFile = getConfigData()

    printAndSave("Loading data...", dt=False)

    # Depending on the choice in the configuration file, the metadata
    # is either generated or read from a file
    if (getFromFile):
        printAndSave("Getting metadata from file...", dt=False)
        getMetadata = getMetadataFromFile
    else:
        printAndSave("Calculating metadata...", dt=False)
        getMetadata = calculateMetadata

    # Get the data for the input collections and for the validation labels
    train, trainTargets, val, valTargets, test, \
        testTargets, metadata, colsToRemove = \
        getTrainingTestLists(traiSize=trainingSize,
                             valSize=validationSize,
                             testSize=testDataSize,
                             getMetadata=getMetadata)

    # Prepare the Theano variables for the input
    # and for the labels used to validate the results
    input_var = T.matrix('inputs')
    target_var = T.ivector('targets')

    # Create the FNN
    network = buid_MLP(input_var=input_var,
                       depth=nLayer,
                       drop_input=.2,
                       drop_hidden=.5,
                       nCols=len(metadata))

    # Get the prediction from the output of the MLP
    prediction = lasagne.layers.get_output(network)
    # Expression for the loss
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    # Create the update expressions that modify the
    # parameters at each training step
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.momentum(loss,
                                       params,
                                       learning_rate=0.01,
                                       momentum=0.9)

    # Get the prediction from the MLP output for validation and testing;
    # unlike the previous one, the dropout layers are disabled here by passing
    # through the whole network with deterministic set to True
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()

    # Expression for the classification accuracy, computed from
    # the prediction obtained at the output of the MLP
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile the function that runs one training step on a small
    # batch of data; it returns the loss.
    # NanGuardMode is enabled so that an error is raised for numbers
    # that are too large; this serves to check the validity of the
    # data normalization
    train_fn = theano.function([input_var, target_var],
                               loss,
                               updates=updates,
                               name="TrainingFunc",
                               mode=NanGuardMode(nan_is_error=True,
                                                 inf_is_error=True,
                                                 big_is_error=True))

    # Returns the loss and accuracy;
    # used for both validation and testing
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             name="ValidationFunc")

    # Start the training
    printAndSave("*" * 53, dt=False)
    printAndSave("Starting training...", dt=False)
    training_start_time = time.time()

    for epoch in range(num_epochs):
        start_time = time.time()
        # Each epoch makes a complete pass over the training data
        train_err = 0
        train_batches = 0
        for batch in iterate_minibatches(train, trainTargets, batchSize,
                                         metadata, colsToRemove):
            inputs, targets = batch
            tmp = train_fn(inputs, targets)
            train_err += tmp
            train_batches += 1

        # Validation for this epoch
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(val, valTargets, batchSize, metadata,
                                         colsToRemove):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Print the results for this epoch
        printAndSave("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs,
            time.time() - start_time),
                     dt=False)
        printAndSave("  training loss:\t\t{:.6f}".format(train_err /
                                                         train_batches),
                     dt=False)
        printAndSave("  validation loss:\t\t{:.6f}".format(val_err /
                                                           val_batches),
                     dt=False)
        printAndSave("  validation accuracy:\t\t{:.2f} %".format(
            val_acc / val_batches * 100),
                     dt=False)

        # #######################################
        # Enable to compute the error and accuracy
        # on the test data at each epoch
        # #######################################
        # Run the test
        # start_time = time.time()
        # test_err = 0
        # test_acc = 0
        # test_batches = 0
        # for batch in iterate_minibatches(test,testTargets, batchSize, metadata, colsToRemove):
        #     inputs, targets = batch
        #     err, acc = val_fn(inputs, targets)
        #     test_err += err
        #     test_acc += acc
        #     test_batches += 1
        # # Print the test results
        # printAndSave("Final results:",dt=False)
        # printAndSave("  test loss:\t\t\t{:.6f}".format(test_err / test_batches),dt=False)
        # printAndSave("  test accuracy:\t\t{:.2f} %".format(
        #     test_acc / test_batches * 100),dt=False)
        # printAndSave("Tests in {}".format(time.time()-start_time),dt=False)

    printAndSave("Training in {}".format(time.time() - training_start_time),
                 dt=False)

    # Run the test
    start_time = time.time()
    test_err = 0
    test_acc = 0
    test_batches = 0
    for batch in iterate_minibatches(test, testTargets, batchSize, metadata,
                                     colsToRemove):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_batches += 1

    # Print the test results
    printAndSave("Final results:", dt=False)
    printAndSave("  test loss:\t\t\t{:.6f}".format(test_err / test_batches),
                 dt=False)
    printAndSave("  test accuracy:\t\t{:.2f} %".format(test_acc /
                                                       test_batches * 100),
                 dt=False)
    printAndSave("Tests in {}".format(time.time() - start_time), dt=False)
Example #18
    def __init__(self,parameters=None):
        X = tensor.tensor4()
        Y = tensor.lvector()
        self.params = parameters
        if parameters is None:
            W1 = theano.shared(np.random.randn(32,3,5,5).astype(theano.config.floatX)*0.01)
            b1 = theano.shared(np.zeros(32,).astype(theano.config.floatX))
            W2 = theano.shared(np.random.randn(64,32,5,5).astype(theano.config.floatX)*0.01)
            b2 = theano.shared(np.zeros(64,).astype(theano.config.floatX))
            W3 = theano.shared(np.random.randn(128,64,5,5).astype(theano.config.floatX)*0.01)
            b3 = theano.shared(np.zeros(128,).astype(theano.config.floatX))
            W5 = theano.shared(np.random.randn(28800,1084).astype(theano.config.floatX)*0.01)
            # b5 = theano.shared(np.zeros(64*9*9,))
            W6 = theano.shared(np.random.randn(1084,2).astype(theano.config.floatX)*0.01)
            b6 = theano.shared(np.zeros(2,).astype(theano.config.floatX))
        else:
            W1 = theano.shared(parameters["W1"])
            b1 = theano.shared(parameters["b1"])
            W2 = theano.shared(parameters["W2"])
            b2 = theano.shared(parameters["b2"])
            W3 = theano.shared(parameters["W3"])
            b3 = theano.shared(parameters["b3"])
            W5 = theano.shared(parameters["W5"])
            W6 = theano.shared(parameters["W6"])
            b6 = theano.shared(parameters["b6"])

        layer_1 = conv2d(X,W1)
        layer_1_pool = pool_2d(layer_1,(2,2),ignore_border=True)
        layer_1_output = tensor.tanh(layer_1_pool+b1.dimshuffle('x', 0, 'x', 'x'))

        layer_2 = conv2d(layer_1_output, W2)
        layer_2_pool = pool_2d(layer_2,(2,2),ignore_border=True)
        layer_2_output = tensor.tanh(layer_2_pool+b2.dimshuffle('x', 0, 'x', 'x'))

        layer_3 = conv2d(layer_2_output, W3)
        layer_3_pool = pool_2d(layer_3,(2,2),ignore_border=True)
        layer_3_output = tensor.tanh(layer_3_pool+b3.dimshuffle('x', 0, 'x', 'x'))

        layer_4 = layer_3_output.flatten(2)

        layer_5 = tensor.dot(layer_4,W5)
        layer_5_output = layer_5.tanh()

        layer_6 = tensor.dot(layer_5_output, W6) + b6

        #softmax instead of sigmoid.
        layer_6_output = softmax(layer_6) + 0.0000001
        output = tensor.argmax(layer_6_output,axis=1)
        # cost = ((Y-layer_6_output)**2).sum()

        # Negative Log Likelihood
        cost = -tensor.mean(tensor.log(layer_6_output)[tensor.arange(Y.shape[0]), Y], dtype=theano.config.floatX)

        error = tensor.mean(tensor.neq(output, Y))

        parameters = [W1,b1,W2,b2,W3,b3,W5,W6,b6]

        updates = self.GradientDescent(cost,parameters)

        params = {"W1": W1, "b1": b1, "W2": W2, "b2": b2, "W3": W3, "b3": b3, "W5": W5, "W6": W6, "b6": b6}
        self.parameters = theano.function([],params)
        mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True).excluding('local_elemwise_fusion','inplace')
        self.train = theano.function([X, Y], cost,updates=updates, mode=mode)
        self.test = theano.function([X, Y], error)
        self.predict = theano.function([X],output)
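The `.excluding(...)` call above stacks optimization exclusions on top of the guard, which NanGuardMode inherits from Theano's `Mode`. A minimal sketch of the same idea; the toy graph is an illustrative assumption.

import theano
import theano.tensor as T
from theano.compile.nanguardmode import NanGuardMode

x = T.vector('x')
# Excluding elemwise fusion and in-place optimizations keeps intermediate
# results as separate ops, making it easier to localize a misbehaving node.
mode = NanGuardMode(nan_is_error=True, inf_is_error=True,
                    big_is_error=True).excluding('local_elemwise_fusion',
                                                 'inplace')
f = theano.function([x], T.tanh(x) + 1, mode=mode)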
Example #19
    def __init__(self,
                 K,
                 vocab_size,
                 num_chars,
                 W_init,
                 S_init,
                 nhidden,
                 embed_dim,
                 dropout,
                 train_emb,
                 sub_dim,
                 use_feat,
                 gating_fn,
                 save_attn=False):
        self.nhidden = nhidden
        self.embed_dim = embed_dim
        self.dropout = dropout
        self.train_emb = train_emb
        self.sub_dim = sub_dim
        self.learning_rate = LEARNING_RATE
        self.num_chars = num_chars
        self.use_feat = use_feat
        self.save_attn = save_attn
        self.gating_fn = gating_fn

        self.use_subs = self.sub_dim != 0
        if W_init is None:
            W_init = lasagne.init.GlorotNormal().sample(
                (vocab_size, self.embed_dim))
        # W_init = lasagne.init.GlorotNormal().sample((vocab_size, self.embed_dim))
        doc_var, query_var, cand_var = T.itensor3('doc'), T.itensor3('quer'), \
                T.wtensor3('cand')
        docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \
                T.bmatrix('c_mask')
        target_var = T.ivector('ans')
        feat_var = T.imatrix('feat')
        doc_toks, qry_toks = T.imatrix('dchars'), T.imatrix('qchars')
        tok_var, tok_mask = T.imatrix('tok'), T.bmatrix('tok_mask')
        cloze_var = T.ivector('cloze')
        self.inps = [
            doc_var, doc_toks, query_var, qry_toks, cand_var, target_var,
            docmask_var, qmask_var, tok_var, tok_mask, candmask_var, feat_var,
            cloze_var
        ]

        self.predicted_probs, predicted_probs_val, self.network, W_emb, attentions = (
            self.build_network(K, vocab_size, W_init, S_init))

        self.loss_fn = T.nnet.categorical_crossentropy(self.predicted_probs,
                                                       target_var).mean()
        self.eval_fn = lasagne.objectives.categorical_accuracy(
            self.predicted_probs, target_var).mean()

        loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val,
                                                      target_var).mean()
        eval_fn_val = lasagne.objectives.categorical_accuracy(
            predicted_probs_val, target_var).mean()

        self.params = L.get_all_params(self.network, trainable=True)

        updates = lasagne.updates.adam(self.loss_fn,
                                       self.params,
                                       learning_rate=self.learning_rate)

        self.train_fn = theano.function(
            self.inps, [self.loss_fn, self.eval_fn, self.predicted_probs],
            updates=updates,
            on_unused_input='ignore')
        self.validate_fn = theano.function(
            self.inps,
            [loss_fn_val, eval_fn_val, predicted_probs_val] + attentions,
            mode=NanGuardMode(nan_is_error=True,
                              inf_is_error=True,
                              big_is_error=True),
            on_unused_input='ignore')
Example #20
def train_loop(inputs,
               cost,
               train_data,
               times,
               prints=None,
               inject_total_iters=False,
               test_data=None,
               callback=None,
               optimizer=lasagne.updates.adam,
               save_params=False,
               nan_guard=False):

    params = lib.search(cost, lambda x: hasattr(x, 'param'))
    lib.print_params_info(params)

    grads = T.grad(cost, wrt=params, disconnected_inputs='warn')

    grads = [T.clip(g, lib.floatX(-1), lib.floatX(1)) for g in grads]

    updates = optimizer(grads, params)

    if prints is None:
        prints = [('cost', cost)]
    else:
        prints = [('cost', cost)] + prints

    print("Compiling train function...")
    if nan_guard:
        from theano.compile.nanguardmode import NanGuardMode
        mode = NanGuardMode(nan_is_error=True,
                            inf_is_error=True,
                            big_is_error=True)
    else:
        mode = None
    train_fn = theano.function(inputs, [p[1] for p in prints],
                               updates=updates,
                               on_unused_input='warn',
                               mode=mode)

    print("Compiling eval function...")
    eval_fn = theano.function(inputs, [p[1] for p in prints],
                              on_unused_input='warn')

    print("Training!")

    total_iters = 0
    total_seconds = 0.
    last_print = 0
    last_gen = 0

    if len(times) >= 4:
        gen_every = times[3]
    else:
        gen_every = times[1]

    if len(times) >= 5:
        early_stop = times[4]
        if len(times) >= 6:
            early_stop_min = times[5]
        else:
            early_stop_min = 0
    else:
        early_stop = None
        early_stop_min = None

    best_test_cost = np.inf
    best_test_cost_iter = 0.

    all_outputs = []
    all_stats = []
    for epoch in itertools.count():

        generator = train_data()
        while True:
            try:
                inputs = next(generator)
            except StopIteration:
                break

            if inject_total_iters:
                inputs = [np.int32(total_iters)] + list(inputs)

            start_time = time.time()
            outputs = train_fn(*inputs)
            total_seconds += time.time() - start_time
            total_iters += 1

            all_outputs.append(outputs)

            if total_iters == 1:
                try:  # This only matters on Ishaan's computer
                    import experiment_tools
                    experiment_tools.register_crash_notifier()
                except ImportError:
                    pass

            if (times[0]=='iters' and total_iters-last_print == times[1]) or \
                (times[0]=='seconds' and total_seconds-last_print >= times[1]):

                mean_outputs = np.array(all_outputs).mean(axis=0)

                if test_data is not None:
                    if inject_total_iters:
                        test_outputs = [
                            eval_fn(np.int32(total_iters), *inputs)
                            for inputs in test_data()
                        ]
                    else:
                        test_outputs = [
                            eval_fn(*inputs) for inputs in test_data()
                        ]
                    test_mean_outputs = np.array(test_outputs).mean(axis=0)

                stats = collections.OrderedDict()
                stats['epoch'] = epoch
                stats['iters'] = total_iters
                for i, p in enumerate(prints):
                    stats['train ' + p[0]] = mean_outputs[i]
                if test_data is not None:
                    for i, p in enumerate(prints):
                        stats['test ' + p[0]] = test_mean_outputs[i]
                stats['secs'] = total_seconds
                stats['secs/iter'] = total_seconds / total_iters

                if test_data is not None and (
                        stats['test cost'] < best_test_cost or
                        (early_stop_min is not None and
                         total_iters <= early_stop_min)):
                    best_test_cost = stats['test cost']
                    best_test_cost_iter = total_iters

                print_str = ""
                for k, v in stats.items():
                    if isinstance(v, int):
                        print_str += "{}:{}\t".format(k, v)
                    else:
                        print_str += "{}:{:.4f}\t".format(k, v)
                print(print_str[:-1])  # omit the last \t

                all_stats.append(stats)

                all_outputs = []
                last_print += times[1]

            if (times[0]=='iters' and total_iters-last_gen==gen_every) or \
                (times[0]=='seconds' and total_seconds-last_gen >= gen_every):
                tag = "iters{}_time{}".format(total_iters, total_seconds)
                if callback is not None:
                    callback(tag)
                if save_params:
                    lib.save_params('params_{}.pkl'.format(tag))

                last_gen += gen_every

            if (times[0] == 'iters' and total_iters == times[2]) or \
                (times[0] == 'seconds' and total_seconds >= times[2]) or \
                (test_data is not None and early_stop is not None and total_iters > (3*early_stop) and (total_iters-best_test_cost_iter) > early_stop):

                if (test_data is not None and early_stop is not None
                        and total_iters > (3 * early_stop)
                        and (total_iters - best_test_cost_iter) > early_stop):
                    print('Early stop! Best test cost was {} at iter {}'.format(
                        best_test_cost, best_test_cost_iter))

                print('Done!')

                try:  # This only matters on Ishaan's computer
                    import experiment_tools
                    experiment_tools.send_sms("done!")
                except ImportError:
                    pass

                return all_stats
Example #21
y = net.fprop(x**2 / 2.)
cost = y.mean()

parameters = net.params

from blocks.algorithms import Scale
from blocks.algorithms import GradientDescent
optimizer = Scale(0.)

print("Calling Algorithm")
algorithm = GradientDescent(
    #gradients=grads, parameters=parameters,
    cost=cost,
    parameters=parameters,
    step_rule=optimizer)

from theano.compile.nanguardmode import NanGuardMode
fun = theano.function(inputs=[x],
                      outputs=[cost],
                      updates=algorithm.updates,
                      mode=NanGuardMode(nan_is_error=True,
                                        inf_is_error=True,
                                        big_is_error=True))
#npx = getnumpyf32((5, batch_size, channels,)+image_size)
npx = np.random.random((5, 32, 50)).astype(np.float32)
out = fun(npx)
#for i,v in enumerate(parameters):
#    if 'U' in v.name:
#        theano.printing.debugprint(algorithm.updates[i][1])
#        break
Example #22
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')
    y = tensor.matrix('y', dtype='int64')
    y_mask = tensor.matrix('y_mask', dtype='float32')

    # time_steps
    n_timesteps = x_mask.shape[0]
    n_timesteps_trg = y_mask.shape[0]
    n_samples = x_mask.shape[1]

    # word embedding for forward rnn (source)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    proj = get_layer(options['encoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='encoder',
                                            mask=x_mask)

    # for reverse RNN: bi-directional RNN encoder
    if options.get('birnn', False):
        xr = x[::-1]
        xr_mask = x_mask[::-1]

        embr = tparams['Wemb'][xr.flatten()]
        embr = embr.reshape([n_timesteps, n_samples, options['dim_word']])
        projr = get_layer(options['encoder'])[1](tparams,
                                                 embr,
                                                 options,
                                                 prefix='encoder_r',
                                                 mask=xr_mask)
        ctx = concatenate([proj[0], projr[0][::-1]], axis=proj[0].ndim - 1)

    else:
        ctx = proj[0]  # context vectors

    # mean of the context (across time) will be used to initialize decoder rnn
    ctx_mean = (ctx * x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]

    # or you can use the last state of forward + backward encoder rnns
    # ctx_mean = concatenate([proj[0][-1], projr[0][-1]], axis=proj[0].ndim-2)

    # initial decoder state
    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')

    # word embedding (target), we will shift the target sequence one time step
    # to the right. This is done because of the bi-gram connections in the
    # readout and decoder rnn. The first target will be all zeros and we will
    # not condition on the last output.
    emb = tparams['Wemb_dec'][y.flatten()]
    emb = emb.reshape([n_timesteps_trg, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # decoder - pass through the decoder conditional gru with attention
    proj = get_layer(options['decoder'])[1](tparams,
                                            emb,
                                            options,
                                            prefix='decoder',
                                            mask=y_mask,
                                            context=ctx,
                                            context_mask=x_mask,
                                            one_step=False,
                                            init_state=init_state)
    # hidden states of the decoder gru
    proj_h = proj[0]

    # weighted averages of context, generated by attention module
    ctxs = proj[1]

    # weights (alignment matrix)
    opt_ret['dec_alphas'] = proj[2]  # --> to show the attention weights

    # compute word probabilities
    logit_lstm = get_layer('ff')[1](tparams,
                                    proj_h,
                                    options,
                                    prefix='ff_logit_lstm',
                                    activ='linear')
    logit_prev = get_layer('ff')[1](tparams,
                                    emb,
                                    options,
                                    prefix='ff_logit_prev',
                                    activ='linear')
    logit_ctx = get_layer('ff')[1](tparams,
                                   ctxs,
                                   options,
                                   prefix='ff_logit_ctx',
                                   activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev + logit_ctx)

    # dropout (noise)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # compute the cost (negative loglikelihood)
    y_flat = y.flatten()
    y_flat_idx = tensor.arange(y_flat.shape[0]) * options['n_words'] + y_flat

    cost = -tensor.log(probs.flatten()[y_flat_idx])
    cost = cost.reshape([y.shape[0], y.shape[1]])
    cost = (cost * y_mask).sum(0)

    # we will build an additional function for computing costs
    f_cost = theano.function([ctx, x_mask, y, y_mask],
                             cost,
                             mode=NanGuardMode(nan_is_error=True,
                                               inf_is_error=True,
                                               big_is_error=True))
    return trng, use_noise, x, x_mask, y, y_mask, opt_ret, cost, f_cost
Example #23
def trainer(
    r=5,
    dim_word=1000,
    dim=1000,
    trainpath=[
        '../datasets/simQA_test.txt', '../datasets/cand_ent_test.txt',
        '../datasets/cand_rel_test.txt'
    ],
    validpath=[
        '../datasets/simQA_test.txt', '../datasets/cand_ent_test.txt',
        '../datasets/cand_rel_test.txt'
    ],
    dict_character='../datasets/dict/dict.pkl',
    dict_relation='../datasets/dict/dict.pkl',
    dict_word='../datasets/dict/dict.pkl',
    relation_pattern='RWC',
    batch_size=16,
    valid_batch_size=16,
    maxlen=200,
    learning_rate=0.001,
    max_epochs=10,
    dispFreq=100,
    saveFreq=1000,
    validFreq=1000,
    saveto='model.npz',
    overwrite=True,
    patience=10,
    predicate_num=150,
    lstm_end='average',
    lstm_layers=2,
    word=False,
    word_dict_num=5000,
    relation_dict_num=8000,
    character_dict_num=200,
    cross=True,
    one_layer=False,
    en_decode_type='ff',
    qu_split=False,
    structure_number=3,
    en_pooling_type='average',  # only for pooling question when entity decoding
    relation_attention='target_attention'):
    # theano.config.warn_float64 = "raise"
    model_options = locals().copy()
    train = TextIterator(trainpath[0],
                         trainpath[1],
                         trainpath[2],
                         dict_character,
                         dict_word,
                         dict_relation,
                         predicate_num=predicate_num,
                         batch_size=model_options['batch_size'],
                         maxlen=model_options['maxlen'])

    valid = TextIterator(validpath[0],
                         validpath[1],
                         validpath[2],
                         dict_character,
                         dict_word,
                         dict_relation,
                         predicate_num=predicate_num,
                         batch_size=model_options['batch_size'],
                         maxlen=model_options['maxlen'])

    InitParamsIns = InitParams()
    tparams = InitParamsIns.inittparams(model_options)
    ModelIns = MODEL()
    print('Build Train and Valid Model...', end=' ')
    x, x_mask, y, y_mask, z_rel, z_mask_rel, z_wor, chz_mask_wor, z_cha, chz_mask_cha, t, cost, errors = ModelIns.BuildTrainModel(
        tparams, model_options)
    x_v, x_mask_v, y_v, y_mask_v, z_rel_v, z_mask_rel_v, z_wor_v, chz_mask_wor_v, z_cha_v, chz_mask_cha_v, t_v, errors_v, en_errors_v, pr_errors_v = ModelIns.BuildValidTestModel(
        tparams, model_options)
    print('Done')
    inputs_v = [
        x_v, x_mask_v, y_v, y_mask_v, z_rel_v, z_mask_rel_v, z_wor_v,
        chz_mask_wor_v, z_cha_v, chz_mask_cha_v, t_v
    ]
    inputs = [
        x, x_mask, y, y_mask, z_rel, z_mask_rel, z_wor, chz_mask_wor, z_cha,
        chz_mask_cha, t
    ]

    # alpha=[pr_alpha]
    outputs = [cost, errors]
    outputs_v = [errors_v, en_errors_v, pr_errors_v]
    func_ctx = theano.function(inputs,
                               outputs,
                               on_unused_input='ignore',
                               allow_input_downcast=True,
                               mode=NanGuardMode(nan_is_error=True,
                                                 inf_is_error=True,
                                                 big_is_error=True))
    func_valid_error = theano.function(inputs_v,
                                       outputs_v,
                                       on_unused_input='ignore',
                                       allow_input_downcast=True)

    # func_p = theano.function(inputs,p,on_unused_input = 'ignore',allow_input_downcast=True)
    # func_alpha = theano.function(inputs,pr_alpha,on_unused_input = 'ignore',allow_input_downcast=True)
    print 'Building grad...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    print 'Building optimizers...',
    lr = tensor.scalar(name='lr')
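    # Two-function optimizer pattern: f_grad_shared evaluates the cost and
    # stores the gradients in shared variables; f_update then applies the
    # accumulated adadelta step. (Assumption: this adadelta follows the common
    # dl4mt-style interface, where lr is kept only for a uniform signature.)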
    f_grad_shared, f_update = adadelta(lr, tparams, grads, inputs, cost)
    print 'Done'

    uidx = 0
    best_p = None
    estop = False
    bad_counter = 0
    history_right = []
    for epoch_idx in xrange(max_epochs):
        n_samples = 0
        for source, target, entity, predicate_relation, predicate_word, predicate_character in train:
            n_samples += len(source)
            uidx += 1

            if source is None:
                print 'Minibatch with zero sample'
                uidx -= 1
                continue

            prepare_layer = PrepareDate(source, entity, predicate_character)
            x, x_mask, y, y_mask, z_relation, \
            z_mask_relation, z_word, z_mask_word, z_character, \
            z_mask_character, t = prepare_layer.prepare_valid_test_date_for_cross(source, entity,
                                                                                  predicate_relation, predicate_word,
                                                                                  predicate_character, target)
            ud_start = time.time()

            cost = f_grad_shared(x, x_mask, y, y_mask, z_relation,
                                 z_mask_relation, z_word, z_mask_word,
                                 z_character, z_mask_character, t)
            # ctx_qu_rel,ctx_qu_wor,ctx_qu_cha,ctx_pr_rel,ctx_pr_wor,ctx_pr_cha=func_p(x, x_mask, y, y_mask, z_relation,
            # z_mask_relation,z_word, z_mask_word,z_character,
            # z_mask_character,t)
            f_update(learning_rate)
            ud = time.time() - ud_start
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                break
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', epoch_idx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud, 'learning_rate', learning_rate

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving the best model...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto,
                            history_errs=history_right,
                            uidx=uidx,
                            **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'
                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {0}...'.format(uidx),
                    saveto_uidx = '{0}.iter{1}.npz'.format(
                        os.path.splitext(saveto)[0], uidx)
                    numpy.savez(saveto_uidx,
                                history_errs=history_right,
                                uidx=uidx,
                                **unzip(tparams))
                    print 'Done'
            # validate the model on the validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                rights = []
                for source, target, entity, predicate_relation, predicate_word, predicate_character in valid:
                    valid_prepare_layer = PrepareDate(source, entity,
                                                      predicate_character)
                    x, x_mask, y, y_mask, z_relation, \
                    z_mask_relation,z_word, z_mask_word,z_character, \
                    z_mask_character,t= valid_prepare_layer.prepare_valid_test_date_for_cross(source, entity,
                                                                                predicate_relation,predicate_word,
                                                                                predicate_character,target)

                    right = func_valid_error(x, x_mask, y, y_mask, z_relation,
                                             z_mask_relation, z_word,
                                             z_mask_word, z_character,
                                             z_mask_character, t)

                    rights.append(right[0])

                right_arr = numpy.array(rights)
                valid_right = right_arr.mean() / valid_batch_size
                history_right.append(valid_right)

                if uidx == 0 or valid_right >= numpy.array(
                        history_right).max():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_right) > patience and valid_right <= numpy.array(
                        history_right)[:-patience].max():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break
                # if numpy.isnan(valid_err):
                #     ipdb.set_trace()
                print 'Valid ', valid_right
        print 'seen %d samples' % n_samples
        if estop:
            break
        print 'Saving the model at epoch {0}...'.format(epoch_idx),
        saveto_uidx = '{0}.epoch{1}.npz'.format(
            os.path.splitext(saveto)[0], epoch_idx)
        numpy.savez(saveto_uidx,
                    history_errs=history_right,
                    uidx=uidx,
                    **unzip(tparams))
        print 'Done'
    if best_p is not None:
        zipp(best_p, tparams)

    rights = []
    for source, target, entity, predicate_relation, predicate_word, predicate_character in valid:
        valid_prepare_layer = PrepareDate(source, entity, predicate_character)
        x, x_mask, y, y_mask, z_relation, \
        z_mask_relation,z_word, z_mask_word,z_character, \
        z_mask_character,t= valid_prepare_layer.prepare_valid_test_date_for_cross(source, entity,
                                                                    predicate_relation,predicate_word,
                                                                    predicate_character,target)
        right = func_valid_error(x, x_mask, y, y_mask, z_relation,
                                 z_mask_relation, z_word, z_mask_word,
                                 z_character, z_mask_character, t)
        rights.append(right[0])

    right_arr = numpy.array(rights)
    valid_right = right_arr.mean() / valid_batch_size

    print 'Valid ', valid_right
    # train_err =numpy.array(p_train).mean()/batch_size
    params = copy.copy(best_p) if best_p is not None else unzip(tparams)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_right,
                uidx=uidx,
                **params)
    return valid_right
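
The validation block above implements patience-based early stopping: training halts once the score has failed to beat the best score recorded at least `patience` checks earlier for more than `patience` consecutive validations. A self-contained sketch of the same rule (the score sequence is made up):

import numpy as np

def no_recent_improvement(history, patience):
    # True once the latest score fails to beat the best score recorded
    # at least `patience` validations ago (higher is better).
    return (len(history) > patience
            and history[-1] <= np.array(history)[:-patience].max())

scores = []        # mirrors history_right above
bad_counter = 0
patience = 2
for score in [0.50, 0.55, 0.56, 0.55, 0.54, 0.53, 0.52]:
    scores.append(score)
    if score >= max(scores):  # new best: reset, as in the trainer
        bad_counter = 0
    elif no_recent_improvement(scores, patience):
        bad_counter += 1
        if bad_counter > patience:
            print('Early Stop!')
            break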
Example #24
0
    def setup(self):
        """
        Set up the model to train.
        """

        # input_words: shape (n_batch, n_sentence, sentence_len)
        input_words = T.itensor3()
        n_batch, n_sentences, sentence_len = input_words.shape
        # query_words: shape (n_batch, query_len)
        query_words = T.imatrix()
        # correct_output: shape (n_batch, ?, num_output_words)
        correct_output = T.ftensor3()

        # graph_num_new_nodes: shape(n_batch, n_sentence)
        graph_num_new_nodes = T.imatrix()
        # graph_new_node_strengths: shape(n_batch, n_sentence, new_nodes_per_iter)
        graph_new_node_strengths = T.ftensor3()
        # graph_new_node_ids: shape(n_batch, n_sentence, new_nodes_per_iter, num_node_ids)
        graph_new_node_ids = T.ftensor4()
        # graph_new_edges: shape(n_batch, n_sentence, pad_graph_size, pad_graph_size, num_edge_types)
        graph_new_edges = T.TensorType('floatX', (False, ) * 5)()

        def _build(with_correct_graph, snap_to_best, using_dropout,
                   evaluate_accuracy):
            info = {}
            # Process each sentence, flattened to (?, sentence_len)
            flat_input_words = input_words.reshape([-1, sentence_len])
            flat_input_reprs, flat_ref_matrices = self.input_transformer.process(
                flat_input_words)
            # flat_input_reprs of shape (?, input_repr_size)
            # flat_ref_matrices of shape (?, num_node_ids, input_repr_size)
            input_reprs = flat_input_reprs.reshape(
                [n_batch, n_sentences, self.input_repr_size])
            ref_matrices = flat_ref_matrices.reshape([
                n_batch, n_sentences, self.num_node_ids, self.input_repr_size
            ])

            query_repr, query_ref_matrix = self.input_transformer.process(
                query_words)

            if using_dropout:
                iter_dropouts = []
                states_mask = util.make_dropout_mask(
                    (self.node_state_size, ), self.dropout_keep, self.srng)
                if self.nodes_mutable:
                    iter_dropouts.extend(
                        self.node_state_updater.dropout_masks(
                            self.srng, states_mask))
                if len(self.word_node_mapping) > 0:
                    iter_dropouts.extend(
                        self.direct_reference_updater.dropout_masks(
                            self.srng, states_mask))
                if self.intermediate_propagate != 0:
                    iter_dropouts.extend(
                        self.intermediate_propagator.dropout_masks(
                            self.srng, states_mask))
                if self.dynamic_nodes:
                    iter_dropouts.extend(
                        self.new_node_adder.dropout_masks(self.srng))
                iter_dropouts.extend(
                    self.edge_state_updater.dropout_masks(self.srng))
            else:
                iter_dropouts = []
                states_mask = None

            def _iter_fn(input_repr,
                         ref_matrix,
                         gstate,
                         correct_num_new_nodes=None,
                         correct_new_strengths=None,
                         correct_new_node_ids=None,
                         correct_edges=None,
                         dropout_masks=None):
                # If necessary, update node state
                if self.nodes_mutable:
                    gstate, dropout_masks = self.node_state_updater.process(
                        gstate, input_repr, dropout_masks)

                if len(self.word_node_mapping) > 0:
                    gstate, dropout_masks = self.direct_reference_updater.process(
                        gstate, ref_matrix, dropout_masks)

                # If necessary, propagate node state
                if self.intermediate_propagate != 0:
                    gstate, dropout_masks = self.intermediate_propagator.process_multiple(
                        gstate, self.intermediate_propagate, dropout_masks)

                node_loss = None
                node_accuracy = None
                # Propose and vote on new nodes
                if self.dynamic_nodes:
                    new_strengths, new_ids, dropout_masks = self.new_node_adder.get_candidates(
                        gstate, input_repr, self.new_nodes_per_iter,
                        dropout_masks)
                    # new_strengths and correct_new_strengths are of shape (n_batch, new_nodes_per_iter)
                    # new_ids and correct_new_node_ids are of shape (n_batch, new_nodes_per_iter, num_node_ids)
                    if with_correct_graph:
                        perm_idxs = np.array(
                            list(
                                itertools.permutations(
                                    range(self.new_nodes_per_iter))))
                        permuted_correct_str = correct_new_strengths[:, perm_idxs]
                        permuted_correct_ids = correct_new_node_ids[:, perm_idxs]
                        # due to advanced indexing, we should have shape (n_batch, permutation, new_nodes_per_iter, num_node_ids)
                        ext_new_str = T.shape_padaxis(new_strengths, 1)
                        ext_new_ids = T.shape_padaxis(new_ids, 1)
                        strength_ll = permuted_correct_str * T.log(
                            ext_new_str +
                            util.EPSILON) + (1 - permuted_correct_str) * T.log(
                                1 - ext_new_str + util.EPSILON)
                        ids_ll = permuted_correct_ids * T.log(ext_new_ids +
                                                              util.EPSILON)
                        reduced_perm_lls = T.sum(strength_ll, axis=2) + T.sum(
                            ids_ll, axis=[2, 3])
                        if self.best_node_match_only:
                            node_loss = -T.max(reduced_perm_lls, 1)
                        else:
                            full_ll = util.reduce_log_sum(reduced_perm_lls, 1)
                            # Note that some of these permutations are identical, since we likely did not add the maximum
                            # amount of nodes. Thus we will have added repeated elements here.
                            # We have log(x+x+...+x) = log(kx), where k is the repetition factor and x is the probability we want
                            # log(kx) = log(k) + log(x)
                            # Our repetition factor k is given by (new_nodes_per_iter - correct_num_new_nodes)!
                            # Recall that n! = gamma(n+1)
                            # so log(x) = log(kx) - log(gamma(k+1))
                            log_rep_factor = T.gammaln(
                                T.cast(
                                    self.new_nodes_per_iter -
                                    correct_num_new_nodes + 1, 'floatX'))
                            scaled_ll = full_ll - log_rep_factor
                            node_loss = -scaled_ll
                        if evaluate_accuracy:
                            best_match_idx = T.argmax(reduced_perm_lls, 1)
                            # should be of shape (n_batch), indexing the best permutation
                            best_correct_str = permuted_correct_str[
                                T.arange(n_batch), best_match_idx]
                            best_correct_ids = permuted_correct_ids[
                                T.arange(n_batch), best_match_idx]
                            snapped_strengths = util.independent_best(
                                new_strengths)
                            snapped_ids = util.categorical_best(
                                new_ids) * T.shape_padright(snapped_strengths)
                            close_strengths = T.all(
                                T.isclose(best_correct_str, snapped_strengths),
                                (1))
                            close_ids = T.all(
                                T.isclose(best_correct_ids, snapped_ids),
                                (1, 2))
                            node_accuracy = T.and_(close_strengths, close_ids)
                        # now substitute in the correct nodes
                        gstate = gstate.with_additional_nodes(
                            correct_new_strengths, correct_new_node_ids)
                    elif snap_to_best:
                        snapped_strengths = util.independent_best(
                            new_strengths)
                        snapped_ids = util.categorical_best(new_ids)
                        gstate = gstate.with_additional_nodes(
                            snapped_strengths, snapped_ids)
                    else:
                        gstate = gstate.with_additional_nodes(
                            new_strengths, new_ids)

                # Update edge state
                gstate, dropout_masks = self.edge_state_updater.process(
                    gstate, input_repr, dropout_masks)
                if with_correct_graph:
                    cropped_correct_edges = correct_edges[:, :gstate.n_nodes, :gstate.n_nodes, :]
                    edge_lls = cropped_correct_edges * T.log(
                        gstate.edge_strengths +
                        util.EPSILON) + (1 - cropped_correct_edges) * T.log(
                            1 - gstate.edge_strengths + util.EPSILON)
                    # edge_lls currently penalizes for edges connected to nodes that do not exist
                    # we do not want it to do this, so we mask it with node strengths
                    mask_src = util.shape_padaxes(gstate.node_strengths,
                                                  [2, 3])
                    mask_dest = util.shape_padaxes(gstate.node_strengths,
                                                   [1, 3])
                    masked_edge_lls = edge_lls * mask_src * mask_dest
                    edge_loss = -T.sum(masked_edge_lls, axis=[1, 2, 3])
                    if evaluate_accuracy:
                        snapped_edges = util.independent_best(
                            gstate.edge_strengths)
                        close_edges = T.isclose(cropped_correct_edges,
                                                snapped_edges)
                        ok_mask = 1 - T.cast(
                            mask_src * mask_dest, 'int8'
                        )  # it's OK for things not to match if node strengths are NOT both 1
                        edge_accuracy = T.all(T.or_(close_edges, ok_mask),
                                              (1, 2, 3))
                        overall_accuracy = edge_accuracy if node_accuracy is None else T.and_(
                            node_accuracy, edge_accuracy)
                    else:
                        overall_accuracy = None
                    gstate = gstate.with_updates(
                        edge_strengths=cropped_correct_edges)
                    return gstate, node_loss, edge_loss, overall_accuracy
                elif snap_to_best:
                    snapped_edges = util.independent_best(
                        gstate.edge_strengths)
                    gstate = gstate.with_updates(edge_strengths=snapped_edges)
                    return gstate
                else:
                    return gstate

            # Scan over each sentence
            def _scan_fn(
                input_repr, *stuff
            ):  # (input_repr, [ref_matrix?], [*correct_graph_stuff?], [dropout_masks?], *flat_graph_state, pad_graph_size)
                stuff = list(stuff)

                if len(self.word_node_mapping) > 0:
                    ref_matrix = stuff[0]
                    stuff = stuff[1:]
                else:
                    ref_matrix = None

                if with_correct_graph:
                    c_num_new_nodes, c_new_strengths, c_new_node_ids, c_edges = stuff[:4]
                    stuff = stuff[4:]

                if using_dropout:
                    dropout_masks = stuff[:len(iter_dropouts)]
                    stuff = stuff[len(iter_dropouts):]
                else:
                    dropout_masks = None

                flat_graph_state = stuff[:-1]
                pad_graph_size = stuff[-1]
                gstate = GraphState.unflatten_from_const_size(flat_graph_state)

                if with_correct_graph:
                    gstate, node_loss, edge_loss, overall_accuracy = _iter_fn(
                        input_repr,
                        ref_matrix,
                        gstate,
                        c_num_new_nodes,
                        c_new_strengths,
                        c_new_node_ids,
                        c_edges,
                        dropout_masks=dropout_masks)
                else:
                    gstate = _iter_fn(input_repr,
                                      ref_matrix,
                                      gstate,
                                      dropout_masks=dropout_masks)

                retvals = gstate.flatten_to_const_size(pad_graph_size)
                if with_correct_graph:
                    if self.dynamic_nodes:
                        retvals.append(node_loss)
                    retvals.append(edge_loss)
                    if evaluate_accuracy:
                        retvals.append(overall_accuracy)
                return retvals

            if self.dynamic_nodes:
                initial_gstate = GraphState.create_empty(
                    n_batch, self.num_node_ids, self.node_state_size,
                    self.num_edge_types)
            else:
                initial_gstate = GraphState.create_full_unique(
                    n_batch, self.num_node_ids, self.node_state_size,
                    self.num_edge_types)

            # Account for all nodes, plus the extra padding node to prevent GPU unpleasantness
            if self.dynamic_nodes:
                pad_graph_size = n_sentences * self.new_nodes_per_iter + 1
            else:
                pad_graph_size = self.num_node_ids
            outputs_info = initial_gstate.flatten_to_const_size(pad_graph_size)
            prepped_input = input_reprs.dimshuffle([1, 0, 2])
            sequences = [prepped_input]
            if len(self.word_node_mapping) > 0:
                sequences.append(ref_matrices.dimshuffle([1, 0, 2, 3]))
            if with_correct_graph:
                sequences.append(graph_num_new_nodes.swapaxes(0, 1))
                sequences.append(graph_new_node_strengths.swapaxes(0, 1))
                sequences.append(graph_new_node_ids.swapaxes(0, 1))
                sequences.append(graph_new_edges.swapaxes(0, 1))

                if self.dynamic_nodes:
                    outputs_info.extend([None])
                if evaluate_accuracy:
                    outputs_info.extend([None])
                outputs_info.extend([None])
            if using_dropout:
                sequences.extend(iter_dropouts)
            all_scan_out, _ = theano.scan(_scan_fn,
                                          sequences=sequences,
                                          outputs_info=outputs_info,
                                          non_sequences=[pad_graph_size])
            graph_accurate_list = None
            if with_correct_graph:
                if evaluate_accuracy:
                    full_graph_accuracy = all_scan_out[-1]
                    all_scan_out = all_scan_out[:-1]
                    graph_accurate_list = T.all(full_graph_accuracy, 0)
                    info["graph_accuracy"] = T.sum(graph_accurate_list,
                                                   dtype='floatX') / T.cast(
                                                       n_batch, 'floatX')
                if self.dynamic_nodes:
                    all_flat_gstates = all_scan_out[:-2]
                    node_loss, edge_loss = all_scan_out[-2:]
                    reduced_node_loss = T.sum(node_loss) / T.cast(
                        n_batch, 'floatX')
                    reduced_edge_loss = T.sum(edge_loss) / T.cast(
                        n_batch, 'floatX')
                    avg_graph_loss = (reduced_node_loss +
                                      reduced_edge_loss) / T.cast(
                                          input_words.shape[1], 'floatX')
                    info["node_loss"] = reduced_node_loss
                    info["edge_loss"] = reduced_edge_loss
                else:
                    all_flat_gstates = all_scan_out[:-1]
                    edge_loss = all_scan_out[-1]
                    reduced_edge_loss = T.sum(edge_loss) / T.cast(
                        n_batch, 'floatX')
                    avg_graph_loss = reduced_edge_loss / T.cast(
                        input_words.shape[1], 'floatX')
                    info["edge_loss"] = reduced_edge_loss
            else:
                all_flat_gstates = all_scan_out

            if self.sequence_representation:
                # Each part of all_flat_gstates is of shape (n_sentences, n_batch, ...)
                # except for the last one, which we handle separately
                # Swap to (n_batch, n_sentences, ...)
                # Then flatten to (n_batch*n_sentences, ...) for further processing
                final_flat_gstate = [
                    x.swapaxes(0, 1).reshape(T.concatenate([[-1],
                                                            x.shape[2:]]),
                                             ndim=(x.ndim - 1))
                    for x in all_flat_gstates[:-1]
                ]
                # As for the last one, we need to get a single scalar value. The last one will be
                # the biggest, so we will take that. Note that this will introduce a bunch of
                # zero-nodes, but that's OK and we can process that later. (We REQUIRE that
                # padding in graph_state makes zero-strength nodes here!)
                final_flat_gstate.append(all_flat_gstates[-1][-1])
                # We also need to repeat query_repr and query_ref_matrix so that they broadcast together
                query_repr = T.extra_ops.repeat(query_repr, n_sentences, 0)
                query_ref_matrix = T.extra_ops.repeat(query_ref_matrix,
                                                      n_sentences, 0)
            else:
                # Extract last timestep
                final_flat_gstate = [x[-1] for x in all_flat_gstates]
            final_gstate = GraphState.unflatten_from_const_size(
                final_flat_gstate)

            if self.train_with_query:
                if self.wipe_node_state:
                    final_gstate = final_gstate.with_updates(
                        node_states=T.zeros_like(final_gstate.node_states))

                qnsu_dropout_masks = self.query_node_state_updater.dropout_masks(
                    self.srng, states_mask)
                query_gstate, _ = self.query_node_state_updater.process(
                    final_gstate, query_repr, qnsu_dropout_masks)

                if len(self.word_node_mapping) > 0:
                    qdru_dropout_masks = self.query_direct_reference_updater.dropout_masks(
                        self.srng, states_mask)
                    query_gstate, _ = self.query_direct_reference_updater.process(
                        query_gstate, query_ref_matrix, qdru_dropout_masks)

                fp_dropout_masks = self.final_propagator.dropout_masks(
                    self.srng, states_mask)
                propagated_gstate, _ = self.final_propagator.process_multiple(
                    query_gstate, self.final_propagate, fp_dropout_masks)

                agg_dropout_masks = self.aggregator.dropout_masks(self.srng)
                aggregated_repr, _ = self.aggregator.process(
                    propagated_gstate,
                    agg_dropout_masks)  # shape (n_batch, output_repr_size)

                if self.sequence_representation:
                    # aggregated_repr is of shape (n_batch*n_sentences, repr_width)
                    # We want to split back to timesteps: (n_batch, n_sentences, repr_width)
                    agg_repr_seq = aggregated_repr.reshape(
                        [n_batch, n_sentences, -1])
                    # Now collapse it to a summary representation
                    aggsum_dropout_masks = self.aggregate_summarizer.dropout_masks(
                        self.srng)
                    aggregated_repr, _ = self.aggregate_summarizer.process(
                        agg_repr_seq, aggsum_dropout_masks)
                    # At this point aggregated_repr is (n_batch, repr_width) as desired

                max_seq_len = correct_output.shape[1]
                if self.output_format == ModelOutputFormat.sequence:
                    final_output = self.output_processor.process(
                        aggregated_repr,
                        max_seq_len)  # shape (n_batch, ?, num_output_words)
                else:
                    final_output = self.output_processor.process(
                        aggregated_repr)

                if snap_to_best:
                    final_output = self.output_processor.snap_to_best(
                        final_output)

                if self.output_format == ModelOutputFormat.subset:
                    elemwise_loss = T.nnet.binary_crossentropy(
                        final_output, correct_output)
                    query_loss = T.sum(elemwise_loss)
                else:
                    flat_final_output = final_output.reshape(
                        [-1, self.num_output_words])
                    flat_correct_output = correct_output.reshape(
                        [-1, self.num_output_words])
                    timewise_loss = T.nnet.categorical_crossentropy(
                        flat_final_output, flat_correct_output)
                    query_loss = T.sum(timewise_loss)
                query_loss = query_loss / T.cast(n_batch, 'floatX')
                info["query_loss"] = query_loss
            else:
                final_output = T.zeros([])

            full_loss = np.array(0.0, np.float32)
            if with_correct_graph:
                full_loss = full_loss + avg_graph_loss
            if self.train_with_query:
                full_loss = full_loss + query_loss

            if self.train_with_query:
                adjusted_query_gstates = [
                    x.reshape(T.concatenate([[n_batch, n_sentences],
                                             x.shape[1:]]),
                              ndim=(x.ndim + 1))
                    if self.sequence_representation else T.shape_padaxis(x, 1)
                    for x in query_gstate.flatten()
                ]
                adjusted_prop_gstates = [
                    x.reshape(T.concatenate([[n_batch, n_sentences],
                                             x.shape[1:]]),
                              ndim=(x.ndim + 1))
                    if self.sequence_representation else T.shape_padaxis(x, 1)
                    for x in propagated_gstate.flatten()
                ]
                full_flat_gstates = [
                    T.concatenate([a.swapaxes(0, 1), b, c], 1) for a, b, c in
                    zip(all_flat_gstates[:-1], adjusted_query_gstates,
                        adjusted_prop_gstates)
                ]
            else:
                full_flat_gstates = [
                    a.swapaxes(0, 1) for a in all_flat_gstates[:-1]
                ]
                max_seq_len = T.iscalar()
            return full_loss, final_output, full_flat_gstates, graph_accurate_list, max_seq_len, info

        train_loss, _, _, _, _, train_info = _build(self.train_with_graph,
                                                    False, True, False)
        adam_updates = Adam(train_loss, self.params, lr=self.learning_rate_var)

        self.info_keys = list(train_info.keys())

        print("Compiling...")

        optimizer = theano.compile.predefined_optimizers[
            'fast_run' if self.check_mode == 'debug' else theano.config.optimizer]
        optimizer = optimizer.excluding(
            "scanOp_pushout_output", "remove_constants_and_unused_inputs_scan")
        if self.check_mode == 'nan':
            mode = NanGuardMode(optimizer=optimizer,
                                nan_is_error=True,
                                inf_is_error=True,
                                big_is_error=True)
        elif self.check_mode == 'debug':
            mode = DebugMode(optimizer=optimizer,
                             check_isfinite=False,
                             check_py_code=False,
                             stability_patience=1)
            theano.tensor.TensorType.filter_checks_isfinite = False
        else:
            mode = theano.Mode(optimizer=optimizer)
        self.train_fn = theano.function([
            input_words, query_words, correct_output, graph_num_new_nodes,
            graph_new_node_strengths, graph_new_node_ids, graph_new_edges
        ], [train_loss] + list(train_info.values()),
                                        updates=adam_updates,
                                        allow_input_downcast=True,
                                        on_unused_input='ignore',
                                        mode=mode)

        eval_loss, _, full_flat_gstates, graph_accurate_list, _, eval_info = _build(
            self.train_with_graph, False, False, True)
        self.eval_info_keys = list(eval_info.keys())
        self.eval_fn = theano.function([
            input_words, query_words, correct_output, graph_num_new_nodes,
            graph_new_node_strengths, graph_new_node_ids, graph_new_edges
        ], [eval_loss, graph_accurate_list] + list(eval_info.values()),
                                       allow_input_downcast=True,
                                       on_unused_input='ignore',
                                       mode=mode)

        self.debug_test_fn = theano.function([
            input_words, query_words, correct_output, graph_num_new_nodes,
            graph_new_node_strengths, graph_new_node_ids, graph_new_edges
        ],
                                             full_flat_gstates,
                                             allow_input_downcast=True,
                                             on_unused_input='ignore',
                                             mode=mode)

        test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(
            False, False, False, False)
        self.fuzzy_test_fn = theano.function(
            [input_words, query_words] +
            ([max_seq_len] if self.output_format == ModelOutputFormat.sequence
             else []), [final_output] + full_flat_gstates,
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=mode)

        test_loss, final_output, full_flat_gstates, _, max_seq_len, _ = _build(
            False, True, False, False)
        self.snap_test_fn = theano.function(
            [input_words, query_words] +
            ([max_seq_len] if self.output_format == ModelOutputFormat.sequence
             else []), [final_output] + full_flat_gstates,
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=mode)
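
The permutation-matching loss in the middle of this example corrects for over-counted identical permutations with a log repetition factor computed via gammaln. A numeric check of that identity, with purely illustrative values:

import math
import numpy as np
from scipy.special import gammaln, logsumexp

# 4 node slots but only 2 real new nodes: every ordering of the 2 unused
# slots yields the same assignment, so each distinct assignment is counted
# k = (4 - 2)! = 2 times when summing over slot permutations.
new_nodes_per_iter, correct_num_new_nodes = 4, 2
k = math.factorial(new_nodes_per_iter - correct_num_new_nodes)

log_x = -3.7                          # log-prob of one distinct assignment
full_ll = logsumexp([log_x] * k)      # log(k * x): the over-counted total
# log(k), since k = n! and log(n!) = gammaln(n + 1)
log_rep_factor = gammaln(new_nodes_per_iter - correct_num_new_nodes + 1)

assert np.isclose(full_ll - log_rep_factor, log_x)  # log(x) = log(kx) - log(k)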
Example #25
0
    def update_opt(self, loss, target, leq_constraint, inputs, extra_inputs=None, constraint_name="constraint", *args,
                   **kwargs):
        """
        :param loss: Symbolic expression for the loss function.
        :param target: A parameterized object to optimize over. It should implement methods of the
        :class:`rllab.core.paramerized.Parameterized` class.
        :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
        :param inputs: A list of symbolic variables as inputs, which could be subsampled if needed. It is assumed
        that the first dimension of these inputs should correspond to the number of data points
        :param extra_inputs: A list of symbolic variables as extra inputs which should not be subsampled
        :return: No return value.
        """

        inputs = tuple(inputs)
        if extra_inputs is None:
            extra_inputs = tuple()
        else:
            extra_inputs = tuple(extra_inputs)

        constraint_term, constraint_value = leq_constraint

        params = target.get_params(trainable=True)
        grads = theano.grad(loss, wrt=params)
        flat_grad = ext.flatten_tensor_variables(grads)

        constraint_grads = theano.grad(constraint_term, wrt=params)
        xs = tuple([ext.new_tensor_like("%s x" % p.name, p) for p in params])
        Hx_plain_splits = TT.grad(
            TT.sum([TT.sum(g * x) for g, x in itertools.izip(constraint_grads, xs)]),
            wrt=params,
        )
        Hx_plain = TT.concatenate([TT.flatten(s) for s in Hx_plain_splits])

        self._target = target
        self._max_constraint_val = constraint_value
        self._constraint_name = constraint_name


        if self._debug_nan:
            from theano.compile.nanguardmode import NanGuardMode
            mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        else:
            mode = None

        self._opt_fun = ext.lazydict(
            f_loss=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=loss,
                log_name="f_loss",
                mode=mode,
            ),
            f_grad=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=flat_grad,
                log_name="f_grad",
                mode=mode,
            ),
            f_Hx_plain=lambda: ext.compile_function(
                inputs=inputs + extra_inputs + xs,
                outputs=Hx_plain,
                log_name="f_Hx_plain",
                mode=mode,
            ),
            f_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=constraint_term,
                log_name="constraint",
                mode=mode,
            ),
            f_loss_constraint=lambda: ext.compile_function(
                inputs=inputs + extra_inputs,
                outputs=[loss, constraint_term],
                log_name="f_loss_constraint",
                mode=mode,
            ),
        )
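
Hx_plain above is the standard double-backprop trick for Hessian-vector products: differentiating the inner product of the gradient with a fixed vector yields Hx without ever materializing the Hessian. A minimal sketch verifying the identity on a quadratic (names and values are illustrative):

import numpy as np
import theano
import theano.tensor as TT

# f(w) = 0.5 * w^T A w has gradient A w and Hessian A (for symmetric A), so
# the identity H v = grad_w( sum(grad_w(f) * v) ) is easy to check.
A = np.array([[2.0, 1.0], [1.0, 3.0]], dtype=theano.config.floatX)

w = TT.vector('w')
v = TT.vector('v')
f = 0.5 * TT.dot(w, TT.dot(A, w))

g = TT.grad(f, wrt=w)                # symbolic gradient, g = A w
Hv = TT.grad(TT.sum(g * v), wrt=w)   # same trick as Hx_plain above

f_Hv = theano.function([w, v], Hv)
w0 = np.asarray([1.0, -2.0], dtype=theano.config.floatX)
v0 = np.asarray([0.5, 1.0], dtype=theano.config.floatX)
assert np.allclose(f_Hv(w0, v0), A.dot(v0))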
Example #26
0
def main(num_epochs=1,
         n_songs_train=1,
         n_songs_val=1,
         n_songs_test=1,
         batch_size=256,
         learning_rate=1e-4):
    """
    Main function
    """

    # Theano config
    theano.config.floatX = 'float32'

    train, val, test = None, None, None
    try:
        train, val, test = use_preparsed_data(outputdir='/zap/tsob/audio/', )
    except Exception:
        # fall back to parsing the data from scratch if no preparsed data exists
        train, val, test = get_data(n_songs_train=n_songs_train,
                                    n_songs_val=n_songs_val,
                                    n_songs_test=n_songs_test,
                                    outputdir='/zap/tsob/audio/',
                                    seed=None)

    # Save the returned metadata
    np.savez('/zap/tsob/audio/metadata', train, val, test)

    # Print the dimensions
    print "Data dimensions:"
    for datapt in [
            train['Xshape'], train['yshape'], val['Xshape'], val['yshape'],
            test['Xshape'], test['yshape']
    ]:
        print datapt

    # Parse dimensions
    n_train = train['yshape'][0]
    n_val = val['yshape'][0]
    n_test = test['yshape'][0]
    n_chan = train['Xshape'][1]
    n_feats = train['Xshape'][2]
    n_frames = train['Xshape'][3]

    print "n_train  = {0}".format(n_train)
    print "n_val    = {0}".format(n_val)
    print "n_test   = {0}".format(n_test)
    print "n_chan   = {0}".format(n_chan)
    print "n_feats  = {0}".format(n_feats)
    print "n_frames = {0}".format(n_frames)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4(name='inputs')
    target_var = T.fcol(name='targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions..."),
    network = build_cnn(input_var)
    print("Done.")

    # Create a loss expression for training, i.e., a scalar objective we want to minimize
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.binary_hinge_loss(prediction, target_var)
    loss = loss.mean()

    # Create update expressions for training
    # Here, we'll use adam
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(loss,
                                   params,
                                   learning_rate=learning_rate,
                                   beta1=0.95,
                                   beta2=0.999,
                                   epsilon=1e-08)

    # Create a loss expression for validation/testing.
    # The crucial difference here is that we do a deterministic forward pass
    # through the network, disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)

    test_loss = lasagne.objectives.binary_hinge_loss(test_prediction,
                                                     target_var)
    test_loss = test_loss.mean()

    test_pred_fn = theano.function([input_var],
                                   test_prediction,
                                   allow_input_downcast=True)

    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function(
        [input_var, target_var],
        loss,
        updates=updates,
        mode=NanGuardMode(  #TODO remove
            nan_is_error=True,
            inf_is_error=True,
            big_is_error=True  #TODO remove
        ),  #TODO remove
        allow_input_downcast=True)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    # Finally, launch the training loop.
    print("Starting training...")

    train_error_hist = []

    # We iterate over epochs:
    for epoch in range(num_epochs):

        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()

        for batch in iterate_minibatches(train, batch_size, shuffle=True):
            inputs, targets = batch
            train_err_increment = train_fn(inputs, targets)
            train_err += train_err_increment
            train_error_hist.append(train_err_increment)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(val, batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.8f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.8f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc /
                                                          val_batches * 100))
    print("Done training.")

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_batches = 0
    test_predictions = []
    for batch in iterate_minibatches(test, batch_size, shuffle=False):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        test_predictions.append(test_pred_fn(inputs))
        test_err += err
        test_acc += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches * 100))

    # Optionally, you could now dump the network weights to a file like this:
    timestr = str(time.time())
    np.savez('/zap/tsob/audio/model' + timestr + '.npz',
             *lasagne.layers.get_all_param_values(network))
    np.save('/zap/tsob/audio/train_error_hist' + timestr + '.npy',
            train_error_hist)
    np.save('/zap/tsob/audio/test_predictions' + timestr + '.npy',
            test_predictions)
    print "Wrote model to {0}, test error histogram to {1}, and test predictions to {2}".format(
        'model' + timestr + '.npz', 'train_error_hist' + timestr + '.npy',
        'test_predictions' + timestr + '.npy')
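
The training loop above relies on an iterate_minibatches helper that is not shown in this example. A hypothetical stand-in, assuming the data dict exposes 'X' and 'y' arrays (the real helper in the source repo may differ):

import numpy as np

def iterate_minibatches(data, batch_size, shuffle=False):
    # Yield (inputs, targets) minibatches from a dict holding 'X' and 'y'
    # arrays; 'X' and 'y' are assumed keys, not confirmed by the source.
    X, y = data['X'], data['y']
    idxs = np.arange(len(y))
    if shuffle:
        np.random.shuffle(idxs)
    for start in range(0, len(y) - batch_size + 1, batch_size):
        batch = idxs[start:start + batch_size]
        yield X[batch], y[batch]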
Example #27
0
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True):

    # def setup_model_and_stream(exp_config, source_vocab, target_vocab):
    train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream = setup_model_and_stream(
        exp_config, source_vocab, target_vocab)
    cost = create_model(train_encoder, train_decoder,
                        exp_config.get('imt_smoothing_constant', 0.005))

    # Set up training model
    logger.info("Building model")
    train_model = Model(cost)

    # Set the parameters from a trained models (.npz file)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))
    # Note the brick delimiter='-' is here for legacy reasons because Blocks changed the serialization API
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(train_model, param_values)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    if exp_config.get('l2_regularization', False) is True:
        l2_reg_alpha = exp_config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

        # why do we need to rename the cost variable? Where did the original name come from?
        cost.name = 'decoder_cost_cost'

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # Note dropout variables are hard-coded here
    if exp_config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, exp_config['dropout'])

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(exp_config['saveto']):
        os.makedirs(exp_config['saveto'])
        # TODO: mv the actual config file once we switch to .yaml for min-risk
        shutil.copy(exp_config['config_file'], exp_config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=exp_config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(exp_config['saveto'],
                      every_n_batches=exp_config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    # TODO: change the if statement here
    if exp_config['hook_samples'] >= 1 or exp_config['bleu_script'] is not None:
        logger.info("Building sampling model")
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[train_decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling -- TODO: sampling is broken for min-risk
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    # TODO: use multimodal meteor and BLEU validator
    # TODO: add 'validator' key to IMT config
    if exp_config.get('bleu_script', None) is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(theano_sampling_source_input,
                          theano_sampling_context_input,
                          samples=samples,
                          config=exp_config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=exp_config['normalized_bleu'],
                          every_n_batches=exp_config['bleu_val_freq']))

    if exp_config.get('imt_f1_validation', False) is not False:
        logger.info("Building imt F1 validator")
        extensions.append(
            IMT_F1_Validator(theano_sampling_source_input,
                             theano_sampling_context_input,
                             samples=samples,
                             config=exp_config,
                             model=search_model,
                             data_stream=dev_stream,
                             src_vocab=source_vocab,
                             trg_vocab=target_vocab,
                             normalize=exp_config['normalized_bleu'],
                             every_n_batches=exp_config['bleu_val_freq']))

    # Add early stopping based on Meteor
    # if exp_config.get('meteor_directory', None) is not None:
    #     logger.info("Building meteor validator")
    #     extensions.append(
    #         MeteorValidator(theano_sampling_source_input, theano_sampling_context_input,
    #                         samples=samples,
    #                         config=config,
    #                         model=search_model, data_stream=dev_stream,
    #                         src_vocab=src_vocab,
    #                         trg_vocab=trg_vocab,
    #                         normalize=config['normalized_bleu'],
    #                         every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if exp_config['reload']:
        extensions.append(LoadNMT(exp_config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(exp_config['model_save_directory'],
                 channels=[[
                     'decoder_cost_cost', 'validation_set_imt_f1_score',
                     'validation_set_bleu_score', 'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")

    # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph
    # WORKING: try to catch and fix nan
    if exp_config['dropout'] < 1.0:
        if exp_config.get('nan_guard', False):
            from theano.compile.nanguardmode import NanGuardMode
            algorithm = GradientDescent(cost=cg.outputs[0],
                                        parameters=cg.parameters,
                                        step_rule=CompositeRule([
                                            StepClipping(
                                                exp_config['step_clipping']),
                                            eval(exp_config['step_rule'])()
                                        ]),
                                        on_unused_sources='warn',
                                        theano_func_kwargs={
                                            'mode':
                                            NanGuardMode(nan_is_error=True,
                                                         inf_is_error=True)
                                        })
        else:
            algorithm = GradientDescent(cost=cg.outputs[0],
                                        parameters=cg.parameters,
                                        step_rule=CompositeRule([
                                            StepClipping(
                                                exp_config['step_clipping']),
                                            eval(exp_config['step_rule'])()
                                        ]),
                                        on_unused_sources='warn')
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(
                                            exp_config['step_clipping']),
                                        eval(exp_config['step_rule'])()
                                    ]),
                                    on_unused_sources='warn')

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=train_model,
                         algorithm=algorithm,
                         data_stream=masked_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
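
The three GradientDescent calls above differ only in whether theano_func_kwargs carries a NanGuardMode. A sketch of one way to factor that out (not from the original source; assumes the same Blocks API the example already uses):

from blocks.algorithms import CompositeRule, GradientDescent, StepClipping

def make_algorithm(cost, cg, exp_config):
    # Build the (clipping + step-rule) composite once.
    step_rule = CompositeRule([
        StepClipping(exp_config['step_clipping']),
        eval(exp_config['step_rule'])(),
    ])
    kwargs = {}
    if exp_config.get('nan_guard', False):
        from theano.compile.nanguardmode import NanGuardMode
        # Compile the training function under NanGuardMode only on request.
        kwargs['theano_func_kwargs'] = {
            'mode': NanGuardMode(nan_is_error=True, inf_is_error=True)
        }
    return GradientDescent(cost=cost,
                           parameters=cg.parameters,
                           step_rule=step_rule,
                           on_unused_sources='warn',
                           **kwargs)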
Example #28
0
    def build(self):
        config = self.config
        processor = self.processor

        source_inputs = T.imatrix()
        target_inputs = T.imatrix()
        target_outputs = T.imatrix()
        source_mask_inputs = T.matrix()
        target_mask_inputs = T.matrix()
        # map_inputs = T.tensor3()

        l_source_inputs = lasagne.layers.InputLayer(shape=(None,
                                                           config.source_len),
                                                    input_var=source_inputs)
        l_target_inputs = lasagne.layers.InputLayer(shape=(None,
                                                           config.target_len),
                                                    input_var=target_inputs)
        l_output = lasagne.layers.InputLayer(shape=(None, config.target_len),
                                             input_var=target_outputs)
        l_source_mask_inputs = lasagne.layers.InputLayer(
            shape=(None, config.source_len), input_var=source_mask_inputs)
        l_target_mask_inputs = lasagne.layers.InputLayer(
            shape=(None, config.target_len), input_var=target_mask_inputs)
        # l_map_inputs = lasagne.layers.InputLayer(shape=(None, config.source_len, processor.target_vocab_size),
        #                                          input_var=map_inputs)

        l_source = lasagne.layers.EmbeddingLayer(l_source_inputs,
                                                 processor.source_vocab_size,
                                                 config.embedding_size)
        l_target = lasagne.layers.EmbeddingLayer(l_target_inputs,
                                                 processor.target_vocab_size,
                                                 config.embedding_size)
        self.W1 = l_source.W
        self.W2 = l_target.W
        # T.sum(l_source.W)
        # l_s_gru_fw = lasagne.layers.GRULayer(l_source, config.enc_units, mask_input=l_source_mask_inputs,
        #                                      grad_clipping=config.grad_clipping)
        # l_s_gru_bw = lasagne.layers.GRULayer(l_source, config.enc_units, mask_input=l_source_mask_inputs,
        #                                      grad_clipping=config.grad_clipping)
        # l_source = lasagne.layers.ConcatLayer([l_s_gru_fw, l_s_gru_bw], axis=2)
        # l_source = lasagne.layers.GRULayer(l_source, config.enc_units, mask_input=l_source_mask_inputs,
        #                                    grad_clipping=config.grad_clipping)
        # l_source_last = lasagne.layers.ElemwiseSumLayer(l_source) #lasagne.layers.SliceLayer(l_source, -1, axis=1)

        l_target_outputs = layers.GRUCoverageTrainLayer(
            l_target_inputs,
            config.dec_units,
            mask_input=l_target_mask_inputs,
            grad_clipping=config.grad_clipping,
            source_token_cnt=processor.source_vocab_size,
            target_token_cnt=processor.target_vocab_size,
            l_enc_feat=l_source,
            l_enc_mask=l_source_mask_inputs,
            l_output=l_output,
            W_emb=self.W2,
            unk_index=processor.get_char_index(
                'UNK', False))  #, hid_init=l_source_last)
        l_t = l_target_outputs
        l_target_outputs = lasagne.layers.ReshapeLayer(
            l_target_outputs, (-1, [2]))  # (batch * dec_len, vocab + extra)

        l_gen = layers.GRUCoverageTestLayer(
            config.dec_units,
            grad_clipping=config.grad_clipping,
            source_token_cnt=processor.source_vocab_size,
            target_token_cnt=processor.target_vocab_size,
            l_enc_feat=l_source,
            l_enc_mask=l_source_mask_inputs,
            W_emb=self.W2,
            resetgate=l_t.resetgate,
            updategate=l_t.updategate,
            hidden_update=l_t.hidden_update,  #hid_init=l_source_last,
            unk_index=processor.get_char_index('UNK', False),
            start_index=processor.get_char_index('START', False),
            W_gen=l_t.W_gen,
            gen_len=config.target_len)
        l_att = layers.GRUCoverageAttLayer(
            config.dec_units,
            grad_clipping=config.grad_clipping,
            source_token_cnt=processor.source_vocab_size,
            target_token_cnt=processor.target_vocab_size,
            l_enc_feat=l_source,
            l_enc_mask=l_source_mask_inputs,
            W_emb=self.W2,
            resetgate=l_t.resetgate,
            updategate=l_t.updategate,
            hidden_update=l_t.hidden_update,  #hid_init=l_source_last,
            unk_index=processor.get_char_index('UNK', False),
            start_index=processor.get_char_index('START', False),
            W_gen=l_t.W_gen,
            gen_len=config.target_len)
        self.l = l_target_outputs

        py = lasagne.layers.get_output(l_target_outputs)
        loss = (py * T.extra_ops.to_one_hot(target_outputs.flatten(),
                                            processor.target_vocab_size)).sum(
                                                axis=1)  # (batch * dec_len)
        loss = -(loss * target_mask_inputs.flatten()).mean()

        params = lasagne.layers.get_all_params(self.l, trainable=True)
        updates = lasagne.updates.adam(loss,
                                       params,
                                       learning_rate=config.learning_rate)

        gen_y = lasagne.layers.get_output(l_gen)

        gen_att = lasagne.layers.get_output(l_att)

        self.train_fn = theano.function([
            source_inputs, target_inputs, target_outputs, source_mask_inputs,
            target_mask_inputs
        ],
                                        None,
                                        updates=updates,
                                        on_unused_input='ignore',
                                        mode=NanGuardMode(nan_is_error=True,
                                                          inf_is_error=True,
                                                          big_is_error=True))
        self.loss_fn = theano.function([
            source_inputs, target_inputs, target_outputs, source_mask_inputs,
            target_mask_inputs
        ],
                                       loss,
                                       on_unused_input='ignore',
                                       mode=NanGuardMode(nan_is_error=True,
                                                         inf_is_error=True,
                                                         big_is_error=True))
        self.test_fn = theano.function([source_inputs, source_mask_inputs],
                                       gen_y,
                                       on_unused_input='ignore')
        self.att_fn = theano.function([source_inputs, source_mask_inputs],
                                      gen_att,
                                      on_unused_input='ignore')

        l_samp = layers.GRUCopyPureSampleLayer(
            config.dec_units,
            grad_clipping=config.grad_clipping,
            source_token_cnt=processor.source_vocab_size,
            target_token_cnt=processor.target_vocab_size,
            l_enc_feat=l_source,
            l_enc_mask=l_source_mask_inputs,
            W_emb=self.W2,
            resetgate=l_t.resetgate,
            updategate=l_t.updategate,
            hidden_update=l_t.hidden_update,  #hid_init=l_source_last,
            unk_index=processor.get_char_index('UNK', False),
            start_index=processor.get_char_index('START', False),
            gen_len=config.target_len,
            W_gen=l_t.W_gen,
            MRG_stream=self.MRG_stream)  # (batch, dec_len)
        samp_y = lasagne.layers.get_output(l_samp)
        self.sample_fn = theano.function([source_inputs, source_mask_inputs],
                                         samp_y,
                                         updates=l_samp.updates,
                                         on_unused_input='ignore')

        reward_inputs = T.matrix()  # (batch, dec_len)
        reinforce_loss = (py * T.extra_ops.to_one_hot(
            target_outputs.flatten(), processor.target_vocab_size)).sum(
                axis=1)  # (batch * dec_len)
        reinforce_loss = -(reinforce_loss * target_mask_inputs.flatten() *
                           reward_inputs.flatten()).mean()
        reinforce_updates = lasagne.updates.adam(
            reinforce_loss,
            params,
            learning_rate=config.reinforce_learning_rate)
        self.reinforce_fn = theano.function([
            source_inputs, target_inputs, target_outputs, source_mask_inputs,
            target_mask_inputs, reward_inputs
        ],
                                            None,
                                            updates=reinforce_updates,
                                            on_unused_input='ignore')

        print('params', lasagne.layers.count_params(self.l, trainable=True))
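Example #28 compiles its train and loss functions with NanGuardMode unconditionally, which catches NaNs, Infs, and huge values early but slows every call. Other examples in this collection gate the guard behind a flag; a minimal sketch of that pattern (the helper name and the enabled flag are illustrative):

import theano
from theano.compile.nanguardmode import NanGuardMode

def guarded_mode(enabled):
    # Return a NanGuardMode when debugging; otherwise None, so that
    # theano.function falls back to the default compilation mode.
    if not enabled:
        return None
    return NanGuardMode(nan_is_error=True, inf_is_error=True,
                        big_is_error=True)

# usage: theano.function(inputs, outputs, mode=guarded_mode(debug_flag))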
Example #29
0
def main(num_epochs=500, mode="run", batchsize=96):
    # Debug
    #theano.config.profile=True
    #theano.config.optimizer_profile=True
    #theano.config.warn_float64='warn'

    # Loading all preprocessed data
    global Ws, bs
    Xtr, Ytr, Xva, Yva, imgMean_vals, Ws, bs = data_prep()

    # Sanity check: try to overfit a tiny subset of the data (e.g. 40 instances)
    if mode == "toy":
        batchsize = 10
        np.random.seed(11)  # seed the global RNG (the original built and discarded a RandomState)
        idx = np.random.randint(0, Xtr.shape[0] // 10, batchsize * 4)
        Xtr = Xtr[idx, :, :, :]
        Ytr = Ytr[idx, :]
    """
    COMPILING THEANO FUNCTIONS
    """
    start_time = time.time()
    # Prepare Theano variables for inputs and targets
    input_var = T.ftensor4('inputs')
    target_var = T.imatrix('targets')

    # Center the input images
    imgMean = T.TensorType(dtype='float32',
                           broadcastable=(True, False, False,
                                          False))('imgMean')
    z = (input_var - imgMean)
    center_fn = theano.function([input_var, imgMean],
                                z,
                                mode=NanGuardMode(nan_is_error=True,
                                                  inf_is_error=True,
                                                  big_is_error=True))

    print "\nbuilding model... "
    net0 = build_model(input_var)

    print "\ncompiling functions... "
    '''
    # Build loss function
    prediction = lasagne.layers.get_output(net0)
    loss = lasagne.objectives.categorical_crossentropy(prediction,
                                                       target_var)
    loss = loss.mean(axis=0)

    # Create update expression for training
    # using RMSprop
    params = lasagne.layers.get_all_params(net0, 
                                           trainable=True)
    updates = lasagne.updates.rmsprop(loss, params, 
                                      learning_rate=0.01, rho=0.9, epsilon=1e-06)
    train_fn = theano.function([input_var, target_var], loss,
                               updates=updates,
                               mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
                               )
    '''

    ## Building loss evaluation for validation set
    va_prediction = lasagne.layers.get_output(net0, deterministic=True)
    va_loss = lasagne.objectives.categorical_crossentropy(
        va_prediction, target_var)
    va_loss = va_loss.mean(axis=0)

    va_fn = theano.function(
        [input_var, target_var],
        #mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
        va_loss)

    print("compilation finished in {:.2f}").format(time.time() - start_time)
    """
    TRAINING - HAVEN'T SUBTRACTED THE IMAGE MEAN YET!!!
    """

    print "Starting training with batchsize of %d ..." % (batchsize)
    for epoch in range(num_epochs):
        '''
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in iterate_minibatches(Xtr, Ytr, batchsize, shuffle=True):            
            inputs = center_fn(inputs, imgMean_vals)
            train_err += train_fn(inputs, targets)
            train_batches += 1
        '''
        # And a full pass over the validation data:
        if mode != "toy":
            va_err = 0
            va_batches = 0
            for inputs, targets in iterate_minibatches(Xtr,
                                                       Ytr,
                                                       batchsize,
                                                       shuffle=True):
                inputs = center_fn(inputs, imgMean_vals)
                va_err += va_fn(inputs, targets)
                va_batches += 1

        # Then we print the results for this epoch:
        '''
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}\t{:d}".format(train_err / train_batches, train_batches))
        '''
        if mode != "toy":
            print("  validation loss:\t\t{:.6f}".format(va_err / va_batches))

            # Save the model after every 5 epochs
            '''
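Example #29 relies on an iterate_minibatches helper whose definition is not shown in the snippet. A sketch of the conventional Lasagne-tutorial-style implementation matching the signature used above (an assumption, not the author's code):

import numpy as np

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    # Yield (inputs, targets) slices of size batchsize, optionally shuffled;
    # a trailing partial batch is dropped.
    assert len(inputs) == len(targets)
    indices = np.arange(len(inputs))
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, len(inputs) - batchsize + 1, batchsize):
        excerpt = indices[start:start + batchsize]
        yield inputs[excerpt], targets[excerpt]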
Example #30
0
    def setup_train(self):

        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()
        # dimensions: (batch, time)
        chord_roots = T.imatrix()
        # dimensions: (batch, time)
        relative_posns = [T.imatrix() for _ in self.encodings]
        # dimensions: (batch, time, output_data)
        encoded_melodies = [T.btensor3() for _ in self.encodings]
        # dimensions: (batch, time)
        correct_notes = T.imatrix()
        n_batch, n_time = chord_roots.shape

        def _build(det_dropout):
            all_activations = []
            for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
                    self.encodings, self.enc_lstmstacks, encoded_melodies,
                    relative_posns):
                activations = enc_lstmstack.do_preprocess_scan(
                    timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                    relative_position=relative_pos,
                    cur_chord_type=chord_types,
                    cur_chord_root=chord_roots,
                    cur_input=encoded_melody,
                    deterministic_dropout=det_dropout)
                all_activations.append(activations)
            reduced_activations = functools.reduce((lambda x, y: x + y),
                                                   all_activations)
            queue_loss, feat_strengths, feat_vects, queue_info = self.qman.process(
                reduced_activations, extra_info=True)
            features = QueueManager.queue_transform(feat_strengths, feat_vects)

            all_out_probs = []
            for encoding, dec_lstmstack, encoded_melody, relative_pos in zip(
                    self.encodings, self.dec_lstmstacks, encoded_melodies,
                    relative_posns):
                activations = dec_lstmstack.do_preprocess_scan(
                    timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                    relative_position=relative_pos,
                    cur_chord_type=chord_types,
                    cur_chord_root=chord_roots,
                    cur_feature=features,
                    last_output=T.concatenate([
                        T.tile(encoding.initial_encoded_form(),
                               (n_batch, 1, 1)), encoded_melody[:, :-1, :]
                    ], 1),
                    deterministic_dropout=det_dropout)
                out_probs = encoding.decode_to_probs(activations, relative_pos,
                                                     self.bounds.lowbound,
                                                     self.bounds.highbound)
                all_out_probs.append(out_probs)

            reduced_out_probs = functools.reduce((lambda x, y: x * y),
                                                 all_out_probs)
            normsum = T.sum(reduced_out_probs, 2, keepdims=True)
            normsum = T.maximum(normsum, constants.EPSILON)
            norm_out_probs = reduced_out_probs / normsum
            reconstruction_loss, reconstruction_info = Encoding.compute_loss(
                norm_out_probs, correct_notes, extra_info=True)

            queue_surrogate_loss_parts = self.qman.surrogate_loss(
                reconstruction_loss, queue_info)

            updates = []
            full_info = queue_info.copy()
            full_info.update(reconstruction_info)
            full_info["queue_loss"] = queue_loss
            full_info["reconstruction_loss"] = reconstruction_loss

            float_n_batch = T.cast(n_batch, 'float32')
            if self.loss_mode == "add":  # '==', not 'is': identity comparison with string literals is unreliable
                full_loss = queue_loss + reconstruction_loss
            elif self.loss_mode is "priority":
                curviness = np.array(self.loss_mode_params[0],
                                     np.float32) * float_n_batch
                # ln( e^x + e^y - 1 )
                # ln( C(e^x + e^y - 1) ) - ln(C)
                # ln( e^c(e^x + e^y - 1) ) - c
                # ln( e^(x+c) + e^(y+c) - e^c ) - c
                # ln( e^(x-c) + e^(y-c) - e^(-c) ) + c
                # Now let c = maximum(x,y), d = minimum(x,y). WLOG replace x=c, y=d
                # ln( e^(c-c) + e^(d-c) - e^(-c) ) + c
                # ln( 1 + e^(d-c) - e^(-c) ) + c
                x = reconstruction_loss / curviness
                y = queue_loss / curviness
                c = T.maximum(x, y)
                d = T.minimum(x, y)
                full_loss = (T.log(1 + T.exp(d - c) - T.exp(-c)) +
                             c) * curviness
            elif self.loss_mode is "cutoff":
                cutoff_val = np.array(self.loss_mode_params[0], np.float32)
                full_loss = T.switch(
                    reconstruction_loss < cutoff_val * float_n_batch,
                    reconstruction_loss + queue_loss, reconstruction_loss)
            elif self.loss_mode is "trigger":
                trigger_val = np.array(self.loss_mode_params[0], np.float32)
                trigger_speed = np.array(1.0 / self.loss_mode_params[1],
                                         np.float32)
                trigger_is_on = theano.shared(np.array(0, np.int8))
                trigger_scale = theano.shared(np.array(0.0, np.float32))
                full_loss = reconstruction_loss + trigger_scale * queue_loss
                updates.append(
                    (trigger_is_on,
                     T.or_(trigger_is_on,
                           reconstruction_loss < trigger_val * float_n_batch)))
                updates.append((trigger_scale,
                                T.switch(
                                    trigger_is_on,
                                    T.minimum(trigger_scale + trigger_speed,
                                              np.array(1.0, np.float32)),
                                    np.array(0.0, np.float32))))
                full_info["trigger_scale"] = trigger_scale

            if queue_surrogate_loss_parts is not None:
                surrogate_loss, addtl_updates = queue_surrogate_loss_parts
                full_loss = full_loss + surrogate_loss
                updates.extend(addtl_updates)
                full_info["surrogate_loss"] = surrogate_loss

            return full_loss, full_info, updates

        train_loss, train_info, train_updates = _build(False)
        if self.train_decoder_only:
            params = list(
                itertools.chain(*(lstmstack.params
                                  for lstmstack in self.dec_lstmstacks)))
        else:
            params = self.params
        adam_updates = Adam(train_loss, params, lr=self.learning_rate_var)

        eval_loss, eval_info, _ = _build(True)

        self.loss_info_keys = list(train_info.keys())

        self.update_fun = theano.function(
            inputs=[chord_types, chord_roots, correct_notes] + relative_posns +
            encoded_melodies,
            outputs=[train_loss] + list(train_info.values()),
            updates=train_updates + adam_updates,
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True,
                               inf_is_error=True,
                               big_is_error=True) if self.nanguard else None))

        self.eval_fun = theano.function(
            inputs=[chord_types, chord_roots, correct_notes] + relative_posns +
            encoded_melodies,
            outputs=[eval_loss] + list(eval_info.values()),
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True,
                               inf_is_error=True,
                               big_is_error=True) if self.nanguard else None))
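The comment block in the "priority" branch above derives a numerically stable form of ln(e^x + e^y - 1). A small NumPy check of that algebra, a sketch under the same definitions rather than part of the model code:

import numpy as np

def priority_loss(recon, queue, curviness):
    # Stable evaluation of curviness * ln(e^(recon/c) + e^(queue/c) - 1),
    # mirroring the "priority" branch above.
    x = recon / curviness
    y = queue / curviness
    c = np.maximum(x, y)
    d = np.minimum(x, y)
    return (np.log(1.0 + np.exp(d - c) - np.exp(-c)) + c) * curviness

# For moderate inputs this matches the naive form:
r, q, cv = 2.0, 3.0, 1.5
naive = cv * np.log(np.exp(r / cv) + np.exp(q / cv) - 1.0)
assert np.isclose(priority_loss(r, q, cv), naive)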