Code Example #1
if activate_cuda:
    encoder.cuda()
    decoder_A2B.cuda()
    decoder_B2A.cuda()
    netD_A.cuda()
    netD_B.cuda()

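# Resume from saved checkpoints when they exist; otherwise initialise all networks randomly.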
if os.path.isfile('output modified/encoder.pth'):
    encoder.load_state_dict(torch.load('output modified/encoder.pth'))
    decoder_A2B.load_state_dict(torch.load('output modified/decoder_A2B.pth'))
    decoder_B2A.load_state_dict(torch.load('output modified/decoder_B2A.pth'))
    netD_A.load_state_dict(torch.load('output modified/netD_A.pth'))
    netD_B.load_state_dict(torch.load('output modified/netD_B.pth'))
else:
    encoder.apply(weights_init_normal)
    decoder_A2B.apply(weights_init_normal)
    decoder_B2A.apply(weights_init_normal)
    netD_A.apply(weights_init_normal)
    netD_B.apply(weights_init_normal)

# Losses
criterion_GAN = torch.nn.MSELoss()
criterion_cycle = torch.nn.L1Loss()
criterion_identity = torch.nn.L1Loss()

# Optimizers & LR schedulers
optimizer_G = torch.optim.Adam(itertools.chain(encoder.parameters(),
                                               decoder_A2B.parameters(),
                                               decoder_B2A.parameters()),
                               lr=lr,
                               betas=(0.5, 0.999))
Code Example #2
class EncoderDecoder(object):
    def __init__(self, rng, **kwargs):
        self.n_in_src = kwargs.pop('nembed_src')
        self.n_in_trg = kwargs.pop('nembed_trg')
        self.n_hids_src = kwargs.pop('nhids_src')
        self.n_hids_trg = kwargs.pop('nhids_trg')
        self.src_vocab_size = kwargs.pop('src_vocab_size')
        self.trg_vocab_size = kwargs.pop('trg_vocab_size')
        self.method = kwargs.pop('method')
        self.dropout = kwargs.pop('dropout')
        self.maxout_part = kwargs.pop('maxout_part')
        self.path = kwargs.pop('saveto')
        self.clip_c = kwargs.pop('clip_c')
        self.rng = rng
        self.trng = RandomStreams(rng.randint(1e5))

        # added by Zhaopeng Tu, 2016-06-09
        self.with_attention = kwargs.pop('with_attention')

        # added by Zhaopeng Tu, 2016-04-29
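        # Coverage model (cf. Tu et al., 2016): accumulates attention over source positions to
        # discourage over- and under-translation; the 'linguistic' variant keeps a scalar
        # coverage per word, normalised by a predicted fertility capped at max_fertility.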
        self.with_coverage = kwargs.pop('with_coverage')
        self.coverage_dim = kwargs.pop('coverage_dim')
        self.coverage_type = kwargs.pop('coverage_type')
        self.max_fertility = kwargs.pop('max_fertility')
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        # added by Zhaopeng Tu, 2016-05-30
        self.with_context_gate = kwargs.pop('with_context_gate')

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.rng,
                                     self.src_vocab_size,
                                     self.n_in_src,
                                     name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.rng,
                                            self.n_in_src,
                                            self.n_hids_src,
                                            self.table_src,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        # added by Longyue
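        # Cross-sentence context: encoder_hist_1 runs a word-level RNN over each history
        # sentence and encoder_hist_2 a sentence-level RNN over the resulting sentence
        # vectors; the final state serves as extra context for the encoder and decoder.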
        self.encoder_hist_1 = Encoder(self.rng,
                                      self.n_in_src,
                                      self.n_hids_src,
                                      self.table_src,
                                      name='rnn_encoder_hist_1')
        self.layers.append(self.encoder_hist_1)
        self.encoder_hist_2 = Encoder(self.rng,
                                      self.n_hids_src,
                                      self.n_hids_src,
                                      self.table_src,
                                      name='rnn_encoder_hist_2')
        self.layers.append(self.encoder_hist_2)

        self.table_trg = LookupTable(self.rng,
                                     self.trg_vocab_size,
                                     self.n_in_trg,
                                     name='table_trg')
        self.layers.append(self.table_trg)

        self.decoder = Decoder(self.rng, self.n_in_trg, self.n_hids_trg,
                               2 * self.n_hids_src, self.n_hids_src,
                               # added by Zhaopeng Tu, 2016-06-09
                               with_attention=self.with_attention,
                               # added by Zhaopeng Tu, 2016-04-29
                               with_coverage=self.with_coverage,
                               coverage_dim=self.coverage_dim,
                               coverage_type=self.coverage_type,
                               max_fertility=self.max_fertility,
                               # added by Zhaopeng Tu, 2016-05-30
                               with_context_gate=self.with_context_gate,
                               maxout_part=self.maxout_part,
                               name='rnn_decoder')
        self.layers.append(self.decoder)
        self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        self.with_reconstruction = kwargs.pop('with_reconstruction')
        if self.with_reconstruction:
            # added by Zhaopeng Tu, 2016-07-27
            self.reconstruction_weight = kwargs.pop('reconstruction_weight')
            # note the source and target sides are reversed
            self.inverse_decoder = InverseDecoder(self.rng, self.n_in_src,
                                                  2 * self.n_hids_src, self.n_hids_trg,
                                                  # added by Zhaopeng Tu, 2016-06-09
                                                  with_attention=self.with_attention,
                                                  maxout_part=self.maxout_part,
                                                  name='rnn_inverse_decoder')
            self.layers.append(self.inverse_decoder)

            self.srng = RandomStreams(rng.randint(1e5))
            self.inverse_logistic_layer = LogisticRegression(
                self.rng,
                self.n_in_src,
                self.src_vocab_size,
                name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer(self, src, src_mask, src_hist, src_hist_mask, trg,
                      trg_mask, ite):

        # added by Longyue
        # checked by Zhaopeng: sentence dim = n_steps, hist_len, batch_size (4, 3, 25)
        # hist = (batch_size, sent_num, sent_len) --.T-->
        # hist = (sent_len, sent_num, batch_size) --lookup table-->
        # (sent_len, sent_num, batch_size, word_emb) --reshape-->
        # (sent_len, sent_num*batch_size, word_emb) --word-level rnn-->
        # (sent_len, sent_num*batch_size, hidden_size) --reshape-->
        # (sent_len, sent_num, batch_size, hidden_size) --[-1]-->
        # (sent_num, batch_size, hidden_size) --sent-level rnn-->
        # (sent_num, batch_size, hidden_size) --[-1]-->
        # (batch_size, hidden_size) = cross-sent context vector

        annotations_1 = self.encoder_hist_1.apply_1(src_hist, src_hist_mask)
        annotations_1 = annotations_1[-1]  # get last hidden states
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]  # get last hidden states

        #modified by Longyue
        annotations = self.encoder.apply(src, src_mask, annotations_3)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations *
                        src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

        #added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
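        # trg_emb_shifted implements teacher forcing: position t is predicted from
        # embeddings of positions < t, and the first step sees all zeros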
        # modified by Longyue
        hiddens, readout, alignment = self.decoder.run_pipeline(
            state_below=trg_emb_shifted,
            mask_below=trg_mask,
            init_context=init_context,
            c=annotations,
            c_mask=src_mask,
            hist=annotations_3)

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likelihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
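        # the inverse decoder regenerates the source from the decoder hidden states; its
        # cross-entropy is added to the main cost, weighted by reconstruction_weight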
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]
                                    ).sum(0) / trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        # modified by Longyue
        inps = [src, src_mask, src_hist, src_hist_mask, trg, trg_mask]

        self.train_fn = theano.function(inps, [train_cost],
                                        updates=updates,
                                        name='train_function')
        # self.train_fn = theano.function(inps, [train_cost], updates=updates, name='train_function', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))

    def build_sampler(self):
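        # Decoding-time graph: compiles one function that encodes the source once
        # (compile_init_and_context) and one that advances the decoder a single step
        # (compile_next_state_and_probs), so an external sampler or beam search can loop over it.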

        # added by Longyue
        x_hist = T.ltensor3()
        x_hist_mask = T.tensor3()
        annotations_1 = self.encoder_hist_1.apply_1(x_hist, x_hist_mask)
        annotations_1 = annotations_1[-1]
        annotations_2 = self.encoder_hist_2.apply_2(annotations_1)
        annotations_3 = annotations_2[-1]

        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None, annotations_3)
        #init_context = ctx[0, :, -self.n_hids_src:]
        # mean pooling
        init_context = c.mean(0)

        # added by Longyue
        init_context = concatenate([init_context, annotations_3],
                                   axis=annotations_3.ndim - 1)

        init_state = self.decoder.create_init_state(init_context)

        outs = [init_state, c, annotations_3]
        if not self.with_attention:
            outs.append(init_context)

        # compile function
        print 'Building compile_init_state_and_context function ...'
        self.compile_init_and_context = theano.function(
            [x, x_hist, x_hist_mask], outs, name='compile_init_and_context')
        print 'Done'

        y = T.lvector()
        cur_state = T.matrix()
        # if it is the first word, emb should be all zero, and it is indicated by -1
        trg_emb = T.switch(y[:, None] < 0, T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        # added by Zhaopeng Tu, 2016-06-09
        # for with_attention=False
        if self.with_attention and self.with_coverage:
            cov_before = T.tensor3()
            if self.coverage_type == 'linguistic':
                print 'Building compile_fertility ...'
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print 'Done'
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        # modified by Zhaopeng Tu, 2016-04-29
        # [next_state, ctxs] = self.decoder.apply(state_below=trg_emb,
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            # added by Zhaopeng Tu, 2016-06-09
            init_context=None if self.with_attention else init_context,
            c=c if self.with_attention else None,
            hist=annotations_3,  # added by Longyue
            one_step=True,
            # added by Zhaopeng Tu, 2016-04-27
            cov_before=cov_before,
            fertility=fertility)
        next_state = results[0]
        if self.with_attention:
            ctxs, alignment = results[1], results[2]
            if self.with_coverage:
                cov = results[3]
        else:
            # if with_attention=False, we always use init_context as the source representation
            ctxs = init_context

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print 'Building compile_next_state_and_probs function ...'
        inps = [y, cur_state]
        if self.with_attention:
            inps.append(c)
        else:
            inps.append(init_context)

        # added by Longyue
        inps.append(annotations_3)

        outs = [next_probs, next_state, next_sample]
        # added by Zhaopeng Tu, 2016-06-09
        if self.with_attention:
            outs.append(alignment)
            # added by Zhaopeng Tu, 2016-04-29
            if self.with_coverage:
                inps.append(cov_before)
                if self.coverage_type == 'linguistic':
                    inps.append(fertility)
                outs.append(cov)

        self.compile_next_state_and_probs = theano.function(
            inps, outs, name='compile_next_state_and_probs')
        print 'Done'

        # added by Zhaopeng Tu, 2016-07-18
        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]
            if not self.with_attention:
                outs.append(inverse_init_context)

            # compile function
            print 'Building compile_inverse_init_state_and_context function ...'
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print 'Done'

            src = T.lvector()
            inverse_cur_state = T.matrix()
            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is indicated by -1
            src_emb = T.switch(src[:, None] < 0, T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            # modified by Zhaopeng Tu, 2016-04-29
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                # added by Zhaopeng Tu, 2016-06-09
                init_context=None
                if self.with_attention else inverse_init_context,
                c=inverse_c if self.with_attention else None,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state = inverse_results[0]
            if self.with_attention:
                inverse_ctxs, inverse_alignment = inverse_results[
                    1], inverse_results[2]
            else:
                # if with_attention=False, we always use init_context as the source representation
                inverse_ctxs = init_context

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            inverse_next_probs = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print 'Building compile_inverse_next_state_and_probs function ...'
            inps = [src, trg_mask, inverse_cur_state]
            if self.with_attention:
                inps.append(inverse_c)
            else:
                inps.append(inverse_init_context)
            outs = [
                inverse_next_probs, inverse_next_state, inverse_next_sample
            ]
            # added by Zhaopeng Tu, 2016-06-09
            if self.with_attention:
                outs.append(inverse_alignment)

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print 'Done'

    def save(self, path=None):
        if path is None:
            path = self.path
        filenpz = open(path, "w")
        val = dict([(value.name, value.get_value())
                    for index, value in enumerate(self.params)])
        logger.info("save the model {}".format(path))
        numpy.savez(path, **val)
        filenpz.close()

    def load(self, path=None):
        if path is None:
            path = self.path
        if os.path.isfile(path):
            logger.info("load params {}".format(path))
            val = numpy.load(path)
            for index, param in enumerate(self.params):
                logger.info('Loading {} with shape {}'.format(
                    param.name,
                    param.get_value(borrow=True).shape))
                if param.name not in val.keys():
                    logger.info('Adding new param {} with shape {}'.format(
                        param.name,
                        param.get_value(borrow=True).shape))
                    continue
                if param.get_value().shape != val[param.name].shape:
                    logger.info("Error: model param != load param shape {} != {}".format(\
                                        param.get_value().shape, val[param.name].shape))
                    raise Exception("loading params shape mismatch")
                else:
                    param.set_value(val[param.name], borrow=True)
        else:
            logger.error("file {} does not exist".format(path))
            self.save()
Code Example #3
class EncoderDecoder(object):
    def __init__(self, rng, **kwargs):
        self.n_in_src = kwargs.get('nembed_src')
        self.n_in_trg = kwargs.get('nembed_trg')
        self.n_hids_src = kwargs.get('nhids_src')
        self.n_hids_trg = kwargs.get('nhids_trg')
        self.src_vocab_size = kwargs.get('src_vocab_size')
        self.trg_vocab_size = kwargs.get('trg_vocab_size')
        self.method = kwargs.get('method')
        self.dropout = kwargs.get('dropout')
        self.maxout_part = kwargs.get('maxout_part')
        self.path = kwargs.get('saveto')
        self.clip_c = kwargs.get('clip_c')
        self.rng = rng
        self.trng = RandomStreams(rng.randint(1e5))

        # added by Zhaopeng Tu, 2016-04-29
        self.with_coverage = kwargs.get('with_coverage')
        self.coverage_dim = kwargs.get('coverage_dim')
        self.coverage_type = kwargs.get('coverage_type')
        self.max_fertility = kwargs.get('max_fertility')
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        # added by Zhaopeng Tu, 2016-05-30
        self.with_context_gate = kwargs.get('with_context_gate')

        # added by Zhaopeng Tu, 2017-11-29
        self.with_layernorm = kwargs.get('with_layernorm', False)

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.rng,
                                     self.src_vocab_size,
                                     self.n_in_src,
                                     name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.rng,
                                            self.n_in_src,
                                            self.n_hids_src,
                                            self.table_src,
                                            name='birnn_encoder')
        self.layers.append(self.encoder)

        self.table_trg = LookupTable(self.rng,
                                     self.trg_vocab_size,
                                     self.n_in_trg,
                                     name='table_trg')
        self.layers.append(self.table_trg)

        self.decoder = Decoder(self.rng, self.n_in_trg, self.n_hids_trg,
                               2 * self.n_hids_src,
                               maxout_part=self.maxout_part, name='rnn_decoder',
                               # added by Zhaopeng Tu, 2016-04-29
                               with_coverage=self.with_coverage,
                               coverage_dim=self.coverage_dim,
                               coverage_type=self.coverage_type,
                               max_fertility=self.max_fertility,
                               # added by Zhaopeng Tu, 2016-05-30
                               with_context_gate=self.with_context_gate,
                               with_layernorm=self.with_layernorm)
        self.layers.append(self.decoder)

        self.logistic_layer = LogisticRegression(self.rng, self.n_in_trg,
                                                 self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        self.with_reconstruction = kwargs.get('with_reconstruction')
        if self.with_reconstruction:
            # added by Zhaopeng Tu, 2016-07-27
            self.reconstruction_weight = kwargs.get('reconstruction_weight')
            # note the source and target sides are reversed
            self.inverse_decoder = Decoder(self.rng, self.n_in_src,
                                           2 * self.n_hids_src, self.n_hids_trg,
                                           maxout_part=self.maxout_part,
                                           name='rnn_inverse_decoder',
                                           with_layernorm=self.with_layernorm)
            self.layers.append(self.inverse_decoder)

            self.srng = RandomStreams(rng.randint(1e5))
            self.inverse_logistic_layer = LogisticRegression(
                self.rng,
                self.n_in_src,
                self.src_vocab_size,
                name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer(self, src, src_mask, trg, trg_mask):
        annotations = self.encoder.apply(src, src_mask)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = (annotations *
                        src_mask[:, :, None]).sum(0) / src_mask.sum(0)[:, None]

        trg_emb = self.table_trg.apply(trg)
        trg_emb_shifted = T.zeros_like(trg_emb)
        trg_emb_shifted = T.set_subtensor(trg_emb_shifted[1:], trg_emb[:-1])
        results = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                            mask_below=trg_mask,
                                            init_context=init_context,
                                            c=annotations,
                                            c_mask=src_mask)

        hiddens, ctxs, readout, alignment = results[:4]

        # apply dropout
        if self.dropout < 1.0:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(self.trng, readout, 1, self.dropout)

        p_y_given_x = self.logistic_layer.get_probs(readout)

        self.cost = self.logistic_layer.cost(p_y_given_x, trg,
                                             trg_mask) / trg.shape[1]

        # self.cost = theano.printing.Print('likelihood cost:')(self.cost)

        # added by Zhaopeng Tu, 2016-07-12
        # for reconstruction
        if self.with_reconstruction:
            # now hiddens is the annotations
            inverse_init_context = (hiddens * trg_mask[:, :, None]
                                    ).sum(0) / trg_mask.sum(0)[:, None]

            src_emb = self.table_src.apply(src)
            src_emb_shifted = T.zeros_like(src_emb)
            src_emb_shifted = T.set_subtensor(src_emb_shifted[1:],
                                              src_emb[:-1])
            inverse_results = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask)

            inverse_hiddens, inverse_ctxs, inverse_readout, inverse_alignment = inverse_results[:4]

            # apply dropout
            if self.dropout < 1.0:
                # logger.info('Apply dropout with p = {}'.format(self.dropout))
                inverse_readout = Dropout(self.srng, inverse_readout, 1,
                                          self.dropout)

            p_x_given_y = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            self.reconstruction_cost = self.inverse_logistic_layer.cost(
                p_x_given_y, src, src_mask) / src.shape[1]

            # self.reconstruction_cost = theano.printing.Print('reconstructed cost:')(self.reconstruction_cost)
            self.cost += self.reconstruction_cost * self.reconstruction_weight

        self.L1 = sum(T.sum(abs(param)) for param in self.params)
        self.L2 = sum(T.sum(param**2) for param in self.params)

        params_regular = self.L1 * 1e-6 + self.L2 * 1e-6
        # params_regular = theano.printing.Print('params_regular:')(params_regular)

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = T.grad(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # train function
        inps = [src, src_mask, trg, trg_mask]
        outs = [train_cost]

        if self.with_layernorm:
            inps = [src, src_mask, trg, trg_mask]
            lr = T.scalar(name='lr')
            print 'Building optimizers...',
            self.train_fn, self.update_fn = adam(lr, self.params, grads, inps,
                                                 outs)
        else:
            # updates
            updates = adadelta(self.params, grads)

            # mode=theano.Mode(linker='vm') for ifelse
            # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch.
            self.train_fn = theano.function(inps,
                                            outs,
                                            updates=updates,
                                            name='train_function',
                                            mode=theano.Mode(linker='vm'))
            # self.train_fn = theano.function(inps, outs, updates=updates, name='train_function', mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True))

    def build_sampler(self):

        x = T.lmatrix()

        # Build Networks
        # src_mask is None
        c = self.encoder.apply(x, None)
        #init_context = ctx[0, :, -self.n_hids_src:]
        # mean pooling
        init_context = c.mean(0)

        init_state = self.decoder.create_init_state(init_context)

        # compile function
        print 'Building compile_init_state_and_context function ...'
        self.compile_init_and_context = theano.function(
            [x], [init_state, c], name='compile_init_and_context')
        print 'Done'

        y = T.lvector()
        cur_state = T.matrix()

        # if it is the first word, emb should be all zero, and it is indicated by -1
        trg_emb = T.switch(y[:, None] < 0, T.alloc(0., 1, self.n_in_trg),
                           self.table_trg.apply(y))

        # added by Zhaopeng Tu, 2016-06-09
        if self.with_coverage:
            cov_before = T.tensor3()
            if self.coverage_type == 'linguistic':
                print 'Building compile_fertility ...'
                fertility = self.decoder._get_fertility(c)
                fertility = T.addbroadcast(fertility, 1)
                self.compile_fertility = theano.function(
                    [c], [fertility], name='compile_fertility')
                print 'Done'
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        # modified by Zhaopeng Tu, 2016-04-29
        results = self.decoder.apply(
            state_below=trg_emb,
            init_state=cur_state,
            c=c,
            one_step=True,
            # added by Zhaopeng Tu, 2016-04-27
            cov_before=cov_before,
            fertility=fertility)
        next_state, ctxs, alignment = results[:3]
        idx = 3
        if self.with_coverage:
            cov = results[idx]
            idx += 1

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # apply dropout
        if self.dropout < 1.0:
            readout = Dropout(self.trng, readout, 0, self.dropout)

        # compute the softmax probability
        next_probs = self.logistic_layer.get_probs(readout)

        # sample from softmax distribution to get the sample
        next_sample = self.trng.multinomial(pvals=next_probs).argmax(1)

        # compile function
        print 'Building compile_next_state_and_probs function ...'
        inps = [y, cur_state, c]
        outs = [next_probs, next_state, next_sample, alignment]

        # added by Zhaopeng Tu, 2016-04-29
        if self.with_coverage:
            inps.append(cov_before)
            if self.coverage_type == 'linguistic':
                inps.append(fertility)
            outs.append(cov)

        # mode=theano.Mode(linker='vm') for ifelse
        # Unless linker='vm' or linker='cvm' are used, ifelse will compute both variables and take the same computation time as switch.
        self.compile_next_state_and_probs = theano.function(
            inps,
            outs,
            name='compile_next_state_and_probs',
            mode=theano.Mode(linker='vm'))
        print 'Done'

        # added by Zhaopeng Tu, 2016-07-18
        # for reconstruction
        if self.with_reconstruction:
            # Build Networks
            # trg_mask is None
            inverse_c = T.tensor3()
            # mean pooling
            inverse_init_context = inverse_c.mean(0)

            inverse_init_state = self.inverse_decoder.create_init_state(
                inverse_init_context)

            outs = [inverse_init_state]

            # compile function
            print 'Building compile_inverse_init_state_and_context function ...'
            self.compile_inverse_init_and_context = theano.function(
                [inverse_c], outs, name='compile_inverse_init_and_context')
            print 'Done'

            src = T.lvector()
            inverse_cur_state = T.matrix()

            trg_mask = T.matrix()
            # if it is the first word, emb should be all zero, and it is indicated by -1
            src_emb = T.switch(src[:, None] < 0, T.alloc(0., 1, self.n_in_src),
                               self.table_src.apply(src))

            # apply one step
            # modified by Zhaopeng Tu, 2016-04-29
            inverse_results = self.inverse_decoder.apply(
                state_below=src_emb,
                init_state=inverse_cur_state,
                c=inverse_c,
                c_mask=trg_mask,
                one_step=True)
            inverse_next_state, inverse_ctxs, inverse_alignment = inverse_results[:3]

            inverse_readout = self.inverse_decoder.readout(
                inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(
                    inverse_readout)

            # apply dropout
            if self.dropout < 1.0:
                inverse_readout = Dropout(self.srng, inverse_readout, 0,
                                          self.dropout)

            # compute the softmax probability
            inverse_next_probs = self.inverse_logistic_layer.get_probs(
                inverse_readout)

            # sample from softmax distribution to get the sample
            inverse_next_sample = self.srng.multinomial(
                pvals=inverse_next_probs).argmax(1)

            # compile function
            print 'Building compile_inverse_next_state_and_probs function ...'
            inps = [src, trg_mask, inverse_cur_state, inverse_c]
            outs = [
                inverse_next_probs, inverse_next_state, inverse_next_sample,
                inverse_alignment
            ]

            self.compile_inverse_next_state_and_probs = theano.function(
                inps, outs, name='compile_inverse_next_state_and_probs')
            print 'Done'

    def save(self, path=None):
        if path is None:
            path = self.path
        filenpz = open(path, "w")
        val = dict([(value.name, value.get_value())
                    for index, value in enumerate(self.params)])
        logger.info("save the model {}".format(path))
        numpy.savez(path, **val)
        filenpz.close()

    def load(self, path=None):
        if path is None:
            path = self.path
        if os.path.isfile(path):
            logger.info("load params {}".format(path))
            val = numpy.load(path)
            for index, param in enumerate(self.params):
                logger.info('Loading {} with shape {}'.format(
                    param.name,
                    param.get_value(borrow=True).shape))
                if param.name not in val.keys():
                    logger.info('Adding new param {} with shape {}'.format(
                        param.name,
                        param.get_value(borrow=True).shape))
                    continue
                if param.get_value().shape != val[param.name].shape:
                    logger.info("Error: model param != load param shape {} != {}".format(\
                                        param.get_value().shape, val[param.name].shape))
                    raise Exception("loading params shape mismatch")
                else:
                    param.set_value(val[param.name], borrow=True)
        else:
            logger.error("file {} does not exist".format(path))
            self.save()
Code Example #4
class EncoderDecoder(object):
    def __init__(self, **kwargs):
        self.n_in_src = kwargs.pop('nembed_src')
        self.n_in_trg = kwargs.pop('nembed_trg')
        self.n_hids_src = kwargs.pop('nhids_src')
        self.n_hids_trg = kwargs.pop('nhids_trg')
        self.src_vocab_size = kwargs.pop('src_vocab_size')
        self.trg_vocab_size = kwargs.pop('trg_vocab_size')
        self.method = kwargs.pop('method')
        self.dropout = kwargs.pop('dropout')
        self.maxout_part = kwargs.pop('maxout_part')
        self.path = kwargs.pop('saveto')
        self.clip_c = kwargs.pop('clip_c')
        self.mkl = kwargs.pop('mkl')
        self.with_attention = kwargs.pop('with_attention')

        self.with_coverage = kwargs.pop('with_coverage')
        self.coverage_dim = kwargs.pop('coverage_dim')
        self.coverage_type = kwargs.pop('coverage_type')
        self.max_fertility = kwargs.pop('max_fertility')
        if self.coverage_type == 'linguistic':
            # make sure the dimension of linguistic coverage is always 1
            self.coverage_dim = 1

        self.with_context_gate = kwargs.pop('with_context_gate')

        self.params = []
        self.layers = []

        self.table_src = LookupTable(self.src_vocab_size, self.n_in_src, name='table_src')
        self.layers.append(self.table_src)

        self.encoder = BidirectionalEncoder(self.n_in_src, self.n_hids_src, self.table_src, self.mkl, name='birnn_encoder')
        self.layers.append(self.encoder)

        self.table_trg = LookupTable(self.trg_vocab_size, self.n_in_trg, name='table_trg')
        self.layers.append(self.table_trg)

        self.decoder = Decoder(self.mkl,
                               self.n_in_trg,
                               self.n_hids_trg,
                               2 * self.n_hids_src,
                               with_attention=self.with_attention,
                               with_coverage=self.with_coverage,
                               coverage_dim=self.coverage_dim,
                               coverage_type=self.coverage_type,
                               max_fertility=self.max_fertility,
                               with_context_gate=self.with_context_gate,
                               maxout_part=self.maxout_part,
                               name='rnn_decoder')

        self.layers.append(self.decoder)
        self.logistic_layer = LogisticRegression(self.n_in_trg, self.trg_vocab_size)
        self.layers.append(self.logistic_layer)

        # for reconstruction
        self.with_reconstruction = kwargs.pop('with_reconstruction')

        self.reconstruction_weight = kwargs.pop('reconstruction_weight')

        if self.with_reconstruction:
            # note the source and target sides are reversed
            self.inverse_decoder = InverseDecoder(self.n_in_src, 2 * self.n_hids_src, self.n_hids_trg,
                                                  with_attention=self.with_attention,
                                                  maxout_part=self.maxout_part, name='rnn_inverse_decoder')

            self.layers.append(self.inverse_decoder)

            self.inverse_logistic_layer = LogisticRegression(self.n_in_src, self.src_vocab_size, name='inverse_LR')
            self.layers.append(self.inverse_logistic_layer)

        for layer in self.layers:
            self.params.extend(layer.params)

    def build_trainer_with_data_parallel(self, src, src_mask, trg, trg_mask, ite, devices,
                                         l1_reg_weight=1e-6,
                                         l2_reg_weight=1e-6,
                                         softmax_output_num_sampled=100000):

        assert K._BACKEND == 'tensorflow'

        src_mask_3d = [K.expand_dims(mask) for mask in src_mask]
        trg_mask_3d = [K.expand_dims(mask) for mask in trg_mask]

        num_devices = len(devices)

        loss_list = []
        grads_list = []

        # TODO: group the devices by hosts, first calculate the averaged gradients for each host
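        # Data parallelism: each device computes the loss and gradients for its own input
        # shard; the per-device gradients are averaged below before a single adadelta update.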
        for i, device in enumerate(devices):
            with tf.device(device):
                loss = self.calc_loss(src[i],
                                      src_mask_3d[i],
                                      trg[i],
                                      trg_mask_3d[i],
                                      l1_reg_weight=l1_reg_weight,
                                      l2_reg_weight=l2_reg_weight,
                                      softmax_output_num_sampled=softmax_output_num_sampled)

                loss_list.append(loss)
                grads = K.gradients(loss, self.params)
                grads_list.append(grads)

        avg_loss = sum(loss_list) / num_devices
        # use a customized version of gradients to enable colocate_gradients_with_ops,
        # ensuring each gradient is computed on the same device that did the forward computation
        grads = avg_grads(grads_list)
        grads = grad_clip(grads, self.clip_c)
        updates = adadelta(self.params, grads)

        inps = src + src_mask + trg + trg_mask

        self.train_fn = K.function(inps,
                                   [avg_loss] + loss_list,
                                   updates=updates)

    def calc_loss(self, src, src_mask_3d, trg, trg_mask_3d,
                  l1_reg_weight=1e-6,
                  l2_reg_weight=1e-6,
                  softmax_output_num_sampled=100000):

        annotations = self.encoder.apply(src, src_mask_3d)
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                                               [1, 0, 2])

        hiddens, readout, alignment = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                                mask_below=trg_mask_3d,
                                                                init_context=init_context,
                                                                c=annotations,
                                                                c_mask=src_mask_3d)

        # apply dropout
        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

        cost = calc_loss_from_readout(readout=readout,
                                      targets=trg,
                                      targets_mask=trg_mask_3d,
                                      logisticRegressionLayer=self.logistic_layer,
                                      softmax_output_num_sampled=softmax_output_num_sampled)

        if self.with_reconstruction:
            inverse_init_context = K.sum(hiddens * trg_mask_3d, axis=0) / K.sum(trg_mask_3d, axis=0)
            src_emb = self.table_src.apply(src)
            src_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(src_emb, [1, 0, 2])),
                                                   [1, 0, 2])

            inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                state_below=src_emb_shifted,
                mask_below=src_mask_3d,
                init_context=inverse_init_context,
                c=hiddens,
                c_mask=trg_mask_3d)

            if self.dropout > 0.:
                inverse_readout = Dropout(inverse_readout, self.dropout)

            inverse_logits = self.inverse_logistic_layer.get_logits(inverse_readout)
            inverse_logits_flat = K.reshape(inverse_logits, shape=(-1, self.inverse_logistic_layer.n_out))
            reconstruction_cost = get_category_cross_entropy_from_flat_logits(inverse_logits_flat, src, src_mask_3d)

            cost += reconstruction_cost * self.reconstruction_weight

        L1 = sum([K.sum(K.abs(param)) for param in self.params])
        L2 = sum([K.sum(K.square(param)) for param in self.params])

        params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight

        cost += params_regular

        return cost

    def build_trainer_with_model_parallel(self, src, src_mask, trg, trg_mask, ite, ps_device, devices,
                                          l1_reg_weight=1e-6, l2_reg_weight=1e-6):
        assert K._BACKEND == 'tensorflow'

        src_mask_3d = K.expand_dims(src_mask)
        trg_mask_3d = K.expand_dims(trg_mask)

        # compute loss and grads
        loss = self.calc_loss_with_model_parallel(src,
                                                  src_mask_3d,
                                                  trg,
                                                  trg_mask_3d,
                                                  ps_device=ps_device,
                                                  devices=devices,
                                                  l1_reg_weight=l1_reg_weight,
                                                  l2_reg_weight=l2_reg_weight)

        grads = tf.gradients(loss, self.params, colocate_gradients_with_ops=True)

        grads = grad_clip(grads, self.clip_c)
        updates = adadelta(self.params, grads)
        inps = [src, src_mask, trg, trg_mask]

        self.train_fn = K.function(inps,
                                   [loss],
                                   updates=updates)

    def calc_loss_with_model_parallel(self, src, src_mask_3d, trg, trg_mask_3d, ps_device, devices, l1_reg_weight=1e-6,
                                      l2_reg_weight=1e-6):
        assert K._BACKEND == 'tensorflow'
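        # Model parallelism: the recurrent encoder/decoder run on devices[0], while the large
        # output softmax is computed across devices via get_logits_with_multiple_devices.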

        with tf.device(devices[0]):

            annotations = self.encoder.apply(src, src_mask_3d)

            init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)

            trg_emb = self.table_trg.apply(trg)
            # shift_right assumes a 3D tensor, and time steps is dimension one
            trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                                                   [1, 0, 2])
            hiddens, readout, alignment = self.decoder.run_pipeline(
                state_below=trg_emb_shifted,
                mask_below=trg_mask_3d,
                init_context=init_context,
                c=annotations,
                c_mask=src_mask_3d)

            if self.dropout > 0.:
                logger.info('Apply dropout with p = {}'.format(self.dropout))
                readout = Dropout(readout, self.dropout)

        logits = self.logistic_layer.get_logits_with_multiple_devices(readout, ps_device, devices)

        with tf.device(devices[0]):
            logits_flat = K.reshape(logits, shape=(-1, self.logistic_layer.n_out))
            cost = get_category_cross_entropy_from_flat_logits(logits_flat, trg, trg_mask_3d)

        if self.with_reconstruction:
            with tf.device(devices[0]):
                inverse_init_context = K.sum(hiddens * trg_mask_3d, axis=0) / K.sum(trg_mask_3d, axis=0)
                src_emb = self.table_src.apply(src)
                src_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(
                    src_emb, [1, 0, 2])), [1, 0, 2])
                inverse_hiddens, inverse_readout, inverse_alignment = self.inverse_decoder.run_pipeline(
                    state_below=src_emb_shifted,
                    mask_below=src_mask_3d,
                    init_context=inverse_init_context,
                    c=hiddens,
                    c_mask=trg_mask_3d)
            with tf.device(devices[0]):
                if self.dropout > 0.:
                    inverse_readout = Dropout(inverse_readout, self.dropout)

            inverse_logits = self.inverse_logistic_layer.get_logits_with_multiple_devices(inverse_readout, ps_device,
                                                                                          devices)
            with tf.device(devices[0]):
                inverse_logits_flat = K.reshape(inverse_logits, shape=(-1, self.inverse_logistic_layer.n_out))
                reconstruction_cost = get_category_cross_entropy_from_flat_logits(inverse_logits_flat, src, src_mask_3d)

            with tf.device(devices[0]):
                cost += reconstruction_cost * self.reconstruction_weight

        L1 = sum([K.sum(K.abs(param)) for param in self.params])
        L2 = sum([K.sum(K.square(param)) for param in self.params])

        params_regular = L1 * l1_reg_weight + L2 * l2_reg_weight

        cost += params_regular

        return cost

    def build_trainer(self, src, src_mask, trg, trg_mask, ite,
                      l1_reg_weight=1e-6,
                      l2_reg_weight=1e-6,
                      softmax_output_num_sampled=100000):

        src_mask_3d = K.expand_dims(src_mask)
        trg_mask_3d = K.expand_dims(trg_mask)
        annotations = self.encoder.apply(src, src_mask_3d)
        
        # init_context = annotations[0, :, -self.n_hids_src:]
        # modification #1
        # mean pooling
        init_context = K.sum(annotations * src_mask_3d, axis=0) / K.sum(src_mask_3d, axis=0)

        trg_emb = self.table_trg.apply(trg)
        # shift_right assumes a 3D tensor, and time steps is dimension one
        trg_emb_shifted = K.permute_dimensions(K.shift_right(K.permute_dimensions(trg_emb, [1, 0, 2])),
                                               [1, 0, 2])

        hiddens, readout, _ = self.decoder.run_pipeline(state_below=trg_emb_shifted,
                                                        mask_below=trg_mask_3d,
                                                        init_context=init_context,
                                                        c=annotations,
                                                        c_mask=src_mask_3d)
        # apply dropout
        if self.dropout > 0.:
            logger.info('Apply dropout with p = {}'.format(self.dropout))
            readout = Dropout(readout, self.dropout)

        self.cost = calc_loss_from_readout(readout=readout,
                                           targets=trg,
                                           targets_mask=trg_mask_3d,
                                           logisticRegressionLayer=self.logistic_layer,
                                           softmax_output_num_sampled=softmax_output_num_sampled)
        # for reconstruction

        self.L1 = sum([K.sum(K.abs(param)) for param in self.params])
        self.L2 = sum([K.sum(K.square(param)) for param in self.params])

        params_regular = self.L1 * l1_reg_weight + self.L2 * l2_reg_weight

        # train cost
        train_cost = self.cost + params_regular

        # gradients
        grads = K.gradients(train_cost, self.params)

        # apply gradient clipping here
        grads = grad_clip(grads, self.clip_c)

        # updates
        updates = adadelta(self.params, grads)

        # train function
        inps = [src, src_mask, trg, trg_mask]

        self.train_fn = K.function(inps, [train_cost], updates=updates, name='train_func')

    def build_sampler(self):

        # time steps, nb_samples
        x = K.placeholder((None, None), dtype='int32')

        c = self.encoder.apply(x, None)  # None,None,None

        init_context = K.mean(c, axis=0)  # None,None

        init_state = self.decoder.create_init_state(init_context)

        outs = [init_state, c]


        if not self.with_attention:
            outs.append(init_context)

        # compile function
        logger.info('Building compile_init_state_and_context function ...')
        self.compile_init_and_context = K.function([x], outs)
        logger.info('Done')

        if self.with_attention:
            c = K.placeholder((None, None, None))
            init_context = K.mean(c, axis=0)
        else:
            init_context = K.placeholder((None, None))
        # nb_samples
        y = K.placeholder((None,), dtype='int32')
        # nb_samples, state_dim
        cur_state = K.placeholder((None, None))
        # if it is the first word, emb should be all zero, and it is indicated by -1
        trg_emb = lookup_table(self.table_trg.W, y, name='trg_emb')

        if self.with_attention and self.with_coverage:
            cov_before = K.placeholder(shape=(None, None, None))
            if self.coverage_type == 'linguistic':
                logger.info('Building compile_fertility ...')
                fertility = self.decoder._get_fertility(c)
                self.compile_fertility = K.function([c], [fertility])
                logger.info('Done')
            else:
                fertility = None
        else:
            cov_before = None
            fertility = None

        # apply one step
        results = self.decoder.apply(state_below=trg_emb,
                                     init_state=cur_state,
                                     init_context=None if self.with_attention else init_context,
                                     c=c if self.with_attention else None,
                                     one_step=True,
                                     cov_before=cov_before,
                                     fertility=fertility)
        next_state = results[0]
        if self.with_attention:
            ctxs, alignment = results[1], results[2]
            if self.with_coverage:
                cov = results[3]
        else:
            # if with_attention=False, we always use init_context as the source representation
            ctxs = init_context

        readout = self.decoder.readout(next_state, ctxs, trg_emb)

        # maxout
        if self.maxout_part > 1:
            readout = self.decoder.one_step_maxout(readout)

        # compute the softmax probability
        next_probs = get_probs_from_logits(self.logistic_layer.get_logits(readout))

        # sample from softmax distribution to get the sample
        # TODO: batch_size* nb_classes
        next_sample = K.argmax(K.random_multinomial(pvals=next_probs))

        # compile function
        logger.info('Building compile_next_state_and_probs function ...')
        inps = [y, cur_state]
        if self.with_attention:
            inps.append(c)
        else:
            inps.append(init_context)
        outs = [next_probs, next_state, next_sample]
        if self.with_attention:
            outs.append(alignment)
            if self.with_coverage:
                inps.append(cov_before)
                outs.append(cov)

        self.compile_next_state_and_probs = K.function(inps, outs)
        logger.info('Done')

        # for reconstruction
        if self.with_reconstruction:
            if self.with_attention:
                # time steps, nb_samples, context_dim
                inverse_c = K.placeholder((None, None, None))
                # mean pooling
                inverse_init_context = K.mean(inverse_c, axis=0)
            else:
                inverse_init_context = K.placeholder((None, None))

            inverse_init_state = self.inverse_decoder.create_init_state(inverse_init_context)

            outs = [inverse_init_state]
            if not self.with_attention:
                outs.append(inverse_init_context)

            # compile function
            logger.info('Building compile_inverse_init_state_and_context function ...')
            self.compile_inverse_init_and_context = K.function([inverse_c], outs)
            logger.info('Done')

            # nb_samples
            src = K.placeholder(shape=(None,), dtype='int32')
            # nb_samples, state_dim
            inverse_cur_state = K.placeholder(shape=(None, None))
            # time_steps, nb_samples
            trg_mask = K.placeholder(shape=(None, None))
            # to 3D mask
            trg_mask_3d = K.expand_dims(trg_mask)
            # on the first step, the source embedding should be all zeros, which is indicated by an index of -1
            src_emb = lookup_table(self.table_src.W, src, name='src_emb')

            # apply one step
            inverse_results = self.inverse_decoder.apply(state_below=src_emb,
                                                         init_state=inverse_cur_state,
                                                         init_context=None if self.with_attention else inverse_init_context,
                                                         c=inverse_c if self.with_attention else None,
                                                         c_mask=trg_mask_3d,
                                                         one_step=True)

            inverse_next_state = inverse_results[0]
            if self.with_attention:
                inverse_ctxs, inverse_alignment = inverse_results[1], inverse_results[2]
            else:
                # if with_attention=False, we always use inverse_init_context as the source representation
                inverse_ctxs = inverse_init_context

            inverse_readout = self.inverse_decoder.readout(inverse_next_state, inverse_ctxs, src_emb)

            # maxout
            if self.maxout_part > 1:
                inverse_readout = self.inverse_decoder.one_step_maxout(inverse_readout)

            # apply dropout
            if self.dropout > 0.:
                inverse_readout = Dropout(inverse_readout, self.dropout)

            # compute the softmax probability
            inverse_next_probs = get_probs_from_logits(self.inverse_logistic_layer.get_logits(inverse_readout))

            # sample from softmax distribution to get the sample
            inverse_next_sample = K.argmax(K.random_multinomial(pvals=inverse_next_probs))

            # compile function
            logger.info('Building compile_inverse_next_state_and_probs function ...')
            inps = [src, trg_mask, inverse_cur_state]
            if self.with_attention:
                inps.append(inverse_c)
            else:
                inps.append(inverse_init_context)
            outs = [inverse_next_probs, inverse_next_state, inverse_next_sample]
            if self.with_attention:
                outs.append(inverse_alignment)

            self.compile_inverse_next_state_and_probs = K.function(inps, outs)
            logger.info('Done')

    def save(self, path=None):
        if path is None:
            path = self.path
        # parameter names differ between the tensorflow and theano backends, so normalise them first
        val = dict([(self.norm_para_name(value.name), K.get_value(value)) for value in self.params])
        logger.info("save the model {}".format(path))
        numpy.savez(path, **val)

    def norm_para_name(self, name):
        # strip the backend-specific suffix, e.g. TensorFlow's 'LR_W:0' -> 'LR_W'
        pos = name.find(':')
        if pos != -1:
            return name[:pos]
        else:
            return name

    def hot_fix_parameter_names(self, params):
        new_model_parameters = {}
        for k in params.keys():
            val = params[k]
            new_name = self.norm_para_name(k)
            new_model_parameters[new_name] = val

        return new_model_parameters

    def load(self, path=None):
        if path is None:
            path = self.path
        if os.path.isfile(path):
            logger.info("load params {}".format(path))
            val = numpy.load(path)
            val = self.hot_fix_parameter_names(val)
            for _, param in enumerate(self.params):
                param_name = self.norm_para_name(param.name)
                logger.info('Loading {} with shape {}'.format(param_name, K.get_value(param).shape))
                if param_name not in val.keys():
                    logger.info('Param {} (shape {}) not found in file, keeping its initial value'.format(
                        param_name, K.get_value(param).shape))
                    continue
                if K.get_value(param).shape != val[param_name].shape:
                    logger.info("Error: shape mismatch for {}: model {} vs loaded {}".format(
                        param_name, K.get_value(param).shape, val[param_name].shape))
                    raise Exception("loading params shape mismatch")
                else:
                    K.set_value(param, val[param_name])
        else:
            logger.warning("file {} does not exist, ignoring load".format(path))
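
# Usage sketch (illustration only, not from the original snippet): saving and reloading
# parameters with the helpers above, assuming `model` is a constructed EncoderDecoder
# and the save path ends in '.npz' so that save() and load() refer to the same file.
#
#     model.save('nmt_model.npz')   # parameter names are normalised before writing
#     model.load('nmt_model.npz')   # shape-checked restore; missing params keep their init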
Code example #5
def train(config):
    if not os.path.exists(config.out):
        os.makedirs(config.out)

    comp_transform = transforms.Compose([
        transforms.CenterCrop(config.crop),
        transforms.Resize(config.resize),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    domain_a_train = CustomDataset(os.path.join(config.root, 'trainA.txt'), transform=comp_transform)
    domain_b_train = CustomDataset(os.path.join(config.root, 'trainB.txt'), transform=comp_transform)

    # use float fill values: BCELoss targets and the zero "separate" code must be float tensors
    a_label = torch.full((config.bs,), 1.)
    b_label = torch.full((config.bs,), 0.)
    b_separate = torch.full((config.bs,
                             config.sep,
                             config.resize // (2 ** (config.n_blocks + 1)),
                             config.resize // (2 ** (config.n_blocks + 1))), 0.)

    # build networks
    e1 = E1(sep=config.sep, size=config.resize)
    e2 = E2(n_feats=config.n_tot_feats, sep=config.sep)
    decoder = Decoder(n_feats=config.n_tot_feats)
    disc = Disc(size=config.resize, sep=config.sep)
    rho_clipper = RhoClipper(0., 1.)
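    # RhoClipper is presumably a weight-constraint functor: applied via decoder.apply()
    # after each discriminator step, it clamps the decoder's rho parameters to [0, 1].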

    mse = nn.MSELoss()
    bce = nn.BCELoss()

    if torch.cuda.is_available():
        e1 = e1.cuda()
        e2 = e2.cuda()
        decoder = decoder.cuda()
        disc = disc.cuda()

        a_label = a_label.cuda()
        b_label = b_label.cuda()
        b_separate = b_separate.cuda()

        mse = mse.cuda()
        bce = bce.cuda()

    ae_params = list(e1.parameters()) + list(e2.parameters()) + list(decoder.parameters())
    ae_optimizer = optim.Adam(ae_params, lr=config.lr,
                              betas=(config.beta1, config.beta2), eps=config.eps)

    # materialise the parameter generator so it can be reused for gradient clipping below
    disc_params = list(disc.parameters())
    disc_optimizer = optim.Adam(disc_params, lr=config.d_lr,
                                betas=(config.beta1, config.beta2), eps=config.eps)

    _iter: int = 0
    if config.load != '':
        save_file = os.path.join(config.load, 'checkpoint')
        _iter = load_model(save_file, e1, e2, decoder, ae_optimizer, disc, disc_optimizer)

    e1 = e1.train()
    e2 = e2.train()
    decoder = decoder.train()
    disc = disc.train()

    print('[*] Started training...')
    while True:
        domain_a_loader = torch.utils.data.DataLoader(domain_a_train, batch_size=config.bs,
                                                      shuffle=True, num_workers=config.n_threads)
        domain_b_loader = torch.utils.data.DataLoader(domain_b_train, batch_size=config.bs,
                                                      shuffle=True, num_workers=config.n_threads)
        if _iter >= config.iters:
            break

        for domain_a_img, domain_b_img in zip(domain_a_loader, domain_b_loader):
            if domain_a_img.size(0) != config.bs or domain_b_img.size(0) != config.bs:
                break

            domain_a_img = Variable(domain_a_img)
            domain_b_img = Variable(domain_b_img)

            if torch.cuda.is_available():
                domain_a_img = domain_a_img.cuda()
                domain_b_img = domain_b_img.cuda()

            domain_a_img = domain_a_img.view((-1, 3, config.resize, config.resize))
            domain_b_img = domain_b_img.view((-1, 3, config.resize, config.resize))

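            # ---- autoencoder / adversarial (generator) update ----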
            ae_optimizer.zero_grad()

            a_common = e1(domain_a_img)
            a_separate = e2(domain_a_img)
            a_encoding = torch.cat([a_common, a_separate], dim=1)

            b_common = e1(domain_b_img)
            b_encoding = torch.cat([b_common, b_separate], dim=1)

            a_decoding = decoder(a_encoding)
            b_decoding = decoder(b_encoding)

            g_loss = mse(a_decoding, domain_a_img) + mse(b_decoding, domain_b_img)

            preds_a = disc(a_common)
            preds_b = disc(b_common)
            g_loss += config.adv_weight * (bce(preds_a, b_label) + bce(preds_b, b_label))

            g_loss.backward()
            torch.nn.utils.clip_grad_norm_(ae_params, 5.)
            ae_optimizer.step()

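            # ---- discriminator update: the common codes are re-encoded so the
            # discriminator sees current encoder outputs; only its own parameters
            # are stepped by disc_optimizer ----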
            disc_optimizer.zero_grad()

            a_common = e1(domain_a_img)
            b_common = e1(domain_b_img)

            disc_a = disc(a_common)
            disc_b = disc(b_common)

            d_loss = bce(disc_a, a_label) + bce(disc_b, b_label)

            d_loss.backward()
            torch.nn.utils.clip_grad_norm_(disc_params, 5.)
            disc_optimizer.step()

            decoder.apply(rho_clipper)

            if _iter % config.progress_iter == 0:
                print('[*] [%07d/%07d] d_loss : %.4f, g_loss : %.4f' %
                      (_iter, config.iters, d_loss.item(), g_loss.item()))

            if _iter % config.display_iter == 0:
                e1 = e1.eval()
                e2 = e2.eval()
                decoder = decoder.eval()

                save_images(config, e1, e2, decoder, _iter)

                e1 = e1.train()
                e2 = e2.train()
                decoder = decoder.train()

            if _iter % config.save_iter == 0:
                save_file = os.path.join(config.out, 'checkpoint')
                save_model(save_file, e1, e2, decoder, ae_optimizer, disc, disc_optimizer, _iter)

            _iter += 1
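

# Usage sketch (assumption, not part of the original snippet): the flag names below
# mirror the config attributes read inside train(); the defaults are illustrative and
# may differ from the original project.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    # data / IO
    parser.add_argument('--root', default='datasets', help='folder containing trainA.txt / trainB.txt')
    parser.add_argument('--out', default='output', help='directory for checkpoints and sample images')
    parser.add_argument('--load', default='', help='directory of a checkpoint to resume from')
    # preprocessing
    parser.add_argument('--crop', type=int, default=178)
    parser.add_argument('--resize', type=int, default=128)
    # model
    parser.add_argument('--sep', type=int, default=25, help='number of domain-specific (separate) channels')
    parser.add_argument('--n_tot_feats', type=int, default=512)
    parser.add_argument('--n_blocks', type=int, default=4)
    # optimisation
    parser.add_argument('--bs', type=int, default=32)
    parser.add_argument('--lr', type=float, default=2e-4)
    parser.add_argument('--d_lr', type=float, default=2e-4)
    parser.add_argument('--beta1', type=float, default=0.5)
    parser.add_argument('--beta2', type=float, default=0.999)
    parser.add_argument('--eps', type=float, default=1e-8)
    parser.add_argument('--adv_weight', type=float, default=0.01)
    parser.add_argument('--iters', type=int, default=1250000)
    parser.add_argument('--n_threads', type=int, default=4)
    # logging
    parser.add_argument('--progress_iter', type=int, default=100)
    parser.add_argument('--display_iter', type=int, default=1000)
    parser.add_argument('--save_iter', type=int, default=10000)

    train(parser.parse_args())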