Example #1
def create_o_test_model(train_model, examples, hidden_size, embed_size, glove,
                        batch_size=64, prem_len=22):
    graph = Graph()

    # Stateful hypothesis decoder: fed one token per step at generation time
    hypo_layer = LSTM(output_dim=hidden_size,
                      batch_input_shape=(batch_size, 1, embed_size),
                      return_sequences=True, stateful=True, trainable=False)

    graph.add_input(name='hypo_input', batch_input_shape=(batch_size, 1), dtype='int32')
    graph.add_node(make_fixed_embeddings(glove, 1), name='hypo_word_vec', input='hypo_input')
    graph.add_node(hypo_layer, name='hypo', input='hypo_word_vec')
    
    graph.add_input(name='premise', batch_input_shape=(batch_size, prem_len, embed_size))
    graph.add_input(name='creative', batch_input_shape=(batch_size, embed_size))

    # Attention over the pre-computed premise states; weights are copied from train_model below
    attention = LstmAttentionLayer(hidden_size, return_sequences=True, stateful=True,
                                   trainable=False, feed_state=False)
    graph.add_node(attention, name='attention',
                   inputs=['premise', 'hypo', 'creative'], merge_mode='join')

    graph.add_input(name='train_input', batch_input_shape=(batch_size, 1), dtype='int32')
    hs = HierarchicalSoftmax(len(glove), input_dim=hidden_size, input_length=1,
                             trainable=False)
    graph.add_node(hs, name='softmax',
                   inputs=['attention', 'train_input'], merge_mode='join')
    graph.add_output(name='output', input='softmax')
    
    # Copy the trained weights into the test-time layers
    hypo_layer.set_weights(train_model.nodes['hypo'].get_weights())
    attention.set_weights(train_model.nodes['attention'].get_weights())
    hs.set_weights(train_model.nodes['softmax'].get_weights())
    
    graph.compile(loss={'output': hs_categorical_crossentropy}, optimizer='adam')
    
    # Theano functions that expose the training graph's premise encoder and
    # noise/class merge, so their activations can be fed into this test graph
    func_premise = theano.function([train_model.inputs['premise_input'].get_input()],
                                   train_model.nodes['premise'].get_output(False),
                                   allow_input_downcast=True)
    func_noise = theano.function([train_model.inputs['noise_input'].get_input(),
                                  train_model.inputs['class_input'].get_input()],
                                 train_model.nodes['creative'].get_output(False),
                                 allow_input_downcast=True)

    return graph, func_premise, func_noise
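
A rough sketch of how the three return values might be used at generation time. Everything named below is an assumption for illustration: train_model would come from gen_train further down, and premise_word_ids, noise_ids, class_one_hot, and start_token_id are placeholders for data the caller prepares.

import numpy as np

# Hypothetical call; hidden/embedding sizes must match those used for train_model
graph, func_premise, func_noise = create_o_test_model(
    train_model, examples, hidden_size=150, embed_size=300, glove=glove)

# Pre-compute the premise encoding and the noise/class 'creative' vector with the
# training graph, then feed the hypothesis one token at a time to the stateful graph
premise_states = func_premise(premise_word_ids)
creative = func_noise(noise_ids, class_one_hot)

token = np.full((64, 1), start_token_id, dtype='int32')   # batch_size defaults to 64
scores = graph.predict({'premise': premise_states,
                        'creative': creative,
                        'hypo_input': token,
                        'train_input': token},
                       batch_size=64)['output']
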
def gen_train(noise_examples, hidden_size, noise_dim, glove, hypo_len,
              version):
    if version == 9:
        return baseline_train(noise_examples, hidden_size, noise_dim, glove,
                              hypo_len, version)
    elif version == 6 or version == 7:
        return autoe_train(hidden_size, noise_dim, glove, hypo_len, version)

    prem_input = Input(shape=(None, ), dtype='int32', name='prem_input')
    hypo_input = Input(shape=(hypo_len + 1, ),
                       dtype='int32',
                       name='hypo_input')
    noise_input = Input(shape=(1, ), dtype='int32', name='noise_input')
    train_input = Input(shape=(None, ), dtype='int32', name='train_input')
    class_input = Input(shape=(3, ), name='class_input')

    prem_embeddings = make_fixed_embeddings(glove, None)(prem_input)
    hypo_embeddings = make_fixed_embeddings(glove, hypo_len + 1)(hypo_input)
    premise_layer = LSTM(output_dim=hidden_size,
                         return_sequences=True,
                         inner_activation='sigmoid',
                         name='premise')(prem_embeddings)

    hypo_layer = LSTM(output_dim=hidden_size,
                      return_sequences=True,
                      inner_activation='sigmoid',
                      name='hypo')(hypo_embeddings)
    noise_layer = Embedding(noise_examples,
                            noise_dim,
                            input_length=1,
                            name='noise_embeddings')(noise_input)
    flat_noise = Flatten(name='noise_flatten')(noise_layer)
    if version == 8:
        create_input = merge([class_input, flat_noise], mode='concat')
    elif version == 5:
        create_input = flat_noise

    creative = Dense(hidden_size, name='cmerge')(create_input)
    attention = LstmAttentionLayer(
        output_dim=hidden_size,
        return_sequences=True,
        feed_state=True,
        name='attention')([hypo_layer, premise_layer, creative])

    hs = HierarchicalSoftmax(len(glove), trainable=True,
                             name='hs')([attention, train_input])

    inputs = [prem_input, hypo_input, noise_input, train_input, class_input]
    if version == 5:
        inputs = inputs[:4]

    model_name = 'version' + str(version)
    model = Model(input=inputs, output=hs, name=model_name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')

    return model
def baseline_train(noise_examples, hidden_size, noise_dim, glove, hypo_len,
                   version):
    prem_input = Input(shape=(None, ), dtype='int32', name='prem_input')
    hypo_input = Input(shape=(hypo_len + 1, ),
                       dtype='int32',
                       name='hypo_input')
    noise_input = Input(shape=(1, ), dtype='int32', name='noise_input')
    train_input = Input(shape=(None, ), dtype='int32', name='train_input')
    class_input = Input(shape=(3, ), name='class_input')
    concat_dim = hidden_size + noise_dim + 3
    prem_embeddings = make_fixed_embeddings(glove, None)(prem_input)
    hypo_embeddings = make_fixed_embeddings(glove, hypo_len + 1)(hypo_input)

    premise_layer = LSTM(output_dim=hidden_size,
                         return_sequences=False,
                         inner_activation='sigmoid',
                         name='premise')(prem_embeddings)

    noise_layer = Embedding(noise_examples,
                            noise_dim,
                            input_length=1,
                            name='noise_embeddings')(noise_input)
    flat_noise = Flatten(name='noise_flatten')(noise_layer)
    merged = merge([premise_layer, class_input, flat_noise], mode='concat')
    creative = Dense(concat_dim, name='cmerge')(merged)
    fake_merge = Lambda(lambda x: x[0], output_shape=lambda x: x[0])(
        [hypo_embeddings, creative])
    hypo_layer = FeedLSTM(output_dim=concat_dim,
                          return_sequences=True,
                          feed_layer=creative,
                          inner_activation='sigmoid',
                          name='attention')([fake_merge])

    hs = HierarchicalSoftmax(len(glove), trainable=True,
                             name='hs')([hypo_layer, train_input])
    inputs = [prem_input, hypo_input, noise_input, train_input, class_input]

    model_name = 'version' + str(version)
    model = Model(input=inputs, output=hs, name=model_name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')

    return model
def baseline_test(train_model, glove, batch_size):
    version = int(train_model.name[-1])
    hidden_size = train_model.get_layer('attention').output_shape[-1]

    premise_input = Input(batch_shape=(batch_size, None, None))
    hypo_input = Input(batch_shape=(batch_size, 1), dtype='int32')
    creative_input = Input(batch_shape=(batch_size, None))
    train_input = Input(batch_shape=(batch_size, 1), dtype='int32')

    hypo_embeddings = make_fixed_embeddings(glove, 1)(hypo_input)
    hypo_layer = FeedLSTM(output_dim=hidden_size,
                          return_sequences=True,
                          stateful=True,
                          trainable=False,
                          feed_layer=premise_input,
                          name='attention')([hypo_embeddings])
    hs = HierarchicalSoftmax(len(glove), trainable=False,
                             name='hs')([hypo_layer, train_input])

    inputs = [hypo_input, creative_input, train_input]
    outputs = [hs]

    model = Model(input=inputs, output=outputs, name=train_model.name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')

    update_gen_weights(model, train_model)
    f_inputs = [
        train_model.get_layer('noise_embeddings').output,
        train_model.get_layer('class_input').input,
        train_model.get_layer('prem_input').input
    ]
    func_noise = theano.function(f_inputs,
                                 train_model.get_layer('cmerge').output,
                                 allow_input_downcast=True)

    return model, None, func_noise
def autoe_train(hidden_size, noise_dim, glove, hypo_len, version):

    prem_input = Input(shape=(None, ), dtype='int32', name='prem_input')
    hypo_input = Input(shape=(hypo_len + 1, ),
                       dtype='int32',
                       name='hypo_input')
    train_input = Input(shape=(None, ), dtype='int32', name='train_input')
    class_input = Input(shape=(3, ), name='class_input')

    prem_embeddings = make_fixed_embeddings(glove, None)(prem_input)
    hypo_embeddings = make_fixed_embeddings(glove, hypo_len + 1)(hypo_input)
    premise_encoder = LSTM(output_dim=hidden_size,
                           return_sequences=True,
                           inner_activation='sigmoid',
                           name='premise_encoder')(prem_embeddings)

    hypo_encoder = LSTM(output_dim=hidden_size,
                        return_sequences=True,
                        inner_activation='sigmoid',
                        name='hypo_encoder')(hypo_embeddings)
    class_encoder = Dense(hidden_size, activation='tanh')(class_input)

    encoder = LstmAttentionLayer(
        output_dim=hidden_size,
        return_sequences=False,
        feed_state=True,
        name='encoder')([hypo_encoder, premise_encoder, class_encoder])
    if version == 6:
        reduction = Dense(noise_dim, name='reduction',
                          activation='tanh')(encoder)
    elif version == 7:
        z_mean = Dense(noise_dim, name='z_mean')(encoder)
        z_log_sigma = Dense(noise_dim, name='z_log_sigma')(encoder)

        def sampling(args):
            z_mean, z_log_sigma = args
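            # NOTE: the noise shape hard-codes a batch size of 64, so this
            # model effectively only trains with batches of that size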
            epsilon = K.random_normal(shape=(
                64,
                noise_dim,
            ),
                                      mean=0.,
                                      std=0.01)
            return z_mean + K.exp(z_log_sigma) * epsilon

        reduction = Lambda(sampling,
                           output_shape=lambda sh: (
                               sh[0][0],
                               noise_dim,
                           ),
                           name='reduction')([z_mean, z_log_sigma])

        def vae_loss(args):
            z_mean, z_log_sigma = args
            return -0.5 * K.mean(
                1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma),
                axis=-1)

        vae = Lambda(vae_loss,
                     output_shape=lambda sh: (
                         sh[0][0],
                         1,
                     ),
                     name='vae_output')([z_mean, z_log_sigma])

    merged = merge([class_input, reduction], mode='concat')
    creative = Dense(hidden_size, name='expansion', activation='tanh')(merged)
    premise_decoder = LSTM(output_dim=hidden_size,
                           return_sequences=True,
                           inner_activation='sigmoid',
                           name='premise')(prem_embeddings)

    hypo_decoder = LSTM(output_dim=hidden_size,
                        return_sequences=True,
                        inner_activation='sigmoid',
                        name='hypo')(hypo_embeddings)
    attention = LstmAttentionLayer(
        output_dim=hidden_size,
        return_sequences=True,
        feed_state=True,
        name='attention')([hypo_decoder, premise_decoder, creative])

    hs = HierarchicalSoftmax(len(glove), trainable=True,
                             name='hs')([attention, train_input])

    inputs = [prem_input, hypo_input, train_input, class_input]

    model_name = 'version' + str(version)
    model = Model(input=inputs,
                  output=(hs if version == 6 else [hs, vae]),
                  name=model_name)
    if version == 6:
        model.compile(loss=hs_categorical_crossentropy, optimizer='adam')
    elif version == 7:

        def minimize(y_true, y_pred):
            return y_pred

        def metric(y_true, y_pred):
            return K.mean(y_pred)

        model.compile(loss=[hs_categorical_crossentropy, minimize],
                      metrics={
                          'hs': word_loss,
                          'vae_output': metric
                      },
                      optimizer='adam')
    return model
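
For version 7 the encoder output parameterizes a Gaussian latent variable: sampling applies the reparameterization trick and vae_loss contributes the KL term, which the minimize loss simply passes through so the optimizer drives it down. A standalone numpy sketch of those two pieces, mirroring the Lambda layers above (illustrative only, not part of the original model):

import numpy as np

def reparameterize(z_mean, z_log_sigma, rng=np.random):
    # z = mu + exp(log_sigma) * eps, with eps ~ N(0, 0.01^2) as in 'sampling' above
    eps = rng.normal(loc=0.0, scale=0.01, size=z_mean.shape)
    return z_mean + np.exp(z_log_sigma) * eps

def kl_term(z_mean, z_log_sigma):
    # KL divergence from the unit Gaussian, averaged over latent dimensions,
    # exactly as computed by 'vae_loss' above
    return -0.5 * np.mean(1 + z_log_sigma - np.square(z_mean) - np.exp(z_log_sigma),
                          axis=-1)
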
def gen_test(train_model, glove, batch_size):

    version = int(train_model.name[-1])
    if version == 9:
        return baseline_test(train_model, glove, batch_size)
    hidden_size = train_model.get_layer('premise').output_shape[-1]

    premise_input = Input(batch_shape=(batch_size, None, None))
    hypo_input = Input(batch_shape=(batch_size, 1), dtype='int32')
    creative_input = Input(batch_shape=(batch_size, None))
    train_input = Input(batch_shape=(batch_size, 1), dtype='int32')

    hypo_embeddings = make_fixed_embeddings(glove, 1)(hypo_input)

    hypo_layer = LSTM(output_dim=hidden_size,
                      return_sequences=True,
                      stateful=True,
                      unroll=False,
                      trainable=False,
                      inner_activation='sigmoid',
                      name='hypo')(hypo_embeddings)

    att_inputs = [hypo_layer, premise_input] if version == 5 else [
        hypo_layer, premise_input, creative_input
    ]
    attention = LstmAttentionLayer(output_dim=hidden_size,
                                   return_sequences=True,
                                   stateful=True,
                                   unroll=False,
                                   trainable=False,
                                   feed_state=False,
                                   name='attention')(att_inputs)

    hs = HierarchicalSoftmax(len(glove), trainable=False,
                             name='hs')([attention, train_input])

    inputs = [premise_input, hypo_input, creative_input, train_input]
    outputs = [hs]

    model = Model(input=inputs, output=outputs, name=train_model.name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')

    update_gen_weights(model, train_model)

    func_premise = theano.function([train_model.get_layer('prem_input').input],
                                   train_model.get_layer('premise').output,
                                   allow_input_downcast=True)
    if version == 5 or version == 8:
        f_inputs = [train_model.get_layer('noise_embeddings').output]
        if version == 8:
            f_inputs += [train_model.get_layer('class_input').input]

        func_noise = theano.function(f_inputs,
                                     train_model.get_layer('cmerge').output,
                                     allow_input_downcast=True)
    elif version == 6 or version == 7:
        noise_input = train_model.get_layer('reduction').output
        class_input = train_model.get_layer('class_input').input
        noise_output = train_model.get_layer('expansion').output

        func_noise = theano.function([noise_input, class_input],
                                     noise_output,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')

    return model, func_premise, func_noise
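
An end-to-end sketch of how gen_train and gen_test fit together, here for version 8. The data arrays (premise_word_ids, noise_vectors, class_one_hot) and the hyper-parameter values are placeholders, and training or weight loading is elided:

# Training model: per-example noise embedding plus class label (version 8)
train_model = gen_train(noise_examples=100000, hidden_size=150, noise_dim=50,
                        glove=glove, hypo_len=12, version=8)
# ... fit train_model or load saved weights here ...

# Stateful single-step decoder sharing the trained weights
test_model, func_premise, func_noise = gen_test(train_model, glove, batch_size=64)

premise_states = func_premise(premise_word_ids)      # 'premise' LSTM activations
# For version 8, func_noise takes the noise-embedding activations and the class one-hots
creative = func_noise(noise_vectors, class_one_hot)  # 'cmerge' activations
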
Example #8
class Skipgram(nn.Module):
    """
    Skipgram model

    Args:
        hidden_layer_size: The size of each word vector (the second dimension of the embedding matrix).
        vocab_size: The vocabulary size. This should be the size of your word dictionary.
    """
    def __init__(self, hidden_layer_size, vocab_size, huffman_tree=None):
        super(Skipgram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size,
                                       hidden_layer_size,
                                       sparse=True)

        if huffman_tree is None:
            self.softmax_layer = nn.Linear(hidden_layer_size, vocab_size)
            self.use_hierarchical_softmax = False
        else:
            self.softmax_layer = HierarchicalSoftmax(huffman_tree)
            self.use_hierarchical_softmax = True

    def forward(self, input, id_list=None):
        if self.use_hierarchical_softmax:
            word_vector = self.embeddings(input).squeeze()
            probabilities = self.softmax_layer(word_vector, id_list.squeeze())
        else:
            word_vector = self.embeddings(input).squeeze(1)
            probabilities = self.softmax_layer(word_vector)

        return probabilities

    def lookup(self, word, word_dictionary):
        """
        Extracts the word vector for a word given the word and a dictionary that converts
        words to word ids.

        Args:
            word: The word whose vector you want.
            word_dictionary: A dictionary from words to id numbers.
        """
        word_id = word_dictionary[word]
        start_vec = Variable(torch.LongTensor([word_id]).unsqueeze(0)).cuda()

        return self.embeddings(start_vec).squeeze()

    def backprop(self, id_list, lr):
        """
            Applies stochastic gradient descent to the weights that involve the id_list. Backwards
            should have been called before this. The reason to use this instead of an optimizer is
            to avoid iterating over all parameters.
        """
        if not self.use_hierarchical_softmax:
            raise ValueError(
                'You can only call backprop when using hierarchical softmax.')

        self.softmax_layer.backprop(id_list, lr)

        for p in self.embeddings.parameters():
            p.data = p.data + (-lr) * p.grad.data
            # zero gradients after we make the calculation
            p.grad.data.zero_()
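
A minimal usage sketch for the class above, assuming CUDA is available and that word_dictionary (word -> id), huffman_tree (a Huffman tree over the vocabulary), and path_ids (the Huffman node ids for a context word) already exist; the loss shown is just one plausible choice:

import torch
from torch.autograd import Variable

model = Skipgram(hidden_layer_size=100, vocab_size=len(word_dictionary),
                 huffman_tree=huffman_tree).cuda()

center = Variable(torch.LongTensor([[word_dictionary['king']]])).cuda()
targets = Variable(torch.LongTensor([path_ids])).cuda()

probs = model(center, targets)        # hierarchical-softmax probabilities
loss = -torch.log(probs).sum()        # negative log-likelihood of the path
loss.backward()
model.backprop(path_ids, lr=0.025)    # sparse update of only the touched rows

king_vec = model.lookup('king', word_dictionary)
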
def test_hierarchical_softmax(timesteps=15,
                              input_dim=50,
                              batch_size=32,
                              output_dim=3218,
                              batches=300,
                              epochs=30):

    model = Graph()
    model.add_input(name='real_input',
                    batch_input_shape=(batch_size, timesteps, input_dim))
    model.add_input(name='train_input',
                    batch_input_shape=(batch_size, timesteps),
                    dtype='int32')
    model.add_node(HierarchicalSoftmax(output_dim,
                                       input_dim=input_dim,
                                       input_length=timesteps),
                   name='hs',
                   inputs=['real_input', 'train_input'],
                   merge_mode='join',
                   create_output=True)

    model.compile(loss={'hs': hs_categorical_crossentropy}, optimizer='adam')
    print "hs model compiled"

    model2 = Sequential()
    model2.add(
        TimeDistributedDense(output_dim,
                             batch_input_shape=(batch_size, timesteps,
                                                input_dim)))
    model2.add(Activation('softmax'))
    model2.compile(loss='categorical_crossentropy', optimizer='adam')
    print "softmax model compiled"

    learn_f = np.random.normal(size=(input_dim, output_dim))
    learn_f = np.divide(learn_f, norm(learn_f, axis=1)[:, None])
    print "learn_f generated"

    for j in range(epochs):

        batch_data = generate_batch(learn_f, batch_size, timesteps, input_dim,
                                    output_dim, batches)

        print "Epoch", j, "data genrated"

        p = Progbar(batches * batch_size)
        for b in batch_data:
            data_train = {'real_input': b[0], 'train_input': b[1], 'hs': b[2]}
            loss = float(model.train_on_batch(data_train)[0])
            p.add(batch_size, [('hs_loss', loss)])
        p2 = Progbar(batches * batch_size)
        for b in batch_data:
            loss, acc = model2.train_on_batch(b[0], b[3], accuracy=True)
            p2.add(batch_size, [('softmax_loss', loss), ('softmax_acc', acc)])

    test_data = generate_batch(learn_f, batch_size, timesteps, input_dim,
                               output_dim, batches)

    p = Progbar(batches * batch_size)
    for b in test_data:
        # use the same target slot as in training (b[2]); b[3] holds the dense-softmax targets
        data_test = {'real_input': b[0], 'train_input': b[1], 'hs': b[2]}
        loss = float(model.test_on_batch(data_test)[0])
        p.add(batch_size, [('hs_test_loss', loss)])

    p2 = Progbar(batches * batch_size)
    for b in test_data:
        loss = float(model2.test_on_batch(b[0], b[3])[0])
        p2.add(batch_size, [('softmax_test_loss', loss)])
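
The benchmark can be invoked directly; a smaller configuration (values chosen arbitrarily) keeps the comparison quick:

if __name__ == '__main__':
    test_hierarchical_softmax(timesteps=10, input_dim=32, batch_size=32,
                              output_dim=1000, batches=50, epochs=3)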