Example 1
def keras_skip_gram(trainlist1,weight1,weight2):
    N,d=weight1.shape
    negative_num=trainlist1[0][2].shape[1]
    shared_layer1 = Embedding(input_dim=N, output_dim=d, weights=[weight1])
    #shared_layer1 is the output layer
    shared_layer2 = Embedding(input_dim=N, output_dim=d, weights=[weight2])
    #shared_layer2 is the hidden layer
    input_target = Input(shape=(1,), dtype='int32', name='input_1')
    input_source = Input(shape=(1,), dtype='int32', name='input_2')
    input_negative = Input(shape=(negative_num,),dtype='int32',name='input_3')
    target= shared_layer1(input_target)
    source= shared_layer2(input_source)
    negative= shared_layer1(input_negative)
    positive_dot = dot([source, target], axes=(2), normalize=False)
    negative_dot = dot([source, negative], axes=(2), normalize=False)
    all_dot = concatenate([positive_dot, negative_dot],axis=2)
    sigmoid_sample = Activation('sigmoid')(all_dot)
    
    model = Model(inputs=[input_target,input_source,input_negative], outputs=[sigmoid_sample])
    sgd2 = optimizers.SGD(lr=0.025, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd2)
    
    for [a1,a2,a4,y1] in trainlist1:
        loss = model.train_on_batch([a1, a2, a4], y1)
    embed_output=shared_layer1.get_weights()[0]
    embed_hidden=shared_layer2.get_weights()[0]
    return embed_output,embed_hidden
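A minimal sketch, assuming integer word ids, of how the batches consumed by keras_skip_gram above could be assembled; build_batches and the toy shapes are hypothetical, but the label layout (1 for the positive dot product, 0 for each negative) matches the concatenation order in the model.

import numpy as np

def build_batches(pairs, negatives, batch_size=32):
    # pairs: list of (target_id, source_id); negatives: matching list of negative-id lists
    batches = []
    for start in range(0, len(pairs), batch_size):
        chunk = pairs[start:start + batch_size]
        neg = np.array(negatives[start:start + batch_size], dtype='int32')
        a1 = np.array([t for t, _ in chunk], dtype='int32').reshape(-1, 1)   # target ids
        a2 = np.array([s for _, s in chunk], dtype='int32').reshape(-1, 1)   # source ids
        # one label per dot product: 1 for the positive slot, 0 for each negative slot
        y = np.concatenate([np.ones((len(chunk), 1, 1)),
                            np.zeros((len(chunk), 1, neg.shape[1]))], axis=2)
        batches.append([a1, a2, neg, y])
    return batches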
Example 2
class SkipModelNS(Model):
    def __init__(self, vocab_size, embedding_dim, num_ns=4):
        super(SkipModelNS, self).__init__()
        self.target_embedding = Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="skip_embedding",
        )

        self.context_embedding = Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_ns + 1,
        )

        self.dots = Dot(axes=(3, 2))

        self.flatten = Flatten()

    def call(self, inputs, **kwargs):
        target, context = inputs
        targets = self.target_embedding(target)
        contexts = self.context_embedding(context)
        d = self.dots([contexts, targets])
        fl = self.flatten(d)
        return fl

    def get_embedding_matrix(self):
        weights = np.array(self.target_embedding.get_weights())
        return weights
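A hedged usage sketch for SkipModelNS above. The shapes follow the TensorFlow word2vec tutorial this class mirrors: each target id comes with one positive and num_ns negative context ids (with a trailing singleton axis so the Dot axes line up), and the labels mark the positive slot; the toy random data is made up.

import numpy as np
import tensorflow as tf

vocab_size, embedding_dim, num_ns = 100, 16, 4
model = SkipModelNS(vocab_size, embedding_dim, num_ns=num_ns)
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True))

targets = np.random.randint(0, vocab_size, size=(32, 1))
contexts = np.random.randint(0, vocab_size, size=(32, num_ns + 1, 1))
labels = np.zeros((32, num_ns + 1), dtype='float32')
labels[:, 0] = 1.0                      # the first context word is the true one
model.fit((targets, contexts), labels, epochs=1, verbose=0)
embeddings = model.get_embedding_matrix()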
Example 3
class _OptimizerEmbedding(_OptimizerParametrical):
    patience = 25
    steps = 20

    def __init__(self, **kwargs):
        super(_OptimizerEmbedding, self).__init__(**kwargs)
    
    def _store_weights(self):
        with open(self.weightpath, "wb") as handle:
            _pickle.dump(self.embedding.get_weights()[0], handle)
    
    def _prep(self):
        super(_OptimizerEmbedding, self)._prep()
        self.orig_emb = K.constant(self.weights["emb"][0])

        self.embedding = Embedding(
            input_dim = self.length,
            output_dim = self.configs["emb"]["output_dim"],
            name = "emb_exp")

        if self.calc_hard:
            self.snap = SnapToClosestLayer(self.orig_emb, mode = "max", name = "snap")
            self.cosine = PairwiseCosinesLayer(self.orig_emb, name = "cosine")
            self.max = Lambda(lambda x:K.max(x, axis = -1), output_shape = lambda shape:shape[:-1], name = "max")

    def _get_output(self, embedded):
        encoded = self.encoder(embedded)
        
        if self.with_projection:
            encoded = self.projection(encoded)
        return self.selector(encoded)
    
    def _get_best_ngram(self):
        similarities = cosine_similarity(self.embedding.get_weights()[0], self.weights["emb"][0])
        return similarities.argmax(-1)
Example 4
class CbowModelNS(Model):
    def __init__(self, vocab_size, embedding_dim, num_ns, window):
        super(CbowModelNS, self).__init__()

        self.embedding_layer = Embedding(vocab_size,
                                         embedding_dim,
                                         input_length=window * 2,
                                         name="cbow_embedding")
        self.target_layer = Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=num_ns + 1)

        self.dot = Dot(axes=(3, 2))
        self.flatten = Flatten()

    def call(self, inputs, training=None, mask=None):

        context, target = inputs

        ce = self.embedding_layer(context)
        s = reduce_sum(ce, 1, keepdims=True)
        te = self.target_layer(target)
        dot = self.dot([te, s])
        return self.flatten(dot)

    def get_embedding_matrix(self):
        weights = np.array(self.embedding_layer.get_weights())
        return weights
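A similar hedged sketch for CbowModelNS above: the context carries the 2*window surrounding word ids and the target carries one positive plus num_ns negative candidates (again with a trailing singleton axis so the Dot axes line up); all data here is toy.

import numpy as np
import tensorflow as tf

vocab_size, embedding_dim, num_ns, window = 100, 16, 4, 2
model = CbowModelNS(vocab_size, embedding_dim, num_ns, window)
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True))

contexts = np.random.randint(0, vocab_size, size=(32, window * 2))
targets = np.random.randint(0, vocab_size, size=(32, num_ns + 1, 1))
labels = np.zeros((32, num_ns + 1), dtype='float32')
labels[:, 0] = 1.0                      # the first target candidate is the true word
model.fit((contexts, targets), labels, epochs=1, verbose=0)
embeddings = model.get_embedding_matrix()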
Example 5
    def test_on_masked_input(self):
        # Average over a dimension in which some elements are masked, and
        # check that they are masked correctly in the average.
        dimension_to_average = 1
        num_dimensions = 3
        sentence_length = 5
        embedding_dim = 10
        vocabulary_size = 15
        input_layer = Input(shape=(sentence_length, ), dtype='int32')
        # Embedding masks zeros
        embedding = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_dim,
                              mask_zero=True)
        encoder = AveragedBOWEncoder(dimension_to_average, num_dimensions)
        embedded_input = embedding(input_layer)
        encoded_input = encoder(embedded_input)
        encoder_mask = OutputMask()(encoded_input)
        model = Model(inputs=input_layer,
                      outputs=[encoded_input, encoder_mask])
        test_input = numpy.asarray([[0, 3, 1, 7, 10]], dtype='int32')
        embedding_weights = embedding.get_weights()[
            0]  # get_weights returns a list with one element.

        # Don't take the first element because it should be masked.
        expected_output = numpy.mean(embedding_weights[test_input[:, 1:]],
                                     axis=dimension_to_average)
        actual_output, actual_mask = model.predict(test_input)
        # The output mask for this instance should now be True.

        numpy.testing.assert_array_equal(actual_mask, numpy.array([True]))
        numpy.testing.assert_array_almost_equal(expected_output, actual_output)
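For reference, the masked average asserted above can be reproduced with plain numpy (toy random matrix): index 0 is the padding id and is simply excluded from the mean.

import numpy as np

emb = np.random.rand(15, 10)                 # (vocabulary_size, embedding_dim)
ids = np.array([[0, 3, 1, 7, 10]])           # token 0 is the masked padding index
masked_avg = emb[ids[:, 1:]].mean(axis=1)    # average over the four unmasked tokens only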
Example 6
    def test_mask_is_propagated_if_required(self):
        # Here we test averaging over a dimension which is not masked, but in which the
        # output still requires a mask.
        dimension_to_average = 2
        num_dimensions = 3
        sentence_length = 5
        embedding_dim = 10
        vocabulary_size = 15
        input_layer = Input(shape=(sentence_length, ), dtype='int32')
        # Embedding masks zeros
        embedding = Embedding(input_dim=vocabulary_size,
                              output_dim=embedding_dim,
                              mask_zero=True)
        encoder = AveragedBOWEncoder(dimension_to_average, num_dimensions)
        embedded_input = embedding(input_layer)
        encoded_input = encoder(embedded_input)
        encoder_mask = OutputMask()(encoded_input)
        model = Model(inputs=input_layer,
                      outputs=[encoded_input, encoder_mask])
        test_input = numpy.asarray([[0, 3, 1, 7, 10]], dtype='int32')
        embedding_weights = embedding.get_weights()[
            0]  # get_weights returns a list with one element.

        # Here, the dimension we are reducing is the embedding dimension. In this case,
        # the actual value of the returned output should be equal to averaging without masking,
        # (as there is nothing to mask in a dimension not covered by the mask) but the mask should
        # be propagated through the layer, still masking the correct index.
        expected_output = numpy.mean(embedding_weights[test_input],
                                     axis=dimension_to_average)
        actual_output, actual_mask = model.predict(test_input)
        # First index should still be masked.
        numpy.testing.assert_array_equal(
            actual_mask, numpy.array([[False, True, True, True, True]]))
        numpy.testing.assert_array_almost_equal(expected_output, actual_output)
Example 7
def run_training(num_classes, X, y):
    """
    Perform the training run
    Args:
      num_classes - number of classes for the labels
      X - ground truth data
      y - ground truth labels
    """
    inputs = Input((window_size * 2, ))

    ##TODO##: Complete embedding_layer code
    embedding_layer = Embedding(num_classes,
                                embedding_size,
                                input_length=2 * window_size,
                                name='embedding_layer')

    ##TODO##: Complete mean_layer code
    mean_layer = Lambda(lambda x: K.mean(x, axis=1))

    ##TODO##: Complete output layer code
    output_layer = Dense(num_classes, activation='softmax')

    output = embedding_layer(inputs)
    output = mean_layer(output)
    output = output_layer(output)

    model = Model(inputs=[inputs], outputs=output)
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(lr=0.1, rho=0.99),
                  metrics=['accuracy'])

    model.fit(X, y, batch_size=16, epochs=170, validation_split=0.1, verbose=2)

    return embedding_layer.get_weights()[0]
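A hypothetical sketch (build_cbow_data is made up) of how the X and y arguments consumed by run_training above could be built from integer-encoded sentences, using the same global window_size:

import numpy as np
from tensorflow.keras.utils import to_categorical

def build_cbow_data(sentences, window_size, num_classes):
    # sentences: list of lists of integer word ids
    X, y = [], []
    for sent in sentences:
        for i in range(window_size, len(sent) - window_size):
            context = sent[i - window_size:i] + sent[i + 1:i + window_size + 1]
            X.append(context)
            y.append(sent[i])
    return np.array(X), to_categorical(y, num_classes=num_classes)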
Example 8
class _OptimizerProbabilities(_OptimizerParametrical):
    steps = 200
    def __init__(self, **kwargs):
        super(_OptimizerProbabilities, self).__init__(**kwargs)

    def _prep(self):
        super(_OptimizerProbabilities, self)._prep()
        
        self.argmax = Argmax(name = "argmax")
        
        tmp = Dense(units = self.configs["emb"]["output_dim"], activation = "linear", use_bias = False)
        self.embedding = TimeDistributed(tmp, name = "emb_exp", trainable = False, weights = self.weights["emb"])

        self.logits = Embedding(input_dim = self.length, output_dim = len(self.dictionary), name = "logits")
    
    def _store_weights(self):
        with open(self.weightpath, "wb") as handle:
            _pickle.dump(self.logits.get_weights()[0], handle)

    def _get_output(self, probabilities):
        encoded = self.encoder(self.embedding(probabilities))
        if self.with_projection:
            encoded = self.projection(encoded)
        return self.selector(encoded)
            
    def _build(self):
    
        logits = self.logits(self.dummy_input)

        selection_soft = self._get_output(self._get_probabilities(logits))
        
        self.outputs.append(NameLayer("s")(selection_soft))
        self.outputs.append(NameLayer("L")(selection_soft))
        self.outputs.append(NameLayer("o_logits")(logits))
        
        if self.calc_hard:
            argmax = self.argmax(logits)
            self.outputs.append(NameLayer("o_argmax")(argmax))

            selection_hard = self._get_output(argmax)
            self.outputs.append(NameLayer("h")(selection_hard))
    
    def _get_best_ngram(self):
        return self.logits.get_weights()[0].argmax(axis = -1)
Example 9
    def build(self):
        question, answer = self._get_inputs()

        # add embedding layers
        embedding = Embedding(self.config['n_words'], self.model_params.get('n_embed_dims', 141))
        question_embedding = embedding(question)

        # the answer reuses the same embedding layer as the question, so the weights are shared
        answer_embedding = embedding(answer)

        # dropout
        dropout = Dropout(0.5)
        question_dropout = dropout(question_embedding)
        answer_dropout = dropout(answer_embedding)

        # rnn
        forward_lstm = LSTM(self.config.get('n_lstm_dims', 141), consume_less='mem', return_sequences=True)
        backward_lstm = LSTM(self.config.get('n_lstm_dims', 141), consume_less='mem', return_sequences=True)
        question_lstm = merge([forward_lstm(question_dropout), backward_lstm(question_dropout)], mode='concat', concat_axis=-1)

        # dropout
        question_dropout = dropout(question_lstm)

        # maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
        question_pool = maxpool(question_dropout)

        # activation
        activation = Activation('tanh')
        question_output = activation(question_pool)

        question_model = Model(input=[question], output=[question_output])

        # attentional rnn
        forward_lstm = AttentionLSTM(self.config.get('n_lstm_dims', 141), question_output, consume_less='mem', return_sequences=True)
        backward_lstm = AttentionLSTM(self.config.get('n_lstm_dims', 141), question_output, consume_less='mem', return_sequences=True)
        answer_lstm = merge([forward_lstm(answer_dropout), backward_lstm(answer_dropout)], mode='concat', concat_axis=-1)

        # dropout
        answer_dropout = dropout(answer_lstm)

        # maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
        answer_pool = maxpool(answer_dropout)

        # activation
        activation = Activation('tanh')
        answer_output = activation(answer_pool)

        answer_model = Model(input=[question, answer], output=[answer_output])

        return question_model, answer_model
Example 10
def keras_multiclass(trainlist, weight1, weight2):
    N, d = weight1.shape
    Nc, d = weight2.shape
    shared_layer1 = Embedding(input_dim=N, output_dim=d, weights=[weight1])
    shared_layer2 = Embedding(input_dim=Nc, output_dim=d, weights=[weight2])
    input_target = Input(shape=(1, ), dtype='int32', name='input_target')
    input_negative = Input(shape=(Nc, ), dtype='int32', name='input_beta')
    target = shared_layer1(input_target)
    beta = shared_layer2(input_negative)
    score_dot = dot([target, beta], axes=(2), normalize=False)
    softmax_out = Activation('softmax')(score_dot)

    model = Model(inputs=[input_target, input_negative], outputs=[softmax_out])
    sgd = optimizers.SGD(lr=0.025, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=sgd)

    for [a1, a2, y1] in trainlist:
        loss2 = model.train_on_batch([a1, a2], y1)
    embed_emb = shared_layer1.get_weights()[0]
    embed_beta = shared_layer2.get_weights()[0]
    return embed_emb, embed_beta
Example 11
def keras_sg_embedding(trainlist,weight1,weight2):
    #weight1, weight2 are Nxd numpy matrix 
    """The initial weights are weight1(output weight), weight2(hidden weight)
    the train input will update the weights by gradient descent"""
    N,d=weight1.shape
    negative_num=trainlist[0][2].shape[1]
    emb_target = Embedding(input_dim=N, output_dim=d, name='emb_target', weights=[weight1])
    #emb_target is the output layer
    emb_source = Embedding(input_dim=N, output_dim=d, name='emb_source', weights=[weight2])
    #emb_source is the hidden layer
    input_target = Input(shape=(1,), dtype='int32', name='input_target')
    input_source = Input(shape=(1,), dtype='int32', name='input_source')
    input_negative = Input(shape=(negative_num,),dtype='int32',name='input_negative')
    target = emb_target(input_target)
    source = emb_source(input_source)
    negative = emb_target(input_negative)
    positive_dot = dot([source, target], axes=(2), normalize=False)
    negative_dot = dot([source, negative], axes=(2), normalize=False)
    all_dot = concatenate([positive_dot, negative_dot],axis=2)
    sigmoid_sample = Activation('softmax')(all_dot)
    
    model = Model(inputs=[input_target,input_source,input_negative], outputs=[sigmoid_sample])
    model.summary()
    sgd = optimizers.SGD(lr=0.025, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd)
    
    ind = 0
    batch_size = len(trainlist)/20
    for [a1,a2,a4,y1] in trainlist:
        loss = model.train_on_batch([a1, a2, a4], y1)
#        print("Epoch %2d batch %4d: loss = %.4f" % (ind/batch_size + 1, ind%batch_size + 1, loss))
        ind += 1
    emb_target1 = emb_target.get_weights()[0]
    emb_source1 = emb_source.get_weights()[0]
    
    return emb_target1, emb_source1
Example 12
def Keras_skip_gram(G, walks, iteration):
    """
    Keras to run word2vec algorithm with skip_gram model.
    """

    walks_sentences = [list(np.array(walk)) for walk in walks]

    embedding1 = np.random.uniform(-1 / G.embedding_size, 1 / G.embedding_size,
                                   (G.vocabulary, G.embedding_size))
    embedding2 = np.random.uniform(-1 / G.embedding_size, 1 / G.embedding_size,
                                   (G.vocabulary, G.embedding_size))
    shared_layer1 = Embedding(input_dim=G.vocabulary,
                              output_dim=G.embedding_size,
                              weights=[embedding1])
    shared_layer2 = Embedding(input_dim=G.vocabulary,
                              output_dim=G.embedding_size,
                              weights=[embedding2])

    input_target = Input(shape=(1, ), dtype='int32', name='input_1')
    input_source = Input(shape=(1, ), dtype='int32', name='input_2')
    input_negative = Input(shape=(G.negative, ), dtype='int32', name='input_3')

    target = shared_layer1(input_target)
    source = shared_layer2(input_source)
    negative = shared_layer1(input_negative)

    positive_dot = dot([source, target], axes=(2), normalize=False)
    negative_dot = dot([source, negative], axes=(2), normalize=False)

    all_dot = concatenate([positive_dot, negative_dot], axis=2)
    sigmoid_sample = Activation('sigmoid')(all_dot)

    model = Model(inputs=[input_target, input_source, input_negative],
                  outputs=[sigmoid_sample])
    sgd2 = optimizers.SGD(lr=0.025, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd2)

    train_list = skip_train(walks_sentences, G.window_size)

    for i in range(iteration):
        for [a1, a2, a4, y1] in train_list:
            loss = model.train_on_batch([a1, a2, a4], y1)
    embed = shared_layer2.get_weights()[0]

    return embed
Example 13
 def test_on_masked_input(self):
     sentence_length = 5
     embedding_size = 10
     vocabulary_size = 15
     input_layer = Input(shape=(sentence_length,), dtype='int32')
     # Embedding masks zeros
     embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_size, mask_zero=True)
     encoder = BOWEncoder()
     embedded_input = embedding(input_layer)
     encoded_input = encoder(embedded_input)
     model = Model(input=input_layer, output=encoded_input)
     model.compile(loss="mse", optimizer="sgd")  # Will not train this model
     test_input = numpy.asarray([[0, 3, 1, 7, 10]], dtype='int32')
     embedding_weights = embedding.get_weights()[0]  # get_weights returns a list with one element.
     # Omitting the first element (0), because that is supposed to be masked in the model.
     expected_output = numpy.mean(embedding_weights[test_input[:, 1:]], axis=1)
     actual_output = model.predict(test_input)
     # The following comparison is accurate to six decimal places.
     numpy.testing.assert_array_almost_equal(expected_output, actual_output)
Example 14
 def test_on_unmasked_input(self):
     sentence_length = 5
     embedding_dim = 10
     vocabulary_size = 15
     input_layer = Input(shape=(sentence_length, ), dtype='int32')
     # Embedding does not mask zeros
     embedding = Embedding(input_dim=vocabulary_size,
                           output_dim=embedding_dim)
     encoder = BOWEncoder()
     embedded_input = embedding(input_layer)
     encoded_input = encoder(embedded_input)
     model = Model(inputs=input_layer, outputs=encoded_input)
     model.compile(loss="mse", optimizer="sgd")  # Will not train this model
     test_input = numpy.asarray([[0, 3, 1, 7, 10]], dtype='int32')
     embedding_weights = embedding.get_weights()[
         0]  # get_weights returns a list with one element.
     expected_output = numpy.mean(embedding_weights[test_input], axis=1)
     actual_output = model.predict(test_input)
     numpy.testing.assert_array_almost_equal(expected_output, actual_output)
Example 15
 def test_on_unmasked_input(self):
     sentence_length = 5
     embedding_size = 10
     vocabulary_size = 15
     input_layer = Input(shape=(sentence_length,), dtype='int32')
     # Embedding does not mask zeros
     embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_size)
     encoder = BOWEncoder()
     embedded_input = embedding(input_layer)
     encoded_input = encoder(embedded_input)
     model = Model(input=input_layer, output=encoded_input)
     model.compile(loss="mse", optimizer="sgd")  # Will not train this model
     test_input = numpy.asarray([[0, 3, 1, 7, 10]], dtype='int32')
     embedding_weights = embedding.get_weights()[0]  # get_weights returns a list with one element.
     expected_output = numpy.mean(embedding_weights[test_input], axis=1)
     actual_output = model.predict(test_input)
     # Exact comparison of the two arrays may fail because of floating-point rounding
     # (Theano operations carry an epsilon); comparing to six decimal places is good enough.
     numpy.testing.assert_array_almost_equal(expected_output, actual_output)
Example 16
def test_weight_initialization():
    # Three ways to initialize the weights; each copies the original values.
    weights = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]

    embedding_info1 = {
        "input_dim": 4,
        "output_dim": 3,
        "input_length": 10,
        "trainable": True
    }

    def weight_init(shape, dtype=None):
        return weights

    inputs = Input(shape=(4, ))
    embedding_layer = Embedding(weights=[np.array(weights)], **embedding_info1)
    # embedding_layer = Embedding(embeddings_initializer=weight_init, **embedding_info1)
    # embedding_layer = Embedding(embeddings_initializer=Constant(weights), **embedding_info1)
    res = embedding_layer(inputs)
    model = Model(inputs=inputs, outputs=res)
    model.compile(optimizer="adam", loss="mse")

    print(embedding_layer.get_weights())
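A short follow-up check, assuming the weights and embedding_info1 defined above, that the explicit weights= route and the Constant initializer route end up with the same matrix once the layer is built:

import numpy as np
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

w = np.array(weights, dtype='float32')
dummy = np.zeros((1, 10), dtype='int32')          # any valid indices; calling the layer builds it
for layer in (Embedding(weights=[w], **embedding_info1),
              Embedding(embeddings_initializer=Constant(w), **embedding_info1)):
    layer(dummy)                                  # weights only exist after the layer is built
    assert np.allclose(layer.get_weights()[0], w)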
Example 17
class EmbeddingTrainer(object):
    """
    A class to train word2vec embeddings in Keras using the skip-gram approach with negative sampling.
    """
    def __init__(self, vocab_size, embedding_size, window_size=3):
        """
        Constructor method for embedding trainer.

        :param vocab_size: int; the number of words in the vocabulary
        :param embedding_size: int; the size of embeddings to train
        :param window_size: int; size of the skip-gram context window
        """
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.window_size = window_size
        self.model = None
        self.embeddings = None

    def get_skips(self, docs):
        """
        Formats the data and generates negative samples.

        :param docs: list; a list of documents; each document is a list of sentences;
        a sentence is a list of integer word indices (as expected by keras' skipgrams)
        :return: tuple; contains the center and context words, and the corresponding labels
        """
        sampling_table = make_sampling_table(self.vocab_size)
        center_words, context_words, labels = [], [], []
        for doc in docs:
            tokens = [token for sent in doc for token in sent]
            pairs, labels_ = skipgrams(tokens,
                                       self.vocab_size,
                                       window_size=self.window_size,
                                       sampling_table=sampling_table)
            try:
                center, context = zip(*pairs)
            except ValueError:
                continue
            center_words += center
            context_words += context
            labels += labels_

        return center_words, context_words, labels

    def w2v_model(self, learning_rate):
        """
        Generates the neural architecture for the word2vec skip-gram model

        :return: keras.models.Model(); the word2vec model
        """

        # Add the input and embedding layers
        input_center = Input((1, ))
        input_context = Input((1, ))
        self.embeddings = Embedding(self.vocab_size,
                                    self.embedding_size,
                                    input_length=1,
                                    name="Embeddings")

        # Get the center and context embeddings
        center = self.embeddings(input_center)
        center = Reshape((self.embedding_size, 1))(center)
        context = self.embeddings(input_context)
        context = Reshape((self.embedding_size, 1))(context)

        # Calculate the linear activations
        # dot_product = Concatenate([center, context], mode="dot", dot_axes=1)
        dot_product = dot([center, context], axes=1, normalize=False)
        dot_product = Reshape((1, ))(dot_product)

        # Sigmoid activations
        output = Dense(1, activation="sigmoid")(dot_product)

        # Define the model
        model = Model(input=[input_center, input_context], output=output)
        optimizer = RMSprop(lr=learning_rate, rho=0.9, epsilon=None, decay=0.0)
        model.compile(loss="binary_crossentropy", optimizer=optimizer)

        return model

    def train(self, docs, num_batches=2000, learning_rate=0.001, verbose=True):
        """
        Optimizes the model on the training data

        :param docs: list; a sequence of documents; each document is a list of sentences;
        a sentence is a list of tokens (strings)
        :param num_batches: int; the number of (center, context) pairs to use in training
        :param verbose: Boolean; if true, prints the loss during training
        """

        # Get the data and the model
        center_words, context_words, labels = self.get_skips(docs)
        self.model = self.w2v_model(learning_rate)

        # Randomly sample pair/label
        loss = []
        for batch in range(num_batches):

            idx = np.random.randint(0, len(center_words) - 1)
            center_word = np.array([center_words[idx]])
            context_word = np.array([context_words[idx]])
            label = np.array([labels[idx]])
            loss += [
                self.model.train_on_batch([center_word, context_word], label)
            ]

            # Print the loss every 1000 batches
            if len(loss) >= 1000 and verbose:
                print(batch, sum(loss) / 1000)
                loss = []

    def get_embedding_array(self):
        """
        Gets the word embeddings

        :return: array; the trained word embeddings
        """
        return self.embeddings.get_weights()[0]
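A hypothetical usage sketch for the trainer above; docs is a made-up corpus laid out as documents -> sentences -> integer word indices, which is what get_skips feeds to keras' skipgrams.

docs = [
    [[1, 5, 2, 9, 3], [3, 4, 7, 1]],   # document 1: two sentences of word ids
    [[2, 2, 8, 6, 1, 5]],              # document 2: one sentence
]
trainer = EmbeddingTrainer(vocab_size=10, embedding_size=16, window_size=2)
trainer.train(docs, num_batches=500, learning_rate=0.001, verbose=False)
embeddings = trainer.get_embedding_array()       # shape (10, 16)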
Example 18
class WordCNNModel(KerasModel):
    def __init__(self,
                 dataset,
                 filter_sizes,
                 num_filters_per_size,
                 layers=[],
                 conv_activation='linear',
                 layer_activation='relu',
                 trainable_embeddings=False):
        super().__init__(dataset)

        assert len(filter_sizes) > 0
        assert num_filters_per_size > 0
        assert len(dataset.get_input_shape()) == 1
        assert len(dataset.get_output_shape()) == 1

        self._dataset = dataset
        self._filter_sizes = filter_sizes
        self._num_filters_per_size = num_filters_per_size
        self._conv_activation = conv_activation
        self._layer_activation = layer_activation

        inputs = Input(shape=dataset.get_input_shape())
        self._emb_layer = Embedding(
            input_dim=dataset.word_embedding_model.get_vocab_size(),
            output_dim=dataset.word_embedding_model.get_embeddings_size(),
            weights=[dataset.word_embedding_model.get_embeddings()],
            trainable=trainable_embeddings)
        emb = self._emb_layer(inputs)

        filters = []
        self._conv = {}
        for filter_size in filter_sizes:
            layer = Conv1D(num_filters_per_size,
                           filter_size,
                           padding='valid',
                           activation=conv_activation)
            self._conv[filter_size] = layer
            layer = OneMaxPooling1D(axis=1, keepdims=False)(layer(emb))
            filters.append(layer)

        if len(filters) >= 2:
            result = concatenate(filters, axis=1)
        else:
            result = filters[0]

        self._layers = list()

        for l in layers:
            layer = Dense(l, activation=layer_activation)
            self._layers.append(layer)
            result = layer(result)

        self._output_layer = Dense(dataset.get_output_shape()[0],
                                   activation='sigmoid' if dataset.multilabel
                                   or dataset.binary else 'softmax')
        output = self._output_layer(result)

        self._model = Model(inputs=inputs, outputs=output)

    def export(self, fn, label_names):
        with gzip.GzipFile(fn, "w") as f:
            json_str = json.dumps(
                {
                    'w2v': {
                        'vocab':
                        self._dataset.word_embedding_model.get_vocab(),
                        'emb': self._emb_layer.get_weights()[0].tolist(),
                        'sentence_length': self._dataset.X.shape[1]
                    },
                    'label_names':
                    self._dataset.label_names.tolist(),
                    'layers': [{
                        'W': l.get_weights()[0].tolist(),
                        'b': l.get_weights()[1].tolist()
                    } for l in self._layers],
                    'filters_W': {
                        filter_size: f.get_weights()[0].tolist()
                        for (filter_size, f) in self._conv.items()
                    },
                    'filters_b': {
                        filter_size: f.get_weights()[1].tolist()
                        for (filter_size, f) in self._conv.items()
                    },
                    'filter_sizes':
                    self._filter_sizes,
                    'num_filters_per_size':
                    self._num_filters_per_size,
                    'conv_activation':
                    self._conv_activation,
                    'layer_activation':
                    self._layer_activation,
                    'output_layer': {
                        'W': self._output_layer.get_weights()[0].tolist(),
                        'b': self._output_layer.get_weights()[1].tolist()
                    },
                },
                indent=2)
            f.write(json_str.encode('utf-8'))
Example 19
# In[372]:

len(labels)

# In[340]:

loss

# In[341]:

model.summary()

# In[342]:

weights = embedding.get_weights()

# In[374]:

weights[0].shape  # the embedding matrix starts at index 0, hence 3600 rows

# In[365]:

weights[0][1:].shape

# In[358]:

id2word

# In[425]:
Example 20
w1 = embedding(input_w1)
w1 = Reshape((embedding_size, 1))(w1)

w2 = embedding(input_w2)
w2 = Reshape((embedding_size, 1))(w2)

w3 = embedding(input_w3)
w3 = Reshape((embedding_size, 1))(w3)

context_docid = concatenate([w1, w2, w3, docid])
context_docid = Conv1D(32, 4, padding="same")(context_docid)
context_docid = Flatten()(context_docid)

output = Dense(2, activation='softmax')(context_docid)
model = Model(input=[input_w1, input_w2, input_w3, input_docid], output=output)
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.summary()

model.fit_generator(batch_generator(contexts, targets, batch_size),
                    steps_per_epoch=batch_size,
                    epochs=epochs)

save_embedding(embedding_filename + '.txt',
               embedding.get_weights()[0], total_docs)
tsne_plot(embedding.get_weights()[0],
          total_docs,
          labels,
          figure_name=figure_filename,
          max_docs=1000)
Example 21
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit_generator(generateData(batch_size=batch_size),
                    steps_per_epoch=len(decoder_input_data) // batch_size,
                    epochs=3)

# model.fit([encoder_input_data, decoder_input_data], decoder_target_one_hot,
#           batch_size=32,
#           epochs=10,
#           validation_split=0.2)

network_config = {
    'vocab_size': len(vocab),
    'thought_vector_size': THOUGHT_VECTOR_SIZE,
    'sequence_length': encoder_input_data.shape[1],
    'weights': {
        'encoder_embedding': encoder_embedding_layer.get_weights(),
        'encoder_gru': encoder_gru_layer.get_weights(),
        'decoder_embedding': decoder_embedding_layer.get_weights(),
        'decoder_gru': decoder_gru_layer.get_weights(),
        'decoder_dense': decoder_dense_layer.get_weights()
    }
}

with open('network_config.pickle', 'wb') as file:
    pickle.dump(network_config, file)

print(
    'saved network config to "{}". Vocab size: {}. Thought vector size: {}. Sequence length: {}.'
    .format('network_config.pickle', network_config['vocab_size'],
            network_config['thought_vector_size'],
            network_config['sequence_length']))
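A hedged sketch of loading the pickled config back and restoring one of the weight lists with set_weights; the embedding dimensions are taken from the stored arrays themselves, so only the dictionary keys above are assumed.

import pickle
from tensorflow.keras.layers import Embedding

with open('network_config.pickle', 'rb') as f:
    cfg = pickle.load(f)

emb_weights = cfg['weights']['encoder_embedding']      # list with one (vocab, dim) array
embedding = Embedding(*emb_weights[0].shape)           # (input_dim, output_dim) from the stored matrix
embedding.build((None, cfg['sequence_length']))        # create the variables before set_weights
embedding.set_weights(emb_weights)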
Example 22
    def __init__(self,
                 num_feats,
                 data,
                 train=False,
                 load_original=False,
                 masking=True):
        if data == 'imdbcnn':
            num_words = 20002
            maxlen = 400
            embedding_dims = 50
            hidden_dims = 250
            weights_name = "original.h5"
            emb_name = 'embedding_1'
            batch_size = 40
            self.num_classes = 2
            num_epoch = 5
        elif data == 'yahoolstm':
            num_words = 20001
            maxlen = 400
            embedding_dims = 300
            hidden_dims = 250
            weights_name = "original-0-7.hdf5"
            emb_name = 'embedding'
            self.num_classes = 10
            batch_size = 1000
            num_epoch = 1

        Mean = Lambda(lambda x: K.sum(x, axis=1) / float(num_feats),
                      output_shape=lambda x: [x[0], x[2]])

        X_ph = Input(shape=(maxlen, ), dtype='int32')

        logits_T = construct_gumbel_selector(X_ph,
                                             num_words,
                                             embedding_dims,
                                             hidden_dims,
                                             maxlen,
                                             1,
                                             network_type='cnn')
        tau = 0.5
        sc_layer = Sample_Concrete(tau, num_feats, maxlen, masking)
        T = sc_layer(logits_T)
        if train:
            if not load_original:
                filters = 250
                kernel_size = 3
                print('transfer constructed')
                emb_layer = Embedding(num_words,
                                      embedding_dims,
                                      input_length=maxlen,
                                      trainable=False)
                emb2 = emb_layer(X_ph)
                selected_emb = Multiply()([emb2, T])
                net = Dropout(0.2, trainable=False)(selected_emb)
                net = Conv1D(filters,
                             kernel_size,
                             padding='valid',
                             activation='relu',
                             strides=1,
                             trainable=False)(net)
                net = Dense(hidden_dims, trainable=False)(net)
                net = GlobalMaxPooling1D()(net)
                net = Dense(hidden_dims, trainable=False)(net)
                net = Dropout(0.2, trainable=False)(net)
                net = Activation('relu', trainable=False)(net)
                net = Dense(self.num_classes, trainable=False)(net)
                preds = Activation('softmax', trainable=False)(net)
                model = Model(inputs=X_ph, outputs=preds)
            else:
                print('original constructed')
                emb_layer = Embedding(num_words,
                                      embedding_dims,
                                      input_length=maxlen,
                                      trainable=False)
                emb2 = emb_layer(X_ph)
                selected_emb = Multiply()([emb2, T])
                preds = construct_original_network(selected_emb,
                                                   data,
                                                   trainable=False)
                model = Model(inputs=X_ph, outputs=preds)

            model.compile(
                loss=negative_xentropy,
                optimizer='RMSprop',  #optimizer,
                metrics=['acc'])

            if load_original:
                print('Loading original models...')
                model.load_weights('{}/models/{}'.format(data, weights_name),
                                   by_name=True)
            else:
                model.load_weights('{}/models/transfer.hdf5'.format(data),
                                   by_name=True)

            if data == 'imdbcnn':
                emb_weights = emb_layer.get_weights()
                emb_weights[0][0] = np.zeros(50)
                emb_layer.set_weights(emb_weights)

            from load_data import Data
            dataset = Data(data, True)

            label_train = np.argmax(dataset.pred_train, axis=1)
            label_val = np.argmax(dataset.pred_val, axis=1)
            label_val = np.eye(self.num_classes)[label_val]
            label_train = np.argmax(dataset.pred_train, axis=1)

            filepath = "{}/models/L2X-{}-{}-mask.hdf5".format(
                data, num_feats, 'original' if load_original else 'transfer')

            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='min')

            callbacks_list = [checkpoint]

            model.fit(dataset.x_train,
                      label_train,
                      validation_data=(dataset.x_val, label_val),
                      callbacks=callbacks_list,
                      epochs=num_epoch,
                      batch_size=batch_size)

        else:
            pred_model = Model(X_ph, logits_T)
            pred_model.compile(loss=negative_xentropy,
                               optimizer='RMSprop',
                               metrics=['acc'])
            weights_name = "{}/models/L2X-{}-{}-mask.hdf5".format(
                data, num_feats, 'original' if load_original else 'transfer')
            pred_model.load_weights(weights_name, by_name=True)
            self.pred_model = pred_model
Example 23
	def __init__(self, data, train = False):
		self.data = data
		if data in ['imdbcnn']:

			filters = 250 
			hidden_dims = 250
			self.embedding_dims = 50
			self.maxlen = 400
			self.num_classes = 2
			self.num_words = 20002
			self.type = 'word'
			if not train:
				K.set_learning_phase(0)

			X_ph = Input(shape=(self.maxlen,), dtype='int32')
			emb_layer = Embedding(self.num_words, self.embedding_dims,
				input_length=self.maxlen, name = 'embedding_1')
			emb_out = emb_layer(X_ph) 

			if train:
				preds = construct_original_network(emb_out, data)	

			else: 
				emb_ph = Input(shape=(self.maxlen,self.embedding_dims), dtype='float32')   

				preds = construct_original_network(emb_ph, data) 


			if not train:
				model1 = Model(X_ph, emb_out)
				model2 = Model(emb_ph, preds) 
				pred_out = model2(model1(X_ph))  
				pred_model = Model(X_ph, pred_out) 
				pred_model.compile(loss='categorical_crossentropy',
							  optimizer='adam',
							  metrics=['accuracy']) 
				self.pred_model = pred_model 
				grads = []
				for c in range(self.num_classes):
					grads.append(tf.gradients(preds[:,c], emb_ph))

				grads = tf.concat(grads, axis = 0)  
				# [num_classes, batchsize, maxlen, embedding_dims]

				approxs = grads * tf.expand_dims(emb_ph, 0) 
				# [num_classes, batchsize, maxlen, embedding_dims]
				self.sess = K.get_session()  
				self.grads = grads 
				self.approxs = approxs
				self.input_ph = X_ph
				self.emb_out = emb_out
				self.emb_ph = emb_ph
				weights_name = 'original.h5'#[i for i in os.listdir('imdblstm/models/') if i.startswith('original')][0]
				model1.load_weights('{}/models/{}'.format(data, weights_name), 
					by_name=True)
				model2.load_weights('{}/models/{}'.format(data, weights_name), 
					by_name=True)  
				print('Model constructed.')
				# For validating the data. 
				emb_weights = emb_layer.get_weights() 
				emb_weights[0][0] = np.zeros(50)
				emb_layer.set_weights(emb_weights)
			else:
				pred_model = Model(X_ph, preds)
				
				pred_model.compile(loss='categorical_crossentropy',
							  optimizer='adam',
							  metrics=['accuracy']) 
				self.pred_model = pred_model
				from load_data import Data
				dataset = Data(self.data)
				self.train(dataset) 
				print('Training is done.') 
Example 24
model = Sequential()
embedding = Embedding(vocab_size,
                      embedding_size,
                      input_length=max_len,
                      weights=[embedding_matrix])
model.add(embedding)
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(100, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.fit(data,
          labels,
          epochs=100,
          verbose=1,
          batch_size=32,
          shuffle=True,
          validation_data=(test_data, test_labels))

save_embedding('glove-embedding_labeled.txt',
               embedding.get_weights()[0], vocab)
tsne_plot(embedding,
          vocab,
          figure_name='glove-embedding_labeled',
          max_words=200,
          pos=['ADJ', 'VERB', 'NOUN'])
Example 25
class Doubler:
    def __init__(self, embedding_node_dim: int, embedding_doc_dim: int,
                 bow_feature: dict, vocabulary: set):
        self._name = 'doubler_'
        self._batch_negative = None
        self._node_embedding = None
        self._relation_embedding_node = None
        self._relation_embedding_doc = None
        self._doc_embedding = None
        self._graph_properties = None
        self._embedding_node_dim = embedding_node_dim
        self._embedding_doc_dim = embedding_doc_dim
        self._bow_feature = {
            str(key): value
            for key, value in bow_feature.items()
        }
        self._vocabulary = vocabulary

    def build_model(self, model_params: namedtuple,
                    graph_properties: namedtuple,
                    input_layer_positive: keras.layers.Input,
                    input_layer_negative: keras.layers.Input,
                    input_layer_relation: keras.layers.Input):

        self._graph_properties = graph_properties

        # Input Documents
        input_layer_doc_positive = Input(shape=(len(self._vocabulary), ),
                                         name='input_document_positive')
        input_layer_doc_negative = Input(shape=(
            model_params.num_negative + 1,
            len(self._vocabulary),
        ),
                                         name='input_document_negative')

        # node, relation, and document embeddings
        self._node_embedding = Embedding(graph_properties.num_vertices,
                                         self._embedding_node_dim,
                                         name=self._name + 'node_embedding')
        self._doc_embedding = Dense(self._embedding_doc_dim,
                                    name=self._name + 'document_embedding')
        self._relation_embedding_node = Embedding(
            graph_properties.num_relations,
            self._embedding_node_dim,
            name=self._name + 'relation_embedding')

        if self._embedding_node_dim == self._embedding_doc_dim:
            self._relation_embedding_doc = self._relation_embedding_node
        else:
            self._relation_embedding_doc = Embedding(
                graph_properties.num_relations,
                self._embedding_doc_dim,
                name=self._name + 'relation_embedding_2')

        # connect node, relation, and doc input and corresponding embedding layer
        embedding_node_layer_positive = self._node_embedding(
            input_layer_positive)
        embedding_node_layer_negative = self._node_embedding(
            input_layer_negative)
        embedding_doc_layer_positive = self._doc_embedding(
            input_layer_doc_positive)
        embedding_doc_layer_negative = self._doc_embedding(
            input_layer_doc_negative)
        embedding_layer_relation_node = self._relation_embedding_node(
            input_layer_relation)

        # we have to create a separate relation embedding layer for the documents in case that the
        # target dimension of the node and document embeddings differs
        embedding_layer_relation_doc = embedding_layer_relation_node if self._embedding_node_dim == self._embedding_doc_dim \
            else self._relation_embedding_doc(input_layer_relation)

        # compute the center of the negative node embeddings
        mean_layer_node = custom_layers.Mean(name=self._name +
                                             'avg_node_layer_negative')
        avg_node_layer_negative = mean_layer_node(
            embedding_node_layer_negative)

        # compute the distance between the positive node and the avg. negative node embedding
        shape = embedding_node_layer_positive.get_shape().as_list()
        tmp = Reshape(((shape[1] * shape[2]), ))(
            embedding_node_layer_positive)  # shape[1] is always 1
        diff_layer_node = custom_layers.L2Diff(name=self._name +
                                               'node_L2_diff')
        node_dis = diff_layer_node([avg_node_layer_negative, tmp])

        # compute the center of the negative doc embeddings
        mean_layer_doc = custom_layers.Mean(name=self._name +
                                            'avg_doc_layer_negative')
        avg_doc_layer_negative = mean_layer_doc(embedding_doc_layer_negative)

        # compute the distance between the positive doc and the avg. negative doc embedding
        diff_layer_doc = custom_layers.L2Diff(name=self._name + 'doc_L2_diff')
        doc_dis = diff_layer_doc(
            [avg_doc_layer_negative, embedding_doc_layer_positive])

        # compute L2_Offset
        l2_offset_layer = custom_layers.L2Off(name=self._name +
                                              'L2_offset')([node_dis, doc_dis])

        # create node score layer
        embedding_layer_node_joint = Multiply(
            name=self._name + 'embedding_layer_node_joint')(
                [embedding_layer_relation_node, embedding_node_layer_positive])
        output_layer_node_score = Dot(axes=2, name=self._name + 'node_score')(
            [embedding_layer_node_joint, embedding_node_layer_negative])
        output_layer_node_score = Reshape(
            (model_params.num_negative + 1, ))(output_layer_node_score)

        # create doc score layer
        embedding_layer_doc_joint = Multiply(name=self._name +
                                             'embedding_layer_doc_joint')([
                                                 embedding_layer_relation_doc,
                                                 embedding_doc_layer_positive
                                             ])
        output_layer_doc_score = Dot(axes=2,
                                     name=self._name + 'document_score')([
                                         embedding_layer_doc_joint,
                                         embedding_doc_layer_negative
                                     ])
        output_layer_doc_score = Reshape(
            (model_params.num_negative + 1, ))(output_layer_doc_score)

        # create final score/predicate layer
        output_layer_score = Add(name=self._name + 'score')(
            [output_layer_node_score, output_layer_doc_score])

        return [input_layer_doc_positive,
                input_layer_doc_negative], output_layer_score, l2_offset_layer

    def predict_node(self, node_idx: int, relation_idx: int):
        vertex_emb_matrix = self._node_embedding.get_weights()[0]
        relation_emb_node_matrix = self._relation_embedding_node.get_weights(
        )[0]
        scores_emb_node = np.dot((vertex_emb_matrix[node_idx] *
                                  relation_emb_node_matrix[relation_idx]),
                                 vertex_emb_matrix.T)

        nodes_candidate = np.zeros(
            (self._graph_properties.num_vertices, len(self._vocabulary)),
            dtype=np.int8)
        for node_idx_candidate, node_candidate in enumerate(
                self._graph_properties.vertices):
            tail_bow = self._bow_feature[
                node_candidate] if node_candidate in self._bow_feature else np.zeros(
                    len(self._vocabulary))
            nodes_candidate[node_idx_candidate, :] = tail_bow

        doc_emb_matrix = np.dot(nodes_candidate,
                                self._doc_embedding.get_weights()
                                [0]) + self._doc_embedding.get_weights()[1]
        relation_emb_doc_matrix = self._relation_embedding_doc.get_weights()[0]
        scores_emb_doc = np.dot(
            (doc_emb_matrix[node_idx] * relation_emb_doc_matrix[relation_idx]),
            doc_emb_matrix.T)

        return scores_emb_node, scores_emb_doc

    def predict_nodes(self, nodes_idx: list, relations_idx: list):
        scores_node = list()
        scores_doc = list()

        for idx, node_idx in enumerate(nodes_idx):
            score_node, score_doc = self.predict_node(node_idx,
                                                      relations_idx[idx])
            scores_node.append(score_node)
            scores_doc.append(score_doc)

        return np.array(scores_node), np.array(scores_doc)

    def init_batch_triples(self, model_params: namedtuple, batch_idx: int,
                           num_batches: int, triples_batch: list):
        batch_size = (
            2 * len(triples_batch)
        ) if batch_idx == num_batches - 1 else 2 * model_params.batch_size
        self._batch_negative = np.zeros(
            (batch_size, model_params.num_negative + 1, len(self._vocabulary)),
            dtype=np.int8)

    def generate_training_data(self, nodes_train_idx: list,
                               nodes_train_idx_candidate: list):
        batch_positive = np.zeros(
            (len(nodes_train_idx), len(self._vocabulary)), dtype=np.int8)

        for idx, node_train_idx in enumerate(nodes_train_idx):
            node_train = self._graph_properties.index_vertex[node_train_idx[0]]

            if node_train not in self._bow_feature:
                continue

            batch_positive[idx, ] = self._bow_feature[node_train]

            tmp_array = np.zeros(
                (self._batch_negative.shape[1], self._batch_negative.shape[2]))
            nodes_train_idx_neg = nodes_train_idx_candidate[idx]
            for idx2, node_train_idx_neg in enumerate(nodes_train_idx_neg):
                node_train_neg = self._graph_properties.index_vertex[
                    node_train_idx_neg]

                if node_train_neg not in self._bow_feature:
                    continue

                node_train_doc_neg = self._bow_feature[node_train_neg]
                tmp_array[idx2, ] = node_train_doc_neg

            self._batch_negative[idx, ] = tmp_array

        return {
            'input_document_positive': batch_positive,
            'input_document_negative': self._batch_negative
        }
Example 26
    def __init__(self,
                 data,
                 num_feats,
                 max_words,
                 method,
                 train=False,
                 load_original=False,
                 masking=False):
        self.k = num_feats
        self.maxlen = 400
        self.max_words = max_words
        if data == 'imdbcnn':
            self.num_words = 20002
            embedding_dims = 50
            maxlen = 400
            hidden_dims = 250
            weights_name = "original.h5"
            emb_name = 'embedding_1'
            num_classes = 2
            num_epoch = 5
        elif data in ['yahoolstm']:
            self.num_words = 20001
            embedding_dims = 300
            maxlen = 400
            hidden_dims = 250
            weights_name = "original-0-7.hdf5"
            emb_name = 'embedding'
            num_classes = 10
            num_epoch = 1
        X_ph = Input(shape=(maxlen, ), dtype='int32')
        weights_extractor_ph = Input(shape=(max_words, ), dtype='int32')
        Selected_ph = Input(shape=(maxlen, ), dtype='float32')

        logits_T = construct_gumbel_selector(X_ph,
                                             self.num_words,
                                             embedding_dims,
                                             hidden_dims,
                                             maxlen,
                                             max_words,
                                             network_type='cnn')
        tau = 0.5
        T = Sample_Concrete(tau)(logits_T)
        batch_size = 40

        emb2_layer = Embedding(self.num_words,
                               embedding_dims,
                               input_length=maxlen,
                               name=emb_name,
                               trainable=False)

        embedding_weights = emb2_layer(weights_extractor_ph)

        X_emb = emb2_layer(X_ph)

        Xnew_emb = MakeChange()([X_emb, T, Selected_ph, embedding_weights])

        preds = construct_original_network(Xnew_emb, data, trainable=False)

        if train:
            model = Model(inputs=[X_ph, Selected_ph, weights_extractor_ph],
                          outputs=preds)

            model.compile(loss=negative_xentropy,
                          optimizer='RMSprop',
                          metrics=['acc'])

            if load_original:
                print('Loading original models...')

                model.load_weights('{}/models/{}'.format(data, weights_name),
                                   by_name=True)

                if data == 'imdbcnn':
                    emb_weights = emb2_layer.get_weights()
                    emb_weights[0][0] = np.zeros(50)
                    emb2_layer.set_weights(emb_weights)

            dataset = Data(data, True)

            if method == 'L2X':
                scores_train = np.load(
                    '{}/results/scores-train-{}-{}-original{}-mask{}.npy'.
                    format(data, method, num_feats, load_original, masking))
                scores_val = np.load(
                    '{}/results/scores-val-{}-{}-original{}-mask{}.npy'.format(
                        data, method, num_feats, load_original, masking))
                label_train = np.argmax(dataset.pred_train, axis=1)
                label_train = np.eye(num_classes)[label_train]
                training_x = dataset.x_train
                filepath = "{}/models/gumbel-change-{}-{}-original{}-mask{}.hdf5".format(
                    data, num_feats, max_words, load_original, masking)

            checkpoint = ModelCheckpoint(filepath,
                                         monitor='val_acc',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='min')
            callbacks_list = [checkpoint]
            print("data loaded")

            selected_train_index = np.argsort(
                scores_train, axis=-1)[:,
                                       -self.k:]  # indices of largest k score.
            selected_train = np.zeros(scores_train.shape)
            selected_train[
                np.expand_dims(np.arange(len(scores_train)), axis=-1),
                selected_train_index] = 1.0

            selected_val_index = np.argsort(
                scores_val, axis=-1)[:,
                                     -self.k:]  # indices of largest k score.
            selected_val = np.zeros(scores_val.shape)
            selected_val[np.expand_dims(np.arange(len(scores_val)), axis=-1),
                         selected_val_index] = 1.0
            weights_extractor_value = np.tile(
                [list(range(0, max_words - 1)) + [self.num_words - 1]],
                [len(scores_train), 1])
            weights_extractor_value_val = np.tile(
                [list(range(0, max_words - 1)) + [self.num_words - 1]],
                [len(scores_val), 1])
            label_val = np.argmax(dataset.pred_val, axis=1)
            label_val = np.eye(num_classes)[label_val]

            model.fit([training_x, selected_train, weights_extractor_value],
                      label_train,
                      validation_data=([
                          dataset.x_val, selected_val,
                          weights_extractor_value_val
                      ], label_val),
                      callbacks=callbacks_list,
                      epochs=num_epoch,
                      batch_size=batch_size)
            label_train = np.argmax(dataset.pred_train, axis=1)
            label_val = np.argmax(dataset.pred_val, axis=1)

        else:
            pred_model = Model([X_ph, Selected_ph, weights_extractor_ph], [T])
            pred_model.compile(loss=negative_xentropy,
                               optimizer='RMSprop',
                               metrics=['acc'])

            weights_name = "{}/models/gumbel-change-{}-{}-original{}-mask{}.hdf5".format(
                data, num_feats, max_words, load_original, masking)

            pred_model.load_weights(weights_name, by_name=True)
            self.pred_model = pred_model
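The training branch above turns saliency scores into hard selection masks: np.argsort picks the indices of the k largest scores per row, and advanced indexing writes a 1.0 at those positions. A minimal standalone sketch of that pattern (toy sizes, not part of the original script):

import numpy as np

k = 2
scores = np.array([[0.1, 0.9, 0.3, 0.7],
                   [0.5, 0.2, 0.8, 0.4]])
topk_idx = np.argsort(scores, axis=-1)[:, -k:]  # column indices of the k largest scores per row
mask = np.zeros(scores.shape)
mask[np.expand_dims(np.arange(len(scores)), axis=-1), topk_idx] = 1.0
# mask -> [[0., 1., 0., 1.],
#          [1., 0., 1., 0.]]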
    model)  # activation='linear' (they are the same)
crf = CRF()  # CRF layer { SHOULD I SET -> number_labels+1 (+1 -> PAD) }
out = crf(model)  # output
model = Model(inputs=inpt, outputs=out)

# set optimizer
# decay=learning_rate / epochs
opt = SGD(learning_rate=0.0, momentum=0.9, clipvalue=5.0
          )  # clipvalue (Gradient Clipping): clip the gradient to [-5 to 5]
#opt = SGD(learning_rate=0.05, decay=0.01, momentum=0.9, clipvalue=5.0)  # clipvalue (Gradient Clipping): clip the gradient to [-5 to 5]

# compile Bi-LSTM-CRF
model.compile(optimizer=opt, loss=crf.loss, metrics=[crf.accuracy])
# model.compile(optimizer=opt, loss=crf.loss, metrics=[crf.viterbi_accuracy])

print('BEFORE TRAINING', model.get_weights())

# ======================================================================================================================
# Data Generators
# ======================================================================================================================

# ceil of the scalar x is the smallest integer i such that i >= x
test_steps = np.ceil(test_data_size / batch_size)  # number of validation and testing batches (same size)
print('test_steps', test_steps)

# (number of batches) -1, because batches start from 0
# test_batch_generator = batch_generator(x_test_filename, '', batch_size, test_steps - 1)  # testing batch generator
test_generator = DataGenerator(x_test_filename,
                               '',
Example n. 28
0
        for ix in range(n_commit_split):

            z_p = model.predict(x=[
                anDataValid[idxValidSplit[ix]], anDataConstValid[
                    idxValidSplit[ix]]
            ])
            mse = (z_p * z_p).mean()
            print(f"Valid set MSE = {mse:.4f}")
        """c"""
    """c"""
"""c"""

z_emb = model_emb.predict(x=anDataValid[idxValidSplit[0]])

w = emb_obj.get_weights()[0]

w[0]

anDataValid
anDataConstValid

# Unit model

encoder_inputs = Input(shape=(sentenceLength, ), name="Encoder_input")
target_inputs = Input(shape=(sentenceLength, ), name="target_input")

emb_obj = Embedding(emb.shape[0], emb.shape[1], weights=[emb], trainable=False)

x = emb_obj(encoder_inputs)
x = Flatten()(x)
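The snippet above freezes a pretrained matrix inside an Embedding layer (weights=[emb], trainable=False) and later reads it back with get_weights()[0]. A self-contained sketch of that round trip, assuming a TF2-era tf.keras where the weights= constructor argument is still accepted; emb here is a random stand-in for a pretrained matrix:

import numpy as np
from tensorflow.keras.layers import Embedding, Flatten, Input
from tensorflow.keras.models import Model

emb = np.random.rand(100, 8).astype('float32')   # stand-in for a pretrained (vocab, dim) matrix
encoder_inputs = Input(shape=(5,), dtype='int32')
emb_obj = Embedding(emb.shape[0], emb.shape[1], weights=[emb], trainable=False)
x = Flatten()(emb_obj(encoder_inputs))
model = Model(encoder_inputs, x)
# a frozen layer returns exactly the matrix it was initialised with
assert np.allclose(emb_obj.get_weights()[0], emb)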
Example n. 29
0
            output = validationModel.predict_on_batch(
                [internalArr1, internalArr2])
            sim[i] = output
        return sim


simCallback = SimCallback()

for count in range(epochs):
    idx = np.random.randint(0, len(labels) - 1)
    wordTargetArray[0, ] = wordTarget[idx]
    wordContextArray[0, ] = wordContext[idx]
    labelsArray[0, ] = labels[idx]
    loss = model.train_on_batch([wordTargetArray, wordContextArray],
                                labelsArray)
    if count % 100 == 0:  #10
        print("Iteration {}, loss={}".format(count, loss))
    if count % 10000 == 0:  #100
        simCallback.runSim()
zerosRow = np.array([0] * vectorDim)
zerosRow.shape = (1, vectorDim)
embeddingMatrix = embedding.get_weights()[0]
embeddingMatrix = np.concatenate((zerosRow, embeddingMatrix), axis=0)
np.savetxt(COMPUTE_DATA_PATH + 'embedding_matrix.txt',
           embeddingMatrix,
           fmt="%.5f")
#np.savetxt('embeddingMatrix.txt', embeddingMatrix)

np.save(COMPUTE_DATA_PATH + 'inverse_dictionary.npy', inverseDict)
np.save(COMPUTE_DATA_PATH + 'dictionary.npy', dictionary)
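The export step above reads the learned matrix with embedding.get_weights()[0] and prepends a zero row so that index 0 maps to an all-zero padding vector. A compact sketch of that post-processing, with hypothetical shapes:

import numpy as np

vectorDim = 4
learned = np.random.rand(10, vectorDim)                         # stand-in for embedding.get_weights()[0]
padded = np.concatenate((np.zeros((1, vectorDim)), learned), axis=0)
assert padded.shape == (11, vectorDim) and not padded[0].any()  # row 0 is the padding vector
# np.savetxt('embedding_matrix.txt', padded, fmt='%.5f') would then write it to disk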
Example n. 30
0
    def build(self):
        question, answer = self._get_inputs()

        # add embedding layers
        embedding = Embedding(self.config['n_words'],
                              self.model_params.get('n_embed_dims', 141))
        question_embedding = embedding(question)

        a_embedding = Embedding(self.config['n_words'],
                                self.model_params.get('n_embed_dims', 141))
        answer_embedding = a_embedding(answer)

        # the call above builds a_embedding, so it can now be initialised from the question embedding
        a_embedding.set_weights(embedding.get_weights())

        # dropout
        dropout = Dropout(0.5)
        question_dropout = dropout(question_embedding)
        answer_dropout = dropout(answer_embedding)

        # rnn
        forward_lstm = LSTM(self.config.get('n_lstm_dims', 141),
                            consume_less='mem',
                            return_sequences=True)
        backward_lstm = LSTM(self.config.get('n_lstm_dims', 141),
                             consume_less='mem',
                             return_sequences=True)
        question_lstm = merge(
            [forward_lstm(question_dropout),
             backward_lstm(question_dropout)],
            mode='concat',
            concat_axis=-1)

        # dropout
        question_dropout = dropout(question_lstm)

        # maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                         output_shape=lambda x: (x[0], x[2]))
        question_pool = maxpool(question_dropout)

        # activation
        activation = Activation('tanh')
        question_output = activation(question_pool)

        question_model = Model(input=[question], output=[question_output])

        # attentional rnn
        forward_lstm = AttentionLSTM(self.config.get('n_lstm_dims', 141),
                                     question_output,
                                     consume_less='mem',
                                     return_sequences=True)
        backward_lstm = AttentionLSTM(self.config.get('n_lstm_dims', 141),
                                      question_output,
                                      consume_less='mem',
                                      return_sequences=True)
        answer_lstm = merge(
            [forward_lstm(answer_dropout),
             backward_lstm(answer_dropout)],
            mode='concat',
            concat_axis=-1)

        # dropout
        answer_dropout = dropout(answer_lstm)

        # maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False),
                         output_shape=lambda x: (x[0], x[2]))
        answer_pool = maxpool(answer_dropout)

        # activation
        activation = Activation('tanh')
        answer_output = activation(answer_pool)

        answer_model = Model(input=[question, answer], output=[answer_output])

        return question_model, answer_model
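The build() above copies the question embedding's weights into the answer embedding through get_weights()/set_weights(); both layers must already be built (i.e. called on an input) before the copy succeeds. A stripped-down sketch of that hand-off, assuming tf.keras and hypothetical sizes:

import numpy as np
from tensorflow.keras.layers import Embedding, Input

q_in = Input(shape=(7,), dtype='int32')
a_in = Input(shape=(7,), dtype='int32')
q_emb = Embedding(1000, 64)
a_emb = Embedding(1000, 64)
_ = q_emb(q_in)   # calling the layers creates their weight variables
_ = a_emb(a_in)
a_emb.set_weights(q_emb.get_weights())   # copy succeeds only after both layers are built
assert np.allclose(a_emb.get_weights()[0], q_emb.get_weights()[0])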
Example n. 31
0
	def __init__(self, data, train = False):
		self.data = data
		print('Loading TextModel...')
		if data == 'imdbcnn':
			filters = 250 
			hidden_dims = 250
			self.embedding_dims = 50
			self.maxlen = 400
			self.num_classes = 2
			self.num_words = 20002
			self.type = 'word'
			if not train:
				K.set_learning_phase(0)

			X_ph = Input(shape=(self.maxlen,), dtype='int32')
			emb_layer = Embedding(
				self.num_words, 
				self.embedding_dims,
				input_length=self.maxlen, 
				name = 'embedding_1'
			)
			emb_out = emb_layer(X_ph) 

			if train:
				preds = construct_original_network(emb_out, data)	

			else: 
				emb_ph = Input(
					shape=(self.maxlen, self.embedding_dims), 
					dtype='float32'
				)   
				preds = construct_original_network(emb_ph, data) 

			if not train:
				model1 = Model(X_ph, emb_out)
				model2 = Model(emb_ph, preds) 
				pred_out = model2(model1(X_ph))  
				pred_model = Model(X_ph, pred_out) 
				pred_model.compile(
					loss='categorical_crossentropy',
					optimizer='adam', 
					metrics=['accuracy']
				) 
				self.pred_model = pred_model 
				grads = []
				for c in range(self.num_classes):
					grads.append(tf.gradients(preds[:,c], emb_ph))

				grads = tf.concat(grads, axis = 0)  
				# [num_classes, batchsize, maxlen, embedding_dims]

				approxs = grads * tf.expand_dims(emb_ph, 0) 
				# [num_classes, batchsize, maxlen, embedding_dims]
				self.sess = K.get_session()  
				self.grads = grads 
				self.approxs = approxs
				self.input_ph = X_ph
				self.emb_out = emb_out
				self.emb_ph = emb_ph
				weights_name = 'original.h5'
				model1.load_weights('{}/models/{}'.format(data, weights_name), 
					by_name=True)
				model2.load_weights('{}/models/{}'.format(data, weights_name), 
					by_name=True)  
				self.pred_model.load_weights('{}/models/{}'.format(data, weights_name), 
					by_name=True)
				print('Model constructed.', weights_name)
				# For validating the data. 
				emb_weights = emb_layer.get_weights() 
				emb_weights[0][0] = np.zeros(50)
				self.emb_weights = emb_weights[0]
				emb_layer.set_weights(emb_weights)
			else:
				pred_model = Model(X_ph, preds)
				pred_model.compile(
					loss='categorical_crossentropy',
					optimizer='adam',
					metrics=['accuracy']) 
				self.pred_model = pred_model
				from load_data import Data
				dataset = Data(self.data, train = True)
				self.train(dataset) 
				print('Training is done.') 

		elif data == 'agccnn':
			from agccnn.data_helpers import create_vocab_set, construct_batch_generator, find_words_positions
			filter_kernels = [7, 7, 3, 3, 3, 3]
			dense_outputs = 1024
			self.charlen = 1014
			self.maxlen = 1014
			nb_filter = 256
			self.num_classes = 4
			self.vocab, self.reverse_vocab, self.vocab_size, self.vocab_check = create_vocab_set()
			self.embedding_dims = self.vocab_size
			self.type = 'char'
			K.set_learning_phase(1 if train else 0)
			#Define what the input shape looks like
			inputs = Input(shape=(self.charlen, self.vocab_size), name='input', dtype='float32')

			conv = Conv1D(filters = nb_filter, kernel_size= filter_kernels[0], padding = 'valid', activation = 'relu', input_shape=(self.charlen, self.vocab_size))(inputs)

			conv = MaxPooling1D(pool_size=3)(conv)

			conv1 = Conv1D(filters = nb_filter, kernel_size= filter_kernels[1], padding = 'valid', activation = 'relu')(conv)

			conv1 = MaxPooling1D(pool_size=3)(conv1) 

			conv2 = Conv1D(filters = nb_filter, kernel_size= filter_kernels[2], padding = 'valid', activation = 'relu')(conv1)
			conv3 = Conv1D(filters = nb_filter, kernel_size= filter_kernels[3], padding = 'valid', activation = 'relu')(conv2)
			conv4 = Conv1D(filters = nb_filter, kernel_size= filter_kernels[4], padding = 'valid', activation = 'relu')(conv3)
			conv5 = Conv1D(filters = nb_filter, kernel_size= filter_kernels[5], padding = 'valid', activation = 'relu')(conv4) 

			conv5 = MaxPooling1D(pool_size=3)(conv5)
			conv5 = Flatten()(conv5)

			#Two dense layers with dropout of .5
			z = Dropout(0.5)(Dense(dense_outputs, activation='relu')(conv5))
			z = Dropout(0.5)(Dense(dense_outputs, activation='relu')(z))
			#Output dense layer with softmax activation
			pred = Dense(self.num_classes, activation='softmax', name='output')(z)
			grads = []
			for c in range(self.num_classes):
				grads.append(tf.gradients(pred[:,c], inputs))
			grads = tf.concat(grads, axis = 0)  
			# [num_classes, batchsize, self.charlen, embedding_dims]
			approxs = grads * tf.expand_dims(inputs, 0) 
			# [num_classes, batchsize, self.charlen, embedding_dims]
			model = Model(inputs, pred) 
			model.compile(
				loss='categorical_crossentropy', 
				optimizer="sgd",
				metrics=['accuracy']
			)  
			model.load_weights(
				'agccnn/params/crepe_model_weights-15.h5', 
				by_name=True
			)	

			self.sess = K.get_session()  
			self.grads = grads 
			self.approxs = approxs
			self.input_ph = inputs 
			self.model = model  
			
			from nltk.tokenize.moses import MosesDetokenizer
			from nltk import word_tokenize

			detokenizer = MosesDetokenizer()
			self.tokenize = word_tokenize
			self.detokenize = detokenizer.detokenize
			self.construct_batch_generator = construct_batch_generator
			self.find_words_positions = lambda sent: find_words_positions(
					sent, 
					word_tokenize(sent), 
					self.charlen, 
					self.vocab, 
					self.vocab_size, 
					self.vocab_check
				)
			self.find_chars_positions = lambda sent: find_words_positions(
					sent, 
					list(sent.lower().replace(' ', '')), 
					self.charlen, 
					self.vocab, 
					self.vocab_size, 
					self.vocab_check, 
					True
				)

		elif data == 'yahoolstm':
			self.maxlen = 400
			self.num_classes = 10
			self.num_words = 20000
			self.batch_size = 40 
			self.embedding_dims = 300
			if not train:
				K.set_learning_phase(0)

			X_ph = Input(shape=(self.maxlen,), dtype='int32') 
			emb_layer = Embedding(                
				input_dim=self.num_words + 1,
				output_dim= self.embedding_dims,  
				input_length=self.maxlen,
				name = "embedding",
				trainable=True)
			emb = emb_layer(X_ph)

			if train:
				preds = construct_original_network(emb, data)
			else:
				emb_ph = Input(shape=(self.maxlen,self.embedding_dims), dtype='float32')  
				preds = construct_original_network(emb_ph, data)

			if train:
				model = Model(X_ph, preds) 

				model.compile(
					loss='categorical_crossentropy',
					optimizer='adam',
					metrics=['accuracy']
				)
			else:
				model1 = Model(X_ph, emb)
				model2 = Model(emb_ph, preds) 
				pred_out = model2(model1(X_ph)) 
				model = Model(X_ph, pred_out)
				model.compile(
					loss='categorical_crossentropy',
					optimizer='adam',
					metrics=['accuracy']
				)
				# Construct gradients. 
				grads = []
				for c in range(self.num_classes):
					grads.append(tf.gradients(preds[:,c], emb_ph))

				grads = tf.concat(grads, axis = 0)  
				# [num_classes, batchsize, maxlen, embedding_dims]

				approxs = grads * tf.expand_dims(emb_ph, 0) 
				# [num_classes, batchsize, maxlen, embedding_dims]
				prev_epoch = 0; prev_itr = 7
				model1.load_weights(
					'yahoolstm/models/original-{}-{}.hdf5'.format(prev_epoch, prev_itr), 
					by_name = True
				)
				model2.load_weights(
					'yahoolstm/models/original-{}-{}.hdf5'.format(prev_epoch, prev_itr), 
					by_name = True
				)

				emb_weights = emb_layer.get_weights() 
				self.emb_weights = emb_weights
				self.emb_out = emb 
				self.emb_ph = emb_ph
				self.sess = K.get_session()  
				self.grads = grads 
				self.approxs = approxs
				self.input_ph = X_ph
			self.pred_model = model  
			self.type = 'word'
			if train:
				from load_data import Data
				print('Loading data...')
				dataset = Data(data, train = True)
				print('Training...')
				self.train(dataset)
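The model wrappers above build gradient-times-input saliency maps with TF1-style tf.gradients (grads * tf.expand_dims(emb_ph, 0)). The same quantity in TF2 eager code, as a sketch for a hypothetical model that takes embedded inputs of shape (batch, maxlen, embedding_dims):

import tensorflow as tf

def grad_times_input(model, x_emb, class_idx):
    """Per-example gradient of one class score w.r.t. the embedded input, times the input."""
    x_emb = tf.convert_to_tensor(x_emb)
    with tf.GradientTape() as tape:
        tape.watch(x_emb)
        preds = model(x_emb)                  # (batch, num_classes)
        score = preds[:, class_idx]
    grads = tape.gradient(score, x_emb)       # (batch, maxlen, embedding_dims)
    return grads * x_emb                      # elementwise gradient x input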