from keras import activations, initializers, losses, optimizers, regularizers
from keras import backend as K
from keras.layers import (GRU, Bidirectional, Dense, Embedding, Flatten,
                          Input, Lambda, Layer, TimeDistributed, concatenate)
from keras.layers import dot as dot_layer
from keras.models import Model

# Dense_tied is a custom tied-weight Dense layer defined elsewhere in
# keras_aquarium; it reuses the (transposed) kernel of its `tied_to` layer.


def SoftmaxAutoEncoder(
        input_dim,
        latent_dim=50,
        encoder=None,
        decoder=None,
        activation=None,
        loss=None,
        sparse=True,
        use_tied_layer=True,
        use_binary_activation=True,
        alpha=50,
        lr=0.001, ):
    """Softmax AutoEncoder

    Autoencoder using kullback_leibler_divergence as the objective function
    and softmax as the output activation. Requires each input row to sum to 1.

    Parameters
    ----------
    input_dim : dim of an input sample.
    latent_dim : dim of the latent vector.
    encoder : if not None, used as latent_vector = encoder(input_layer).
    decoder : if not None, used as generated_input = decoder(latent_vector).
    activation : default is "tanh" when use_binary_activation is False,
        otherwise the variant sigmoid below.
    loss : default is kullback_leibler_divergence.
    sparse : whether the input layer accepts scipy sparse matrices.
    use_tied_layer : whether to use a tied layer; only used when encoder and
        decoder are both None.
    use_binary_activation : if True, use the variant sigmoid
        1 / (1 + exp(-alpha * x)).
    alpha : alpha in the variant sigmoid.
    lr : learning rate.

    Examples
    --------
    import numpy as np
    from keras_aquarium import sae
    from scipy.sparse import csr_matrix

    # suppose you have a sparse matrix representing bag-of-words documents
    bow_docs = csr_matrix((n_docs, n_words))

    model = sae.SoftmaxAutoEncoder(
        input_dim,                   # dim of an input sample
        latent_dim=50,               # dim of the latent vector
        encoder=None,                # if not None, latent_vector = encoder(input_layer)
        decoder=None,                # if not None, generated_input = decoder(latent_vector)
        activation=None,             # default is "tanh" when use_binary_activation is False
        loss=None,                   # default is kullback_leibler_divergence
        use_tied_layer=True,         # only used when encoder and decoder are None
        use_binary_activation=True,  # if True, use variant sigmoid 1/(1+exp(-alpha*x))
        alpha=50,                    # alpha in the variant sigmoid
    )

    def generate_dataset(batch_size):
        # memory friendly
        indices = np.arange(bow_docs.shape[0])
        while True:
            np.random.shuffle(indices)
            for i in range(0, len(indices), batch_size):
                inds = indices[i:i + batch_size]
                yield bow_docs[inds], bow_docs[inds].toarray()

    batch_size = 32
    model.fit_generator(
        generate_dataset(batch_size),
        bow_docs.shape[0] // batch_size, )
    """
    input_layer = Input(shape=[input_dim, ], sparse=sparse)
    if encoder is not None:
        hidden = encoder(input_layer)
    else:
        hidden = input_layer

    if activation is None:
        if use_binary_activation:
            def binary_activation(x):
                """Variant sigmoid sgm(x) = 1 / (1 + exp(-alpha * x)).

                Reduces to the standard sigmoid when alpha = 1; a larger
                alpha gives faster convergence.
                """
                x = -1 * alpha * x
                x = K.clip(x, -1e16, 80)  # keep the exponent small enough to avoid overflow
                return 1 / (1 + K.exp(x))

            activation = binary_activation
        else:
            activation = activations.tanh

    encoder_ = Dense(latent_dim, activation=activation,
                     kernel_initializer="glorot_normal")
    code = encoder_(hidden)

    if decoder is not None:
        hidden_g = decoder(code)
    else:
        hidden_g = code

    if use_tied_layer:
        decoder_ = Dense_tied(input_dim, activation="softmax", tied_to=encoder_,
                              kernel_regularizer=regularizers.l2(0.00001),
                              bias_regularizer=regularizers.l2(0.00001), )
    else:
        decoder_ = Dense(input_dim, activation="softmax")
    res_input = decoder_(hidden_g)

    model = Model(inputs=input_layer, outputs=res_input)

    if loss is None:
        loss = losses.kullback_leibler_divergence
    model.compile(loss=loss, optimizer=optimizers.Nadam(lr=lr))

    encoder = Model(inputs=input_layer, outputs=code)
    model._keras_aquarium_params = dict(encoder=encoder)

    return model
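
# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the library): trains SoftmaxAutoEncoder
# on random bag-of-words counts. All sizes and the random data are
# illustrative placeholders; rows are l1-normalized so they sum to 1, as the
# KL-divergence loss requires.
def _demo_softmax_autoencoder(n_docs=1000, n_words=2000, batch_size=32):
    import numpy as np
    from scipy.sparse import csr_matrix

    counts = np.random.poisson(0.05, size=(n_docs, n_words)).astype("float32")
    counts[:, 0] += 1  # ensure no all-zero rows
    counts /= counts.sum(axis=1, keepdims=True)  # rows sum to 1
    bow_docs = csr_matrix(counts)

    model = SoftmaxAutoEncoder(input_dim=n_words, latent_dim=50)

    def generate_dataset(batch_size):
        indices = np.arange(bow_docs.shape[0])
        while True:
            np.random.shuffle(indices)
            for i in range(0, len(indices), batch_size):
                inds = indices[i:i + batch_size]
                yield bow_docs[inds], bow_docs[inds].toarray()

    model.fit_generator(generate_dataset(batch_size),
                        steps_per_epoch=bow_docs.shape[0] // batch_size,
                        epochs=2)
    # the fitted encoder is exposed for downstream use
    return model._keras_aquarium_params["encoder"]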
def DualMatrixFactorization(
        n_row, n_col,
        n_row_feature=None, n_col_feature=None,
        row_dim=50, col_dim=50,
        row_feature_dim=50, col_feature_dim=50,
        row_layers=[(50, "relu")], col_layers=[(50, "relu")],
        # output_mode="single",
        model_name=None, ):
    """Dual Deep Matrix Factorization.

    By encoding both user vectors and item vectors more deeply, it
    outperforms simple matrix factorization. The model is inspired by the
    paper [COLLABORATIVE DEEP EMBEDDING VIA DUAL NETWORKS](https://openreview.net/pdf?id=r1w7Jdqxl).

    Parameters
    ----------
    n_row : number of rows of the matrix.
    n_col : number of cols of the matrix.
    n_row_feature : number of row features; default is None, meaning no row
        features are applied.
    n_col_feature : number of col features; default is None, meaning no col
        features are applied.
    row_dim : embedding dim of a row element.
    col_dim : embedding dim of a col element.
    row_feature_dim : latent representation dim of the row features.
    col_feature_dim : latent representation dim of the col features.
    row_layers : list of tuples (dim, activation) or callables used to
        construct the row hidden layers.
    col_layers : list of tuples (dim, activation) or callables used to
        construct the col hidden layers.
    model_name : str, name of the model.

    Examples
    --------
    import keras
    from keras import regularizers
    from keras.layers import Dense
    from keras_aquarium import dmf
    from scipy.sparse import csr_matrix, coo_matrix

    # suppose you have a sparse user-item rating matrix
    rating_matrix = coo_matrix((n_user, n_item))
    users = rating_matrix.row
    items = rating_matrix.col
    ratings = rating_matrix.data

    # you also have a sparse feature matrix for items,
    # such as location of item, price of item, etc.
    item_features = csr_matrix((n_item, n_item_feature))

    # and sadly, you don't have any features for users
    user_features = None

    # then you want a matrix factorization model to predict ratings
    model = dmf.DualMatrixFactorization(
        # we choose user as row and item as col; we could just as well
        # choose item as row and user as col and transpose the matrix
        n_row=n_user, n_col=n_item,

        n_row_feature=None,            # we don't have user features
        n_col_feature=n_item_feature,  # we do have item features

        # embedding dims: each user and item is first embedded as a vector
        row_dim=30, col_dim=20,

        row_feature_dim=None,  # no user features
        col_feature_dim=50,    # each item feature row is encoded as a dense vector

        # row_layers and col_layers build the encoding layers for users and
        # items; they work separately, so they can have different numbers of
        # hidden layers, but corresponding layers are paired by a dot
        # product, so each pair (and in particular the final pair) must have
        # the same dim.
        # Two hidden layers: a dense layer with 50 units and relu activation,
        # then a dense layer with 30 units and tanh activation.
        row_layers=[(50, "relu"), (30, "tanh")],
        # a callable can also be used to create a hidden layer
        col_layers=[
            (50, "relu"),
            lambda x: Dense(30, kernel_regularizer=regularizers.l2(0.001))(x),
        ],
    )

    # use it as a keras model
    model.compile(loss="mse", optimizer="adam")
    # note that there are no user_features in the inputs
    inputs = [users] + [items, item_features[items]]
    model.fit(inputs, ratings)
    """

    def make_row_layers(n_row, n_row_feature, row_feature_dim, embd_dim):
        # embd_dim is passed explicitly so the col side gets col_dim rather
        # than silently reusing row_dim from the enclosing scope
        row_input = Input(shape=(1, ), dtype="int32")
        row_embd = Embedding(input_dim=n_row, input_length=1,
                             output_dim=embd_dim, )
        row_embd = Flatten()(row_embd(row_input))

        if n_row_feature is not None:
            row_feature_input = Input(shape=(n_row_feature, ), sparse=True)
            row_feature_embd = Dense(row_feature_dim,
                                     activation=None)(row_feature_input)
            row_hidden = concatenate([row_embd, row_feature_embd])
        else:
            row_feature_input = None
            row_hidden = row_embd

        if row_feature_input is None:
            inputs = [row_input]
        else:
            inputs = [row_input, row_feature_input]

        return inputs, row_hidden

    row_inputs, row_hidden = make_row_layers(
        n_row, n_row_feature, row_feature_dim, row_dim)
    col_inputs, col_hidden = make_row_layers(
        n_col, n_col_feature, col_feature_dim, col_dim)

    def map_layers(layers, hidden):
        hiddens = []
        for l in layers:
            if callable(l):
                hidden = l(hidden)
            else:
                dim, act = l
                hidden = Dense(dim, activation=act)(hidden)
            hiddens.append(hidden)
        return hiddens

    row_hiddens = map_layers(row_layers, row_hidden)
    col_hiddens = map_layers(col_layers, col_hidden)

    def zip_row_col(row_hiddens, col_hiddens):
        outputs = []
        for row, col in zip(row_hiddens, col_hiddens):
            outputs.append(dot_layer([row, col], axes=-1))
        return outputs

    outputs = zip_row_col(row_hiddens, col_hiddens)

    # TODO: add multilevel mode
    # if output_mode == "single":
    #     pred = outputs[-1]
    # else:
    #     pred = add_layer(outputs)
    pred = outputs[-1]

    model = Model(inputs=row_inputs + col_inputs, outputs=pred,
                  name=model_name, )
    model.compile(optimizer=optimizers.Nadam(),
                  loss=losses.mean_squared_error,
                  metrics=['accuracy'])

    model._keras_aquarium_params = dict(
        model_type="dmf",
        outputs=outputs,
        row_hiddens=row_hiddens,
        col_hiddens=col_hiddens,
        row_encoder=Model(inputs=row_inputs, outputs=row_hiddens[-1]),
        col_encoder=Model(inputs=col_inputs, outputs=col_hiddens[-1]),
    )

    return model
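
# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the library): fits
# DualMatrixFactorization on a random user-item rating matrix with random
# item features. All sizes and the random data are illustrative placeholders;
# the builder compiles the model itself (Nadam + mse), so it can be fit
# directly.
def _demo_dual_matrix_factorization(n_user=500, n_item=300, n_item_feature=40):
    from scipy.sparse import random as sparse_random

    rating_matrix = sparse_random(n_user, n_item, density=0.02, format="coo")
    users = rating_matrix.row.reshape(-1, 1)
    items = rating_matrix.col.reshape(-1, 1)
    ratings = rating_matrix.data.reshape(-1, 1)
    item_features = sparse_random(n_item, n_item_feature,
                                  density=0.1, format="csr")

    model = DualMatrixFactorization(
        n_row=n_user, n_col=n_item,
        n_col_feature=n_item_feature,
        row_dim=30, col_dim=20, col_feature_dim=50,
        # paired row/col hidden layers are dotted, so their dims must match
        row_layers=[(50, "relu"), (30, "tanh")],
        col_layers=[(50, "relu"), (30, "tanh")],
    )
    # inputs: row ids, then col ids plus the feature rows of those cols
    model.fit([users, items, item_features[items.ravel()]], ratings, epochs=2)
    return model._keras_aquarium_params["row_encoder"]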
def HierarchicalAttentionRNN(
        max_sents, max_sent_length, n_classes,
        embeddings=None, n_words=None, word_dim=50,
        word_hidden_dim=100, sent_hidden_dim=100, ):
    """Hierarchical Attention RNN (GRU)

    Two-level RNN for text classification: encode each sentence from its
    words first, then encode the document from its sentences, with attention
    over both words and sentences. See the paper [HIERARCHICAL ATTENTION NETWORKS FOR DOCUMENT CLASSIFICATION](https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf)
    for more details.

    Parameters
    ----------
    max_sents : number of sentences in a document.
    max_sent_length : number of words in a sentence.
    n_classes : number of classes.
    embeddings : if given, used to initialize the word embeddings.
    n_words : number of words in the vocabulary.
    word_dim : dim of the word embeddings.
    word_hidden_dim : number of word-level units in the rnn.
    sent_hidden_dim : number of sentence-level units in the rnn.

    Examples
    --------
    import numpy as np
    from keras_aquarium import hatt_rnn

    # suppose you have a 3D matrix of word ids
    # (n_docs * n_sentences_in_doc * n_words_in_sentence), zero-padded,
    # representing documents
    sequence_docs = np.zeros([n_docs, n_sentences_in_doc, n_words_in_sentence])

    word_embeddings = load_glove_word_embeddings()
    vocabulary = load_vocabulary()

    model = hatt_rnn.HierarchicalAttentionRNN(
        max_sents, max_sent_length, n_classes,
        # either use word_embeddings to initialize the embedding layer ...
        embeddings=word_embeddings,
        # ... or let it be initialized randomly
        n_words=len(vocabulary), word_dim=50,
        # units in the word- and sentence-level gru layers
        word_hidden_dim=100, sent_hidden_dim=100,
    )

    model.fit(sequence_docs, labels)
    """
    if embeddings is None:
        embedding_layer = Embedding(n_words + 1, word_dim,
                                    input_length=max_sent_length,
                                    # mask_zero=True,
                                    trainable=True)
    else:
        embedding_layer = Embedding(len(embeddings), len(embeddings[0]),
                                    weights=[embeddings],
                                    input_length=max_sent_length,
                                    mask_zero=True, trainable=True)

    sent_input = Input(shape=(max_sent_length, ), dtype='int32')
    embedded_sequences = embedding_layer(sent_input)

    class AttLayer(Layer):
        """Attention over timesteps: takes [hit, x] and returns the
        attention-weighted sum of hit, with weights computed from x."""

        def __init__(self, hit=None, **kwargs):
            self.init = initializers.glorot_uniform()
            super(AttLayer, self).__init__(**kwargs)
            self.hit = hit

        def build(self, input_shape_li):
            input_shape = input_shape_li[-1]
            assert len(input_shape) == 3
            self.W = K.variable(self.init((input_shape[-1], )))
            self._x_input_shape = input_shape
            self.trainable_weights = [self.W]
            super(AttLayer, self).build(input_shape)

        def call(self, xli, mask=None):
            hit, x = xli

            def get_weights_(x):
                # eij = x . W, then softmax over timesteps
                eij = K.dot(x, K.reshape(self.W,
                                         [self._x_input_shape[-1], 1]))
                eij = K.squeeze(eij, axis=-1)
                ai = K.exp(eij)
                ai_sum = K.reshape(K.sum(ai, axis=1), [-1, 1])
                return ai / ai_sum

            weights = get_weights_(x)
            # expose the attention weights as a layer output
            self.output_weights = Lambda(get_weights_, )(x)

            # weighted sum of hit over timesteps; batch_dot here is
            # equivalent to einsum("ijk,ij->ik", hit, weights)
            weights = K.expand_dims(weights, axis=1)
            weighted_input = K.batch_dot(weights, hit, axes=[2, 1])
            weighted_input = K.squeeze(weighted_input, axis=1)
            return weighted_input

        def get_output_shape_for(self, input_shape_li):
            input_shape = input_shape_li[-1]
            return (input_shape[0], input_shape[-1])

        def compute_output_shape(self, input_shape_li):
            return self.get_output_shape_for(input_shape_li)

    def get_weights(args):
        a, b = args
        eij = K.dot(a, K.transpose(b))
        ai = K.exp(eij)
        return ai / K.sum(ai, axis=1)

    layer_mode = True

    # ======== sentence level =========
    sent_hidden = Bidirectional(
        GRU(word_hidden_dim, activation="tanh", return_sequences=True)
    )(embedded_sequences)

    bi_word_hidden_dim = 2 * word_hidden_dim
    sent_hidden_att = TimeDistributed(
        Dense(bi_word_hidden_dim, activation="sigmoid")
    )(sent_hidden)

    if layer_mode:
        word_att_layer = AttLayer()
        sent_encoded = word_att_layer([sent_hidden, sent_hidden_att])
    else:
        words_attention = K.random_uniform_variable(
            [1, bi_word_hidden_dim], low=0, high=1, )
        word_weights = get_weights([sent_hidden_att, words_attention])

        def attend_words(args):
            sent_hidden, sent_hidden_att = args
            weighted_input = sent_hidden * word_weights
            return K.sum(weighted_input, axis=1)

        sent_encoded = Lambda(attend_words, )([sent_hidden, sent_hidden_att])

    sent_encoder = Model(sent_input, sent_encoded)

    # ======== doc level =========
    sents_input = Input(shape=(max_sents, max_sent_length), dtype='int32', )
    sents_encoded = TimeDistributed(sent_encoder)(sents_input)

    doc_hidden = Bidirectional(
        GRU(sent_hidden_dim, activation="tanh", return_sequences=True)
    )(sents_encoded)

    bi_sent_hidden_dim = 2 * sent_hidden_dim
    doc_hidden_att = TimeDistributed(
        Dense(bi_sent_hidden_dim, activation="sigmoid")
    )(doc_hidden)

    if layer_mode:
        sent_att_layer = AttLayer()
        doc_encoded = sent_att_layer([doc_hidden, doc_hidden_att])
    else:
        sents_attention = K.random_uniform_variable(
            [1, bi_sent_hidden_dim], low=0, high=1, )
        sent_weights = get_weights([doc_hidden_att, sents_attention])

        def attend_doc(args):
            doc_hidden, doc_hidden_att = args
            weighted_input = doc_hidden * sent_weights
            return K.sum(weighted_input, axis=1)

        doc_encoded = Lambda(attend_doc, )([doc_hidden, doc_hidden_att])

    # ======== fully connected =========
    pred = Dense(n_classes, activation='softmax')(doc_encoded)
    model = Model(sents_input, pred)
    model.compile(loss='categorical_crossentropy', optimizer='nadam',
                  metrics=['accuracy'])

    # ======== attention weights =========
    if layer_mode:
        sent_weights_model = Model(sents_input, sent_att_layer.output_weights)
    else:
        word_weights_layer = Lambda(get_weights, )(
            [sent_hidden_att, words_attention])
        word_weights_model = Model(sent_input, word_weights_layer)
        sents_word_weights = TimeDistributed(word_weights_model)(sents_input)
        word_weights_model = Model(sents_input, sents_word_weights)

        sent_weights_layer = Lambda(get_weights, )(
            [doc_hidden_att, sents_attention])
        sent_weights_model = Model(sents_input, sent_weights_layer)

    model._keras_aquarium_params = dict(
        model_type="hatt_rnn",
        # word_weights_model=word_weights_model,
        sent_weights_model=sent_weights_model,
        max_sents=max_sents,
        max_sent_length=max_sent_length,
    )

    return model
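
# ---------------------------------------------------------------------------
# A minimal usage sketch (not part of the library): trains
# HierarchicalAttentionRNN on random zero-padded word-id documents.
# Vocabulary size, document shape, and the random labels are illustrative
# placeholders.
def _demo_hatt_rnn(n_docs=200, max_sents=10, max_sent_length=20,
                   n_words=5000, n_classes=4):
    import numpy as np

    # word ids in [1, n_words]; 0 is reserved for padding
    sequence_docs = np.random.randint(
        1, n_words + 1, size=(n_docs, max_sents, max_sent_length))
    labels = np.eye(n_classes)[np.random.randint(0, n_classes, size=n_docs)]

    model = HierarchicalAttentionRNN(
        max_sents, max_sent_length, n_classes,
        n_words=n_words, word_dim=50,
        word_hidden_dim=100, sent_hidden_dim=100,
    )
    model.fit(sequence_docs, labels, batch_size=16, epochs=1)

    # class probabilities for a few documents; sentence attention weights are
    # also exposed via model._keras_aquarium_params["sent_weights_model"]
    return model.predict(sequence_docs[:5])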