def create_o_test_model(train_model, examples, hidden_size, embed_size, glove,
                        batch_size=64, prem_len=22):
    graph = Graph()
    hypo_layer = LSTM(output_dim=hidden_size,
                      batch_input_shape=(batch_size, 1, embed_size),
                      return_sequences=True, stateful=True, trainable=False)

    graph.add_input(name='hypo_input', batch_input_shape=(batch_size, 1), dtype='int32')
    graph.add_node(make_fixed_embeddings(glove, 1), name='hypo_word_vec', input='hypo_input')
    graph.add_node(hypo_layer, name='hypo', input='hypo_word_vec')
    graph.add_input(name='premise', batch_input_shape=(batch_size, prem_len, embed_size))
    graph.add_input(name='creative', batch_input_shape=(batch_size, embed_size))

    attention = LstmAttentionLayer(hidden_size, return_sequences=True, stateful=True,
                                   trainable=False, feed_state=False)
    graph.add_node(attention, name='attention', inputs=['premise', 'hypo', 'creative'],
                   merge_mode='join')

    graph.add_input(name='train_input', batch_input_shape=(batch_size, 1), dtype='int32')
    hs = HierarchicalSoftmax(len(glove), input_dim=hidden_size, input_length=1,
                             trainable=False)
    graph.add_node(hs, name='softmax', inputs=['attention', 'train_input'],
                   merge_mode='join')
    graph.add_output(name='output', input='softmax')

    hypo_layer.set_weights(train_model.nodes['hypo'].get_weights())
    attention.set_weights(train_model.nodes['attention'].get_weights())
    hs.set_weights(train_model.nodes['softmax'].get_weights())

    graph.compile(loss={'output': hs_categorical_crossentropy}, optimizer='adam')

    func_premise = theano.function([train_model.inputs['premise_input'].get_input()],
                                   train_model.nodes['premise'].get_output(False),
                                   allow_input_downcast=True)
    func_noise = theano.function([train_model.inputs['noise_input'].get_input(),
                                  train_model.inputs['class_input'].get_input()],
                                 train_model.nodes['creative'].get_output(False),
                                 allow_input_downcast=True)
    return graph, func_premise, func_noise
def gen_train(noise_examples, hidden_size, noise_dim, glove, hypo_len, version):
    if version == 9:
        return baseline_train(noise_examples, hidden_size, noise_dim, glove, hypo_len,
                              version)
    elif version == 6 or version == 7:
        return autoe_train(hidden_size, noise_dim, glove, hypo_len, version)

    prem_input = Input(shape=(None,), dtype='int32', name='prem_input')
    hypo_input = Input(shape=(hypo_len + 1,), dtype='int32', name='hypo_input')
    noise_input = Input(shape=(1,), dtype='int32', name='noise_input')
    train_input = Input(shape=(None,), dtype='int32', name='train_input')
    class_input = Input(shape=(3,), name='class_input')

    prem_embeddings = make_fixed_embeddings(glove, None)(prem_input)
    hypo_embeddings = make_fixed_embeddings(glove, hypo_len + 1)(hypo_input)

    premise_layer = LSTM(output_dim=hidden_size, return_sequences=True,
                         inner_activation='sigmoid', name='premise')(prem_embeddings)
    hypo_layer = LSTM(output_dim=hidden_size, return_sequences=True,
                      inner_activation='sigmoid', name='hypo')(hypo_embeddings)

    noise_layer = Embedding(noise_examples, noise_dim, input_length=1,
                            name='noise_embeddings')(noise_input)
    flat_noise = Flatten(name='noise_flatten')(noise_layer)

    if version == 8:
        create_input = merge([class_input, flat_noise], mode='concat')
    if version == 5:
        create_input = flat_noise
    creative = Dense(hidden_size, name='cmerge')(create_input)

    attention = LstmAttentionLayer(output_dim=hidden_size, return_sequences=True,
                                   feed_state=True, name='attention')(
        [hypo_layer, premise_layer, creative])
    hs = HierarchicalSoftmax(len(glove), trainable=True, name='hs')([attention, train_input])

    inputs = [prem_input, hypo_input, noise_input, train_input, class_input]
    if version == 5:
        inputs = inputs[:4]

    model_name = 'version' + str(version)
    model = Model(input=inputs, output=hs, name=model_name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')
    return model
def baseline_train(noise_examples, hidden_size, noise_dim, glove, hypo_len, version):
    prem_input = Input(shape=(None,), dtype='int32', name='prem_input')
    hypo_input = Input(shape=(hypo_len + 1,), dtype='int32', name='hypo_input')
    noise_input = Input(shape=(1,), dtype='int32', name='noise_input')
    train_input = Input(shape=(None,), dtype='int32', name='train_input')
    class_input = Input(shape=(3,), name='class_input')
    concat_dim = hidden_size + noise_dim + 3

    prem_embeddings = make_fixed_embeddings(glove, None)(prem_input)
    hypo_embeddings = make_fixed_embeddings(glove, hypo_len + 1)(hypo_input)

    premise_layer = LSTM(output_dim=hidden_size, return_sequences=False,
                         inner_activation='sigmoid', name='premise')(prem_embeddings)

    noise_layer = Embedding(noise_examples, noise_dim, input_length=1,
                            name='noise_embeddings')(noise_input)
    flat_noise = Flatten(name='noise_flatten')(noise_layer)

    # Concatenate the encoded premise, the class label and the per-example noise.
    merged = merge([premise_layer, class_input, flat_noise], mode='concat')
    creative = Dense(concat_dim, name='cmerge')(merged)

    # The Lambda returns only the hypothesis embeddings; listing 'creative' as an
    # input keeps it connected in the graph so FeedLSTM can use it as feed_layer.
    fake_merge = Lambda(lambda x: x[0], output_shape=lambda x: x[0])(
        [hypo_embeddings, creative])
    hypo_layer = FeedLSTM(output_dim=concat_dim, return_sequences=True,
                          feed_layer=creative, inner_activation='sigmoid',
                          name='attention')([fake_merge])

    hs = HierarchicalSoftmax(len(glove), trainable=True, name='hs')([hypo_layer, train_input])

    inputs = [prem_input, hypo_input, noise_input, train_input, class_input]
    model_name = 'version' + str(version)
    model = Model(input=inputs, output=hs, name=model_name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')
    return model
def baseline_test(train_model, glove, batch_size):
    version = int(train_model.name[-1])
    hidden_size = train_model.get_layer('attention').output_shape[-1]

    premise_input = Input(batch_shape=(batch_size, None, None))
    hypo_input = Input(batch_shape=(batch_size, 1), dtype='int32')
    creative_input = Input(batch_shape=(batch_size, None))
    train_input = Input(batch_shape=(batch_size, 1), dtype='int32')

    hypo_embeddings = make_fixed_embeddings(glove, 1)(hypo_input)
    hypo_layer = FeedLSTM(output_dim=hidden_size, return_sequences=True, stateful=True,
                          trainable=False, feed_layer=premise_input,
                          name='attention')([hypo_embeddings])
    hs = HierarchicalSoftmax(len(glove), trainable=False, name='hs')([hypo_layer, train_input])

    inputs = [hypo_input, creative_input, train_input]
    outputs = [hs]

    model = Model(input=inputs, output=outputs, name=train_model.name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')
    update_gen_weights(model, train_model)

    f_inputs = [train_model.get_layer('noise_embeddings').output,
                train_model.get_layer('class_input').input,
                train_model.get_layer('prem_input').input]
    func_noise = theano.function(f_inputs, train_model.get_layer('cmerge').output,
                                 allow_input_downcast=True)
    return model, None, func_noise
def autoe_train(hidden_size, noise_dim, glove, hypo_len, version):
    prem_input = Input(shape=(None,), dtype='int32', name='prem_input')
    hypo_input = Input(shape=(hypo_len + 1,), dtype='int32', name='hypo_input')
    train_input = Input(shape=(None,), dtype='int32', name='train_input')
    class_input = Input(shape=(3,), name='class_input')

    prem_embeddings = make_fixed_embeddings(glove, None)(prem_input)
    hypo_embeddings = make_fixed_embeddings(glove, hypo_len + 1)(hypo_input)

    premise_encoder = LSTM(output_dim=hidden_size, return_sequences=True,
                           inner_activation='sigmoid', name='premise_encoder')(prem_embeddings)
    hypo_encoder = LSTM(output_dim=hidden_size, return_sequences=True,
                        inner_activation='sigmoid', name='hypo_encoder')(hypo_embeddings)
    class_encoder = Dense(hidden_size, activation='tanh')(class_input)

    encoder = LstmAttentionLayer(output_dim=hidden_size, return_sequences=False,
                                 feed_state=True, name='encoder')(
        [hypo_encoder, premise_encoder, class_encoder])

    if version == 6:
        # Plain autoencoder: deterministic bottleneck.
        reduction = Dense(noise_dim, name='reduction', activation='tanh')(encoder)
    elif version == 7:
        # Variational autoencoder: sample the bottleneck from a diagonal Gaussian.
        z_mean = Dense(noise_dim, name='z_mean')(encoder)
        z_log_sigma = Dense(noise_dim, name='z_log_sigma')(encoder)

        def sampling(args):
            z_mean, z_log_sigma = args
            # Note: the batch size of 64 is hard-coded here.
            epsilon = K.random_normal(shape=(64, noise_dim,), mean=0., std=0.01)
            return z_mean + K.exp(z_log_sigma) * epsilon

        reduction = Lambda(sampling, output_shape=lambda sh: (sh[0][0], noise_dim,),
                           name='reduction')([z_mean, z_log_sigma])

        def vae_loss(args):
            # KL divergence term of the VAE objective.
            z_mean, z_log_sigma = args
            return -0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma),
                                 axis=-1)

        vae = Lambda(vae_loss, output_shape=lambda sh: (sh[0][0], 1,),
                     name='vae_output')([z_mean, z_log_sigma])

    merged = merge([class_input, reduction], mode='concat')
    creative = Dense(hidden_size, name='expansion', activation='tanh')(merged)

    premise_decoder = LSTM(output_dim=hidden_size, return_sequences=True,
                           inner_activation='sigmoid', name='premise')(prem_embeddings)
    hypo_decoder = LSTM(output_dim=hidden_size, return_sequences=True,
                        inner_activation='sigmoid', name='hypo')(hypo_embeddings)

    attention = LstmAttentionLayer(output_dim=hidden_size, return_sequences=True,
                                   feed_state=True, name='attention')(
        [hypo_decoder, premise_decoder, creative])
    hs = HierarchicalSoftmax(len(glove), trainable=True, name='hs')([attention, train_input])

    inputs = [prem_input, hypo_input, train_input, class_input]
    model_name = 'version' + str(version)
    model = Model(input=inputs, output=(hs if version == 6 else [hs, vae]), name=model_name)

    if version == 6:
        model.compile(loss=hs_categorical_crossentropy, optimizer='adam')
    elif version == 7:
        def minimize(y_true, y_pred):
            return y_pred

        def metric(y_true, y_pred):
            return K.mean(y_pred)

        model.compile(loss=[hs_categorical_crossentropy, minimize],
                      metrics={'hs': word_loss, 'vae_output': metric},
                      optimizer='adam')
    return model
def gen_test(train_model, glove, batch_size):
    version = int(train_model.name[-1])
    if version == 9:
        return baseline_test(train_model, glove, batch_size)

    hidden_size = train_model.get_layer('premise').output_shape[-1]

    premise_input = Input(batch_shape=(batch_size, None, None))
    hypo_input = Input(batch_shape=(batch_size, 1), dtype='int32')
    creative_input = Input(batch_shape=(batch_size, None))
    train_input = Input(batch_shape=(batch_size, 1), dtype='int32')

    hypo_embeddings = make_fixed_embeddings(glove, 1)(hypo_input)
    hypo_layer = LSTM(output_dim=hidden_size, return_sequences=True, stateful=True,
                      unroll=False, trainable=False, inner_activation='sigmoid',
                      name='hypo')(hypo_embeddings)

    att_inputs = ([hypo_layer, premise_input] if version == 5
                  else [hypo_layer, premise_input, creative_input])
    attention = LstmAttentionLayer(output_dim=hidden_size, return_sequences=True,
                                   stateful=True, unroll=False, trainable=False,
                                   feed_state=False, name='attention')(att_inputs)
    hs = HierarchicalSoftmax(len(glove), trainable=False, name='hs')([attention, train_input])

    inputs = [premise_input, hypo_input, creative_input, train_input]
    outputs = [hs]

    model = Model(input=inputs, output=outputs, name=train_model.name)
    model.compile(loss=hs_categorical_crossentropy, optimizer='adam')
    update_gen_weights(model, train_model)

    func_premise = theano.function([train_model.get_layer('prem_input').input],
                                   train_model.get_layer('premise').output,
                                   allow_input_downcast=True)

    if version == 5 or version == 8:
        f_inputs = [train_model.get_layer('noise_embeddings').output]
        if version == 8:
            f_inputs += [train_model.get_layer('class_input').input]
        func_noise = theano.function(f_inputs, train_model.get_layer('cmerge').output,
                                     allow_input_downcast=True)
    elif version == 6 or version == 7:
        noise_input = train_model.get_layer('reduction').output
        class_input = train_model.get_layer('class_input').input
        noise_output = train_model.get_layer('expansion').output
        func_noise = theano.function([noise_input, class_input], noise_output,
                                     allow_input_downcast=True, on_unused_input='ignore')

    return model, func_premise, func_noise
class Skipgram(nn.Module):
    """
    Skipgram model

    Args:
        hidden_layer_size: The second dimension of the hidden layer
        vocab_size: The vocabulary size. This should be the size of your word
            dictionary.
    """

    def __init__(self, hidden_layer_size, vocab_size, huffman_tree=None):
        super(Skipgram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, hidden_layer_size, sparse=True)
        if huffman_tree is None:
            self.softmax_layer = nn.Linear(hidden_layer_size, vocab_size)
            self.use_hierarchical_softmax = False
        else:
            self.softmax_layer = HierarchicalSoftmax(huffman_tree)
            self.use_hierarchical_softmax = True

    def forward(self, input, id_list=None):
        if self.use_hierarchical_softmax:
            word_vector = self.embeddings(input).squeeze()
            probabilities = self.softmax_layer(word_vector, id_list.squeeze())
        else:
            word_vector = self.embeddings(input).squeeze(1)
            probabilities = self.softmax_layer(word_vector)
        return probabilities

    def lookup(self, word, word_dictionary):
        """
        Extracts the word vector for a word given the word and a dictionary
        that converts words to word ids.

        Args:
            word: The word whose vector you want.
            word_dictionary: A dictionary from words to id numbers.
        """
        word_id = word_dictionary[word]
        start_vec = Variable(torch.LongTensor([word_id]).unsqueeze(0)).cuda()
        return self.embeddings(start_vec).squeeze()

    def backprop(self, id_list, lr):
        """
        Applies stochastic gradient descent to the weights that involve the
        id_list. Backwards should have been called before this. The reason to
        use this instead of an optimizer is to avoid iterating over all
        parameters.
        """
        if not self.use_hierarchical_softmax:
            raise ValueError(
                'You can only call backprop when using hierarchical softmax.')
        self.softmax_layer.backprop(id_list, lr)
        for p in self.embeddings.parameters():
            p.data = p.data + (-lr) * p.grad.data
            # zero gradients after we make the calculation
            p.grad.data.zero_()
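# Usage sketch (illustrative, not part of the original module): shows how the
# Skipgram model above might be trained through its plain-softmax path, i.e.
# with huffman_tree=None so that softmax_layer is an ordinary nn.Linear. The
# sizes, word ids, learning rate, and the CrossEntropyLoss/SGD combination are
# assumptions made for this example only.
import torch
import torch.nn as nn
from torch.autograd import Variable

vocab_size, hidden_size = 5000, 100        # illustrative sizes
model = Skipgram(hidden_size, vocab_size)  # no Huffman tree -> full softmax head
criterion = nn.CrossEntropyLoss()          # expects unnormalised scores
# Plain SGD (no momentum) also handles the sparse embedding gradients.
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)

center = Variable(torch.LongTensor([[42]]))  # one centre-word id, shape (1, 1)
context = Variable(torch.LongTensor([7]))    # its context-word id (the target)

optimizer.zero_grad()
scores = model(center)                     # (1, vocab_size) scores from nn.Linear
loss = criterion(scores, context)
loss.backward()
optimizer.step()
# With a Huffman tree, one would instead pass id_list to forward() and update
# the touched weights via model.backprop(id_list, lr), as documented above.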
def test_hierarchical_softmax(timesteps=15, input_dim=50, batch_size=32,
                              output_dim=3218, batches=300, epochs=30):
    # Hierarchical-softmax model (old Keras Graph API).
    model = Graph()
    model.add_input(name='real_input', batch_input_shape=(batch_size, timesteps, input_dim))
    model.add_input(name='train_input', batch_input_shape=(batch_size, timesteps),
                    dtype='int32')
    model.add_node(HierarchicalSoftmax(output_dim, input_dim=input_dim,
                                       input_length=timesteps),
                   name='hs', inputs=['real_input', 'train_input'],
                   merge_mode='join', create_output=True)
    model.compile(loss={'hs': hs_categorical_crossentropy}, optimizer='adam')
    print "hs model compiled"

    # Baseline: a regular full softmax over the same projection.
    model2 = Sequential()
    model2.add(TimeDistributedDense(output_dim,
                                    batch_input_shape=(batch_size, timesteps, input_dim)))
    model2.add(Activation('softmax'))
    model2.compile(loss='categorical_crossentropy', optimizer='adam')
    print "softmax model compiled"

    # Random linear mapping used to generate synthetic data.
    learn_f = np.random.normal(size=(input_dim, output_dim))
    learn_f = np.divide(learn_f, norm(learn_f, axis=1)[:, None])
    print "learn_f generated"

    for j in range(epochs):
        batch_data = generate_batch(learn_f, batch_size, timesteps, input_dim,
                                    output_dim, batches)
        print "Epoch", j, "data generated"

        p = Progbar(batches * batch_size)
        for b in batch_data:
            data_train = {'real_input': b[0], 'train_input': b[1], 'hs': b[2]}
            loss = float(model.train_on_batch(data_train)[0])
            p.add(batch_size, [('hs_loss', loss)])

        p2 = Progbar(batches * batch_size)
        for b in batch_data:
            loss, acc = model2.train_on_batch(b[0], b[3], accuracy=True)
            p2.add(batch_size, [('softmax_loss', loss), ('softmax_acc', acc)])

    # Evaluate both models on freshly generated data.
    test_data = generate_batch(learn_f, batch_size, timesteps, input_dim,
                               output_dim, batches)

    p = Progbar(batches * batch_size)
    for b in test_data:
        data_test = {'real_input': b[0], 'train_input': b[1], 'hs': b[3]}
        loss = float(model.test_on_batch(data_test)[0])
        p.add(batch_size, [('hs_test_loss', loss)])

    p2 = Progbar(batches * batch_size)
    for b in test_data:
        loss = float(model2.test_on_batch(b[0], b[3])[0])
        p2.add(batch_size, [('softmax_test_loss', loss)])