def __init__(self, name='ra', nimg=2048, nnh=512, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] nnh = f.attrs['nnh'] na = f.attrs['na'] nh = f.attrs['nh'] nw = f.attrs['nw'] nout = f.attrs['nout'] # npatch = f.attrs['npatch'] self.config = {'nimg': nimg, 'nnh': nnh, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding') # initialization mlp layer self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp') self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp') # attention layer self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nnh, name=self.name+'@attention') # inputs cap = T.imatrix('cap') img = T.tensor3('img') self.inputs = [cap, img] # go through sequence feat = self.proj_mlp.compute(img) init_e = feat.mean(axis=1) init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1) (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None, None], non_sequences=[feat]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # these functions and variables are used in test stage self._init_func = None self._step_func = None self._proj_func = None self._feat_shared = theano.shared(np.zeros((1, npatch, nimg)).astype(theano.config.floatX))
class SkipGram:
    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        # weights
        w_in = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')
        w_out = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')

        # layers
        self.embed_layer = Embedding(w_in)
        self.ns_loss_layers = [
            NegativeSamplingLoss(w_out, corpus) for _ in range(2 * window_size)
        ]

        # collect all weights and gradients
        layers = [self.embed_layer] + self.ns_loss_layers
        self.params, self.grads = [], []
        for l in layers:
            self.params += l.params
            self.grads += l.grads

        # distributed representations of the words
        self.word_vecs = w_in

    def forward(self, contexts, target):
        h = self.embed_layer.forward(target)
        loss = sum([
            l.forward(h, contexts[:, i])
            for i, l in enumerate(self.ns_loss_layers)
        ])
        return loss

    def backward(self, dl=1):
        dh = sum(l.backward(dl) for l in self.ns_loss_layers)
        self.embed_layer.backward(dh)
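# A minimal usage sketch for the SkipGram class above, assuming the Embedding and
# NegativeSamplingLoss layers behave as in the snippet (params/grads lists of numpy
# arrays). The corpus, contexts and target arrays are made-up toy data, and the
# plain SGD step is only illustrative.
import numpy as np

corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3])
model = SkipGram(vocab_size=5, hidden_size=10, window_size=1, corpus=corpus)

contexts = np.array([[1, 3]])   # surrounding word ids, shape (N, 2 * window_size)
target = np.array([2])          # center word ids, shape (N,)

loss = model.forward(contexts, target)
model.backward()
# naive in-place SGD over the collected parameters/gradients
for param, grad in zip(model.params, model.grads):
    param -= 0.01 * grad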
def __init__(self, input_size=INPUT_SIZE, output_size=OUTPUT_SIZE, hidden_size=HIDDEN_SIZE,
             embed_size=EMBED_SIZE, lr=LEARNING_RATE, clip_grad=CLIP_GRAD, init_range=INIT_RANGE):
    # this model will generate a vector representation based on the input
    input_layers = [
        Embedding(input_size, embed_size, init_range),
        Lstm(embed_size, hidden_size, init_range),
    ]
    # this model will generate an output sequence based on the hidden vector
    output_layers = [
        Embedding(output_size, embed_size, init_range),
        Lstm(embed_size, hidden_size, init_range, previous=input_layers[1]),
        Softmax(hidden_size, output_size, init_range)
    ]
    self.input_layers, self.output_layers = input_layers, output_layers
    self.hidden_size = hidden_size
    self.embed_size = embed_size
    self.input_size = input_size
    self.output_size = output_size
    self.lr = lr
    self.clip_grad = clip_grad
def forward(self, xs: np.ndarray) -> np.ndarray:
    N, T = xs.shape
    V, D = self.W.shape
    out = np.empty((N, T, D), dtype=float)
    self.layers = []
    for t in range(T):
        layer = Embedding(self.W)
        out[:, t, :] = layer.forward(xs[:, t])
        self.layers.append(layer)
    return out
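# Sketch of the backward pass that would pair with the time-distributed forward
# above, assuming the wrapper also keeps a self.grads = [np.zeros_like(W)] buffer
# as in the usual "TimeEmbedding" formulation; dout has shape (N, T, D).
def backward(self, dout: np.ndarray) -> None:
    N, T, D = dout.shape
    grad = 0
    for t in range(T):
        layer = self.layers[t]
        layer.backward(dout[:, t, :])   # writes into layer.grads[0]
        grad += layer.grads[0]
    # every per-timestep layer shares the same weight matrix, so store the sum once
    self.grads[0][...] = grad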
def __init__(self, vocab_size, embedding_size, hidden_size):
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.embedding = Embedding(vocab_size, embedding_size)
    self.lstm = LSTM(embedding_size, hidden_size)
    self.layers = [self.embedding, self.lstm]
    self.params = list(
        itertools.chain(*[
            layer.params for layer in self.layers if hasattr(layer, 'params')
        ]))
def forward(self, idxs):
    w, = self.params
    N, T = idxs.shape
    V, D = w.shape  # vocabulary size, dimensionality of the word vectors
    self.layers = []
    ys = np.empty((N, T, D), dtype='f')
    for t in range(T):
        layer = Embedding(w)
        ys[:, t, :] = layer.forward(idxs[:, t])
        self.layers.append(layer)
    return ys
class Encoder(tf.keras.Model):
    def __init__(
            self
            , word_vocab_size
            , word_emb_dim
            , field_vocab_size
            , field_emb_dim
            , pos_vocab_size
            , pos_emb_dim
            , fglstm_dim):
        super(Encoder, self).__init__()
        self.embedding_layer = Embedding(
            word_vocab_size
            , word_emb_dim
            , field_vocab_size
            , field_emb_dim
            , pos_vocab_size
            , pos_emb_dim)
        self._field_pos_emb_dim = self.embedding_layer.get_output_shape()[1][2]
        self.cell = FieldGatingLSTMCell(
            fglstm_dim
            , word_emb_dim
            , self._field_pos_emb_dim)
        self.rnn = tf.keras.layers.RNN(
            self.cell
            , return_sequences=True
            , return_state=True)

    def get_field_pos_emb_dim(self):
        return self._field_pos_emb_dim

    def call(self, inputs):
        table_embeddings, field_pos_embeddings = self.embedding_layer(inputs)
        outputs, h, c = self.rnn((table_embeddings, field_pos_embeddings))
        return outputs, (h, c), field_pos_embeddings
def __init__(self, FLAGS):
    self.embeddingLayers = Embedding(FLAGS.vocab_size, FLAGS.embedding_dim)
    self.cnnGLUBlock = CnnGLUBlock(dropout_rate=FLAGS.dropout_rate,
                                   is_batch_norm=FLAGS.is_batch_norm,
                                   is_training=FLAGS.is_training,
                                   pad_format=FLAGS.pad_format)
def __init__(self, name='ra', nimg=2048, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] na = f.attrs['na'] nh = f.attrs['nh'] nw = f.attrs['nw'] nout = f.attrs['nout'] # npatch = f.attrs['npatch'] self.config = {'nimg': nimg, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding') # initialization mlp layer self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp') self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp') # attention layer self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nh, name=self.name+'@attention') # inputs cap = T.imatrix('cap') img = T.tensor3('img') self.inputs = [cap, img] # go through sequence feat = self.proj_mlp.compute(img) init_e = feat.mean(axis=1) init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1) (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None, None], non_sequences=[feat]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # these functions and variables are used in test stage self._init_func = None self._step_func = None self._proj_func = None self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX))
class Decoder(Sequential): def __init__(self, vocab_size, embedding_size, hidden_size, output_size): self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.output_size = output_size self.lstm = LSTM(embedding_size, hidden_size) self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh') self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax') self.embedding = Embedding(vocab_size, embedding_size) self.layers = [ self.lstm, self.lstm_output, self.softmax, self.embedding ] self.params = list( itertools.chain(*[ layer.params for layer in self.layers if hasattr(layer, 'params') ])) def forward(self, ec_H, ec_C, mask): (sens_size, batch_size) = T.shape(mask) def step(m, prev_Y, prev_H, prev_C): """Forward a time step of the decoder.""" # LSTM forward time step (H, C) = self.lstm.step(prev_Y, m, prev_H, prev_C) # LSTM output O = self.lstm_output.forward(H) # Apply softmax to LSTM output P = self.softmax.forward(O) # Make prediction one_hot_Y = T.argmax(P, axis=1) # Feed the output to the next time step Y = self.embedding.forward(one_hot_Y) # FIXME: Deal with differ length ? return (P, Y, H, C) results, updates = theano.scan(fn=step, sequences=[mask], outputs_info=[ None, dict(initial=T.zeros( (batch_size, self.embedding_size)), taps=[-1]), dict(initial=ec_H, taps=[-1]), dict(initial=ec_C, taps=[-1]) ]) # return np.swapaxes(results[0], 0, 1) # returns the softmax probabilities return results[0]
def __init__(self, vocab_size, embedding_size, hidden_size):
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.embedding = Embedding(vocab_size, embedding_size)
    self.lstm = LSTM(embedding_size, hidden_size)
    self.layers = [self.embedding, self.lstm]
    self.params = list(itertools.chain(*[layer.params for layer in self.layers if hasattr(layer, 'params')]))
def create_output_node(model=None, input_sequences=None, num_gru=None,
                       old_h0s=None, reset=None, num_pixelCNN_layer=None):
    assert model is not None
    assert input_sequences is not None
    assert num_gru is not None
    assert old_h0s is not None
    assert reset is not None
    assert num_pixelCNN_layer is not None

    new_h0s = T.zeros_like(old_h0s)
    h0s = theano.ifelse.ifelse(reset, new_h0s, old_h0s)

    embedding_layer = Embedding(Q_LEVELS, DIM, input_sequences,
                                name=model.name + "Embedding.Q_LEVELS")
    model.add_layer(embedding_layer)

    prev_out = embedding_layer.output()
    last_layer = WrapperLayer(prev_out.reshape((prev_out.shape[0], prev_out.shape[1], WIDTH, DEPTH)))

    pixel_CNN = pixelConv(
        last_layer,
        DEPTH,
        DEPTH,
        name=model.name + ".pxCNN",
        num_layers=NUM_PIXEL_CNN_LAYER
    )
    prev_out = pixel_CNN.output()
    last_layer = WrapperLayer(prev_out.reshape((prev_out.shape[0], prev_out.shape[1], -1)))

    last_hidden_list = []
    for i in range(num_gru):
        gru_layer = GRU(DIM, DIM, last_layer, s0=h0s[i, :, :],
                        name=model.name + "GRU_{}".format(i))
        last_hidden_list.append(gru_layer.output()[:, -1])
        model.add_layer(gru_layer)
        last_layer = gru_layer

    fc1 = FC(DIM, Q_LEVELS, last_layer, name=model.name + "FullyConnected")
    model.add_layer(fc1)

    softmax = Softmax(fc1, name=model.name + "Softmax")
    model.add_layer(softmax)

    return softmax.output(), T.stack(last_hidden_list, axis=0)
class EmbeddingDot:
    def __init__(self, w):
        self.embed = Embedding(w)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h, idx):
        w_idx = self.embed.forward(idx)
        s = np.sum(h * w_idx, axis=1)
        self.cache = (h, w_idx)
        return s

    def backward(self, ds):
        # reshape to a column vector so it broadcasts over the embedding dimension
        ds = ds.reshape(ds.shape[0], 1)
        h, w_idx = self.cache
        dw_idx = ds * h
        self.embed.backward(dw_idx)
        dh = ds * w_idx
        return dh
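# Toy usage of EmbeddingDot, assuming the Embedding layer above is the usual
# row-lookup layer (params = [W], forward(idx) returns W[idx]); the arrays below
# are dummy values chosen only to show the shapes involved.
W = np.random.randn(7, 4).astype('f')   # (vocab_size, hidden_size)
layer = EmbeddingDot(W)

h = np.random.randn(3, 4).astype('f')   # hidden vectors for a batch of 3
idx = np.array([0, 3, 1])               # target word id per example

scores = layer.forward(h, idx)          # shape (3,): one dot product per example
dh = layer.backward(np.ones(3, dtype='f'))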
def __init__(self, vocab_size, hidden_size, window_size, corpus):
    # weights
    w_in = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')
    w_out = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')

    # layers
    self.embed_layer = Embedding(w_in)
    self.ns_loss_layers = [
        NegativeSamplingLoss(w_out, corpus) for _ in range(2 * window_size)
    ]

    # collect all weights and gradients
    layers = [self.embed_layer] + self.ns_loss_layers
    self.params, self.grads = [], []
    for l in layers:
        self.params += l.params
        self.grads += l.grads

    # distributed representations of the words
    self.word_vecs = w_in
def __init__(self, name='ss', nimg=2048, nh=512, nw=512, nout=8843, ns=80, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] nh = f.attrs['nh'] nw = f.attrs['nw'] ns = f.attrs['ns'] nout = f.attrs['nout'] self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout, 'ns': ns} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding') # initialization mlp layer self.proj_mlp = MLP(layer_sizes=[nimg, 2*nh], output_type='tanh', name=self.name+'@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=nw+ns, dim_h=nh, name=self.name+'@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp') # inputs cap = T.imatrix('cap') img = T.matrix('img') scene = T.matrix('scene') self.inputs = [cap, img, scene] # go through sequence init_state = self.proj_mlp.compute(img) (state, self.p, loss), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None], non_sequences=[scene]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # initialization for test stage self._init_func = None self._step_func = None self._scene_shared = theano.shared(np.zeros((1, ns)).astype(theano.config.floatX))
def __init__(
        self
        , word_vocab_size
        , word_emb_dim
        , field_vocab_size
        , field_emb_dim
        , pos_vocab_size
        , pos_emb_dim
        , fglstm_dim):
    super(Encoder, self).__init__()
    self.embedding_layer = Embedding(
        word_vocab_size
        , word_emb_dim
        , field_vocab_size
        , field_emb_dim
        , pos_vocab_size
        , pos_emb_dim)
    self._field_pos_emb_dim = self.embedding_layer.get_output_shape()[1][2]
    self.cell = FieldGatingLSTMCell(
        fglstm_dim
        , word_emb_dim
        , self._field_pos_emb_dim)
    self.rnn = tf.keras.layers.RNN(
        self.cell
        , return_sequences=True
        , return_state=True)
def __init__(self, num_layers, d_model, num_heads, dff, vocab_size, batch_size, rate=0.1, use_stats=False):
    super(Transformer, self).__init__()
    self.num_layers = num_layers
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.model_depth = d_model
    self.num_heads = num_heads
    self.embedding = Embedding(vocab_size, d_model)
    self.encoder = Encoder(num_layers, d_model, num_heads, dff, vocab_size, rate)
    self.decoder = Decoder(num_layers, d_model, num_heads, dff, vocab_size, rate, use_stats)
    self.final_layer = tf.keras.layers.Dense(vocab_size)
def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.lstm = LSTM(embedding_size, hidden_size)
    self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh')
    self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax')
    self.embedding = Embedding(vocab_size, embedding_size)
    self.layers = [self.lstm, self.lstm_output, self.softmax, self.embedding]
    self.params = list(itertools.chain(*[layer.params for layer in self.layers if hasattr(layer, 'params')]))
class EmbeddingDot:
    def __init__(self, W: np.ndarray) -> None:
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h: np.ndarray, idx: List[int]):
        target_W = self.embed.forward(idx)
        out = np.sum(target_W * h, axis=1)
        self.cache = (h, target_W)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)
        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        dh = dout * target_W
        return dh
def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.lstm = LSTM(embedding_size, hidden_size)
    self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh')
    self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax')
    self.embedding = Embedding(vocab_size, embedding_size)
    self.layers = [
        self.lstm, self.lstm_output, self.softmax, self.embedding
    ]
    self.params = list(
        itertools.chain(*[
            layer.params for layer in self.layers if hasattr(layer, 'params')
        ]))
def _def_layers(self):
    # word embeddings
    self.word_embedding = Embedding(embedding_size=self.embedding_size,
                                    vocab_size=self.word_vocab_size,
                                    enable_cuda=self.enable_cuda)

    # lstm encoder
    self.encoder = FastUniLSTM(
        ninp=self.embedding_size,
        nhids=self.encoder_rnn_hidden_size,
        dropout_between_rnn_layers=self.dropout_between_rnn_layers)

    self.action_scorer_shared = torch.nn.Linear(
        self.encoder_rnn_hidden_size[-1], self.action_scorer_hidden_dim)
    action_scorers = []
    for _ in range(self.generate_length):
        action_scorers.append(
            torch.nn.Linear(self.action_scorer_hidden_dim, self.word_vocab_size, bias=False))
    self.action_scorers = torch.nn.ModuleList(action_scorers)
    self.fake_recurrent_mask = None
class Encoder(Sequential):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.embedding = Embedding(vocab_size, embedding_size)
        self.lstm = LSTM(embedding_size, hidden_size)
        self.layers = [self.embedding, self.lstm]
        self.params = list(itertools.chain(*[layer.params for layer in self.layers if hasattr(layer, 'params')]))

    def forward(self, batch, mask):
        # ``batch`` is a matrix whose rows are sentences, e.g. x = [1, 4, 5, 2, 0]
        # ``emb`` is a list of embedding matrices, emb[i].shape = (sent_size, embedding_size)
        emb = self.embedding.forward(batch)
        (H, C) = self.lstm.forward(emb, mask)
        return (H[-1], C[-1])
class Decoder(Sequential): def __init__(self, vocab_size, embedding_size, hidden_size, output_size): self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.output_size = output_size self.lstm = LSTM(embedding_size, hidden_size) self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh') self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax') self.embedding = Embedding(vocab_size, embedding_size) self.layers = [self.lstm, self.lstm_output, self.softmax, self.embedding] self.params = list(itertools.chain(*[layer.params for layer in self.layers if hasattr(layer, 'params')])) def forward(self, ec_H, ec_C, mask): (sens_size, batch_size) = T.shape(mask) def step(m, prev_Y, prev_H, prev_C): """Forward a time step of the decoder.""" # LSTM forward time step (H, C) = self.lstm.step(prev_Y, m, prev_H, prev_C) # LSTM output O = self.lstm_output.forward(H) # Apply softmax to LSTM output P = self.softmax.forward(O) # Make prediction one_hot_Y = T.argmax(P, axis=1) # Feed the output to the next time step Y = self.embedding.forward(one_hot_Y) # FIXME: Deal with differ length ? return (P, Y, H, C) results, updates = theano.scan( fn=step, sequences=[mask], outputs_info=[ None, dict(initial=T.zeros((batch_size, self.embedding_size)), taps=[-1]), dict(initial=ec_H, taps=[-1]), dict(initial=ec_C, taps=[-1]) ] ) # return np.swapaxes(results[0], 0, 1) # returns the softmax probabilities return results[0]
def __init__(self, vocab_size: int, hidden_size: int, window_size: int, corpus: List[int]) -> None:
    W_in = 0.01 * np.random.randn(vocab_size, hidden_size).astype(float)
    W_out = 0.01 * np.random.randn(vocab_size, hidden_size).astype(float)

    self.in_layers = []
    for _ in range(2 * window_size):
        layer = Embedding(W_in)
        self.in_layers.append(layer)
    self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

    layers = self.in_layers + [self.ns_loss]
    self.params, self.grads = [], []
    for layer in layers:
        self.params += layer.params
        self.grads += layer.grads

    self.word_vecs = W_in
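# Hedged sketch of the forward/backward passes that usually accompany this CBOW
# __init__: average the context embeddings, then apply the negative-sampling loss.
# Only the layer attributes defined above are taken from the snippet; everything
# else here is an assumption about the surrounding class.
def forward(self, contexts: np.ndarray, target: np.ndarray) -> float:
    h = 0
    for i, layer in enumerate(self.in_layers):
        h += layer.forward(contexts[:, i])
    h *= 1 / len(self.in_layers)          # average over the 2 * window_size contexts
    loss = self.ns_loss.forward(h, target)
    return loss

def backward(self, dout: float = 1) -> None:
    dh = self.ns_loss.backward(dout)
    dh *= 1 / len(self.in_layers)
    for layer in self.in_layers:
        layer.backward(dh)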
def __init__(self, config):
    super(AESModel, self).__init__()
    self.config = config
    self.e0 = Embedding(config.vocab_size, config.embedding_output, config)
    self.m0 = Modeling(config.embedding_output, config.hidden_size, config)
    self.a0 = Attn(2 * config.hidden_size, 2 * config.hidden_size,
                   config.max_length_sent, config, dropout_p=config.dropout)
    self.m1 = Modeling(4 * config.hidden_size, config.hidden_size, config)
    self.a1 = Attn(2 * config.hidden_size, 2 * config.hidden_size,
                   config.max_length_sent, config, dropout_p=config.dropout)
    self.m2 = Modeling(4 * config.hidden_size, config.hidden_size, config)
    # self.m2 = Modeling(config.hidden_size, config.hidden_size, config)
    self.o0 = Output(
        2 * config.hidden_size * config.max_length_sent * config.max_num_sent,
        config)
class Encoder(Sequential):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.embedding = Embedding(vocab_size, embedding_size)
        self.lstm = LSTM(embedding_size, hidden_size)
        self.layers = [self.embedding, self.lstm]
        self.params = list(
            itertools.chain(*[
                layer.params for layer in self.layers if hasattr(layer, 'params')
            ]))

    def forward(self, batch, mask):
        # ``batch`` is a matrix whose rows are sentences, e.g. x = [1, 4, 5, 2, 0]
        # ``emb`` is a list of embedding matrices, emb[i].shape = (sent_size, embedding_size)
        emb = self.embedding.forward(batch)
        (H, C) = self.lstm.forward(emb, mask)
        return (H[-1], C[-1])
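# How the Encoder above and the Decoder defined earlier might be wired together.
# The Seq2Seq wrapper and its mask arguments are assumptions for illustration;
# only Encoder.forward and Decoder.forward come from the snippets in this file.
class Seq2Seq:
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        self.encoder = Encoder(vocab_size, embedding_size, hidden_size)
        self.decoder = Decoder(vocab_size, embedding_size, hidden_size, output_size)

    def forward(self, src_batch, src_mask, tgt_mask):
        # encode the source sentences into the final hidden/cell states ...
        ec_H, ec_C = self.encoder.forward(src_batch, src_mask)
        # ... and decode a softmax distribution for each target time step
        return self.decoder.forward(ec_H, ec_C, tgt_mask)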
class LSTM_DQN(torch.nn.Module): model_name = 'lstm_dqn' def __init__(self, model_config, word_vocab, generate_length=5, enable_cuda=False): super(LSTM_DQN, self).__init__() self.model_config = model_config self.enable_cuda = enable_cuda self.word_vocab_size = len(word_vocab) self.id2word = word_vocab self.generate_length = generate_length self.read_config() self._def_layers() self.init_weights() # self.print_parameters() def print_parameters(self): amount = 0 for p in self.parameters(): amount += np.prod(p.size()) print("total number of parameters: %s" % (amount)) parameters = filter(lambda p: p.requires_grad, self.parameters()) amount = 0 for p in parameters: amount += np.prod(p.size()) print("number of trainable parameters: %s" % (amount)) def read_config(self): # model config self.embedding_size = self.model_config['embedding_size'] self.encoder_rnn_hidden_size = self.model_config[ 'encoder_rnn_hidden_size'] self.action_scorer_hidden_dim = self.model_config[ 'action_scorer_hidden_dim'] self.dropout_between_rnn_layers = self.model_config[ 'dropout_between_rnn_layers'] def _def_layers(self): # word embeddings self.word_embedding = Embedding(embedding_size=self.embedding_size, vocab_size=self.word_vocab_size, enable_cuda=self.enable_cuda) # lstm encoder self.encoder = FastUniLSTM( ninp=self.embedding_size, nhids=self.encoder_rnn_hidden_size, dropout_between_rnn_layers=self.dropout_between_rnn_layers) self.action_scorer_shared = torch.nn.Linear( self.encoder_rnn_hidden_size[-1], self.action_scorer_hidden_dim) action_scorers = [] for _ in range(self.generate_length): action_scorers.append( torch.nn.Linear(self.action_scorer_hidden_dim, self.word_vocab_size, bias=False)) self.action_scorers = torch.nn.ModuleList(action_scorers) self.fake_recurrent_mask = None def init_weights(self): torch.nn.init.xavier_uniform_(self.action_scorer_shared.weight.data) for i in range(len(self.action_scorers)): torch.nn.init.xavier_uniform_(self.action_scorers[i].weight.data) self.action_scorer_shared.bias.data.fill_(0) def representation_generator(self, _input_words): embeddings, mask = self.word_embedding.forward( _input_words) # batch x time x emb encoding_sequence, _, _ = self.encoder.forward( embeddings, mask) # batch x time x h mean_encoding = masked_mean(encoding_sequence, mask) # batch x h return mean_encoding def action_scorer(self, state_representation): hidden = self.action_scorer_shared.forward( state_representation) # batch x hid hidden = F.relu(hidden) # batch x hid action_ranks = [] for i in range(len(self.action_scorers)): action_ranks.append( self.action_scorers[i].forward(hidden)) # batch x n_vocab return action_ranks
def __init__(self, w):
    self.embed = Embedding(w)
    self.params = self.embed.params
    self.grads = self.embed.grads
    self.cache = None
class Model(object): """ Region Attention model """ def __init__(self, name='ra', nimg=2048, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] na = f.attrs['na'] nh = f.attrs['nh'] nw = f.attrs['nw'] nout = f.attrs['nout'] # npatch = f.attrs['npatch'] self.config = {'nimg': nimg, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding') # initialization mlp layer self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp') self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp') # attention layer self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nh, name=self.name+'@attention') # inputs cap = T.imatrix('cap') img = T.tensor3('img') self.inputs = [cap, img] # go through sequence feat = self.proj_mlp.compute(img) init_e = feat.mean(axis=1) init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1) (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None, None], non_sequences=[feat]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # these functions and variables are used in test stage self._init_func = None self._step_func = None self._proj_func = None self._feat_shared = theano.shared(np.zeros((1, npatch, na)).astype(theano.config.floatX)) def compute(self, state, w_idx, feat): # word embedding word_vec = self.embedding.compute(w_idx) # split states e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])]) # attention e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1)) # lstm step e_w = T.concatenate([e_t, word_vec], axis=-1) c_t, h_t = self.lstm.compute(e_w, c_tm1, h_tm1) # (mb,nh) # merge state new_state = T.concatenate([e_t, c_t, h_t], axis=-1) # predict word probability p = self.pred_mlp.compute(T.concatenate([e_t, h_t, word_vec], axis=-1)) return new_state, p, alpha def scan_func(self, w_tm1, w_t, state, feat): # update state new_state, p, alpha = self.compute(state, w_tm1, feat) # cross-entropy loss loss = T.nnet.categorical_crossentropy(p, w_t) return new_state, p, loss, alpha def init_func(self, img_value): if self._proj_func is None: img = T.tensor3() self._proj_func = theano.function([img], self.proj_mlp.compute(img)) if self._init_func is None: init_e = self._feat_shared.mean(axis=1) init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1) self._init_func = theano.function([], init_state) self._feat_shared.set_value(self._proj_func(img_value)) return self._init_func() def step_func(self, state_value, w_value): if self._step_func is None: w = T.ivector() state = T.matrix() new_state, p, _ = self.compute(state, w, self._feat_shared) self._step_func = theano.function([state, w], [new_state, 
T.log(p)]) return self._step_func(state_value, w_value) def save_to_dir(self, save_dir, idx): save_file = osp.join(save_dir, self.name+'.h5.'+str(idx)) for l in self.layers: l.save_weights(save_file) with h5py.File(save_file) as f: for k, v in self.config.items(): f.attrs[k] = v def load_weights(self, model_file): for l in self.layers: l.load_weights(model_file)
def _def_layers(self): # word embeddings if self.use_pretrained_embedding: self.word_embedding = Embedding(embedding_size=self.word_embedding_size, vocab_size=self.word_vocab_size, id2word=self.word_vocab, dropout_rate=self.embedding_dropout, load_pretrained=True, trainable=self.word_embedding_trainable, embedding_oov_init="random", pretrained_embedding_path=self.pretrained_embedding_path) else: self.word_embedding = Embedding(embedding_size=self.word_embedding_size, vocab_size=self.word_vocab_size, trainable=self.word_embedding_trainable, dropout_rate=self.embedding_dropout) # node embeddings self.node_embedding = Embedding(embedding_size=self.node_embedding_size, vocab_size=self.node_vocab_size, trainable=self.node_embedding_trainable, dropout_rate=self.embedding_dropout) # relation embeddings self.relation_embedding = Embedding(embedding_size=self.relation_embedding_size, vocab_size=self.relation_vocab_size, trainable=self.relation_embedding_trainable, dropout_rate=self.embedding_dropout) self.word_embedding_prj = torch.nn.Linear(self.word_embedding_size, self.block_hidden_dim, bias=False) self.encoder = torch.nn.ModuleList([EncoderBlock(conv_num=self.encoder_conv_num, ch_num=self.block_hidden_dim, k=5, block_hidden_dim=self.block_hidden_dim, n_head=self.n_heads, dropout=self.block_dropout) for _ in range(self.encoder_layers)]) self.rgcns = StackedRelationalGraphConvolution(entity_input_dim=self.node_embedding_size+self.block_hidden_dim, relation_input_dim=self.relation_embedding_size+self.block_hidden_dim, num_relations=self.relation_vocab_size, hidden_dims=self.gcn_hidden_dims, num_bases=self.gcn_num_bases, use_highway_connections=self.gcn_highway_connections, dropout_rate=self.dropout, real_valued_graph=self.real_valued_graph) self.attention = CQAttention(block_hidden_dim=self.block_hidden_dim, dropout=self.attention_dropout) self.attention_prj = torch.nn.Linear(self.block_hidden_dim * 4, self.block_hidden_dim, bias=False) self.self_attention_text = SelfAttention(self.block_hidden_dim, self.n_heads, self.dropout) self.self_attention_graph = SelfAttention(self.block_hidden_dim, self.n_heads, self.dropout) # recurrent memories self.recurrent_memory_bi_input = LSTMCell(self.block_hidden_dim * 2, self.block_hidden_dim, use_bias=True) self.recurrent_memory_single_input = LSTMCell(self.block_hidden_dim, self.block_hidden_dim, use_bias=True) linear_function = NoisyLinear if self.noisy_net else torch.nn.Linear self.action_scorer_linear_1_tri_input = linear_function(self.block_hidden_dim * 3, self.block_hidden_dim) self.action_scorer_linear_1_bi_input = linear_function(self.block_hidden_dim * 2, self.block_hidden_dim) self.action_scorer_linear_2 = linear_function(self.block_hidden_dim, 1) # text encoder for pretraining tasks # (we separate this because we don't want to init text encoder with pretrained parameters when training RL) self.encoder_for_pretraining_tasks = torch.nn.ModuleList([EncoderBlock(conv_num=self.encoder_conv_num, ch_num=self.block_hidden_dim, k=5, block_hidden_dim=self.block_hidden_dim, n_head=self.n_heads, dropout=self.block_dropout) for _ in range(self.encoder_layers)]) # command generation self.cmd_gen_attention = CQAttention(block_hidden_dim=self.block_hidden_dim, dropout=self.attention_dropout) self.cmd_gen_attention_prj = torch.nn.Linear(self.block_hidden_dim * 4, self.block_hidden_dim, bias=False) self.decoder = torch.nn.ModuleList([DecoderBlock(ch_num=self.block_hidden_dim, k=5, block_hidden_dim=self.block_hidden_dim, n_head=self.n_heads, dropout=self.block_dropout) 
for _ in range(self.decoder_layers)]) self.tgt_word_prj = torch.nn.Linear(self.block_hidden_dim, self.word_vocab_size, bias=False) self.pointer_softmax = PointerSoftmax(input_dim=self.block_hidden_dim, hidden_dim=self.block_hidden_dim) # observation generation self.obs_gen_attention = CQAttention(block_hidden_dim=self.block_hidden_dim, dropout=self.attention_dropout) self.obs_gen_attention_prj = torch.nn.Linear(self.block_hidden_dim * 4, self.block_hidden_dim, bias=False) self.obs_gen_decoder = torch.nn.ModuleList([DecoderBlockForObsGen(ch_num=self.block_hidden_dim, k=5, block_hidden_dim=self.block_hidden_dim, n_head=self.n_heads, dropout=self.block_dropout) for _ in range(self.decoder_layers)]) self.obs_gen_tgt_word_prj = torch.nn.Linear(self.block_hidden_dim, self.word_vocab_size, bias=False) self.obs_gen_linear_1 = torch.nn.Linear(self.block_hidden_dim, self.block_hidden_dim) self.obs_gen_linear_2 = torch.nn.Linear(self.block_hidden_dim, int(len(self.relation_vocab) / 2) * len(self.node_vocab) * len(self.node_vocab)) self.obs_gen_attention_to_rnn_input = torch.nn.Linear(self.block_hidden_dim * 4, self.block_hidden_dim) self.obs_gen_graph_rnncell = torch.nn.GRUCell(self.block_hidden_dim, self.block_hidden_dim) self.observation_discriminator = ObservationDiscriminator(self.block_hidden_dim) # action prediction self.ap_attention = CQAttention(block_hidden_dim=self.block_hidden_dim, dropout=self.attention_dropout) self.ap_attention_prj = torch.nn.Linear(self.block_hidden_dim * 4, self.block_hidden_dim, bias=False) self.ap_self_attention = SelfAttention(self.block_hidden_dim * 3, self.n_heads, self.dropout) self.ap_linear_1 = torch.nn.Linear(self.block_hidden_dim * 3, self.block_hidden_dim) self.ap_linear_2 = torch.nn.Linear(self.block_hidden_dim, 1) # state prediction self.sp_attention = CQAttention(block_hidden_dim=self.block_hidden_dim, dropout=self.attention_dropout) self.sp_attention_prj = torch.nn.Linear(self.block_hidden_dim * 4, self.block_hidden_dim, bias=False) self.sp_self_attention = SelfAttention(self.block_hidden_dim * 3, self.n_heads, self.dropout) self.sp_linear_1 = torch.nn.Linear(self.block_hidden_dim * 3, self.block_hidden_dim) self.sp_linear_2 = torch.nn.Linear(self.block_hidden_dim, 1) # deep graph infomax self.dgi_discriminator = DGIDiscriminator(self.gcn_hidden_dims[-1])
import numpy as np

from tensor import Tensor
from layers import Embedding
from rnn import RNNCell
from losses import CrossEntropyLoss
from optimizers import SGD

with open('data/shakespear.txt', 'r') as f:
    raw = f.read()

vocab = list(set(raw))
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
indices = np.array(list(map(lambda x: word2index[x], raw)))

embed = Embedding(vocab_size=len(vocab), dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.01)

batch_size = 32
bptt = 16
n_batches = int((indices.shape[0] / batch_size))
trimmed_indices = indices[:n_batches * batch_size]
# batched_indices: each column represents a sub-sequence from indices -> continuous
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()
input_batched_indices = batched_indices[:-1]
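# A possible continuation of the batching code above (not from the original
# snippet): the target indices are the inputs shifted by one step, and the data
# is cut into truncated-BPTT windows of length ``bptt`` for training.
target_batched_indices = batched_indices[1:]

n_bptt = int((n_batches - 1) / bptt)
input_batches = input_batched_indices[:n_bptt * bptt].reshape(n_bptt, bptt, batch_size)
target_batches = target_batched_indices[:n_bptt * bptt].reshape(n_bptt, bptt, batch_size)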
d_vocab_size = len(d_w2i) x = tf.placeholder(tf.int32, [None, None], name='x') m = tf.cast(tf.not_equal(x, -1), tf.float32) t = tf.placeholder(tf.int32, [None, None], name='t') t_in = t[:, :-1] t_out = t[:, 1:] t_out_one_hot = tf.one_hot(t_out, depth=d_vocab_size, dtype=tf.float32) # Attention mask ma = tf.where(condition=tf.equal(x, PADDING_ID), x=tf.ones_like(x, dtype=tf.float32) * np.float32(-1e+10), y=tf.ones_like(x, dtype=tf.float32)) encoder = [ Embedding(e_vocab_size, EMB_DIM), GRU(EMB_DIM, HID_DIM, m), GRU(EMB_DIM, HID_DIM, m[:, ::-1]) ] x_emb = f_props(encoder[:1], x) h_ef = f_props(encoder[1:2], x_emb) h_eb = f_props(encoder[2:], x_emb[:, ::-1])[:, ::-1, :] h_e = tf.concat([h_ef, h_eb], axis=2) h_d1_0 = tf.reduce_mean(h_e, axis=1) h_d2_0 = tf.reduce_mean(h_e, axis=1) decoder = [ Embedding(d_vocab_size, EMB_DIM), GRU(EMB_DIM, 2 * HID_DIM, tf.ones_like(t_in, dtype='float32'), h_0=h_d1_0), Attention(2 * HID_DIM, 2 * HID_DIM, h_e, ma),
def __init__(self, args): super(QAxl, self).__init__() hidden_size = args['hidden_size'] dropout = args['dropout'] attention_size = args['attention_size'] word_emb = np.array(read_json(args['data_dir'] + 'word_emb.json'), dtype=np.float32) word_size = word_emb.shape[0] word_dim = word_emb.shape[1] char_dim = args['char_dim'] char_len = len(read_json(args['data_dir'] + 'char2id.json')) pos_dim = args['pos_dim'] ner_dim = args['ner_dim'] self.args = args self.train_loss = AverageMeter() self.use_cuda = args['use_cuda'] self.use_xl = args['use_xl'] if self.use_xl: self.xl = TransfoXLModel.from_pretrained('transfo-xl-wt103') xl_dim = 1024 ## Embedding Layer print('Building embedding...') self.word_embeddings = nn.Embedding(word_emb.shape[0], word_dim, padding_idx=0) self.word_embeddings.weight.data = torch.from_numpy(word_emb) self.char_embeddings = nn.Embedding(char_len, char_dim, padding_idx=0) self.pos_embeddings = nn.Embedding(args['pos_size'], args['pos_dim'], padding_idx=0) self.ner_embeddings = nn.Embedding(args['ner_size'], args['ner_dim'], padding_idx=0) with open(args['data_dir'] + 'tune_word_idx.pkl', 'rb') as f: tune_idx = pkl.load(f) self.fixed_idx = list( set([i for i in range(word_size)]) - set(tune_idx)) fixed_embedding = torch.from_numpy(word_emb)[self.fixed_idx] self.register_buffer('fixed_embedding', fixed_embedding) self.fixed_embedding = fixed_embedding low_p_dim = word_dim + word_dim + args['pos_dim'] + args['ner_dim'] + 4 low_q_dim = word_dim + args['pos_dim'] + args['ner_dim'] if self.use_xl: low_p_dim += xl_dim low_q_dim += xl_dim self.emb_char = Embedding(word_dim, char_dim, hidden_size) ## Forward Layers Declaration high_p_dim = 2 * hidden_size full_q_dim = 2 * high_p_dim attention_dim = word_dim + full_q_dim if self.use_xl: attention_dim += xl_dim self.word_attention_layer = WordAttention(word_dim, attention_size, dropout) self.low_rnn = StackedPaddedRNN(low_p_dim, hidden_size, 1, dropout=dropout) self.high_rnn = StackedPaddedRNN(high_p_dim, hidden_size, 1, dropout=dropout) self.full_rnn = StackedPaddedRNN(full_q_dim, hidden_size, 1, dropout=dropout) self.low_attention_layer = MultiAttention(attention_dim, attention_size, dropout) self.high_attention_layer = MultiAttention(attention_dim, attention_size, dropout) self.full_attention_layer = MultiAttention(attention_dim, attention_size, dropout) ## Fusion Layer and Final Attention + Final RNN fuse_dim = 10 * hidden_size self_attention_dim = 12 * hidden_size + word_dim + ner_dim + pos_dim + 1 if self.use_xl: self_attention_dim += xl_dim self.fuse_rnn = StackedPaddedRNN(fuse_dim, hidden_size, 1, dropout=dropout) self.self_attention_layer = MultiAttention(self_attention_dim, attention_size, dropout) self.self_rnn = StackedPaddedRNN(4 * hidden_size, hidden_size, 1, dropout=dropout) ## Verifier and output self.summ_layer = PointerS(2 * hidden_size, dropout=dropout, use_cuda=self.use_cuda) self.summ_layer2 = PointerS(2 * hidden_size, dropout=dropout, use_cuda=self.use_cuda) self.pointer_layer = PointerNet(2 * hidden_size, use_cuda=self.use_cuda) self.has_ans = nn.Sequential(nn.Dropout(p=dropout), nn.Linear(6 * hidden_size, 2))
class Model(object): """ Region Attention model """ def __init__(self, name='ra', nimg=2048, nnh=512, na=512, nh=512, nw=512, nout=8843, npatch=30, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] nnh = f.attrs['nnh'] na = f.attrs['na'] nh = f.attrs['nh'] nw = f.attrs['nw'] nout = f.attrs['nout'] # npatch = f.attrs['npatch'] self.config = {'nimg': nimg, 'nnh': nnh, 'na': na, 'nh': nh, 'nw': nw, 'nout': nout, 'npatch': npatch} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding') # initialization mlp layer self.init_mlp = MLP(layer_sizes=[na, 2*nh], output_type='tanh', name=self.name+'@init_mlp') self.proj_mlp = MLP(layer_sizes=[nimg, na], output_type='tanh', name=self.name+'@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=na+nw, dim_h=nh, name=self.name+'@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[na+nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp') # attention layer self.attention = Attention(dim_item=na, dim_context=na+nw+nh, hsize=nnh, name=self.name+'@attention') # inputs cap = T.imatrix('cap') img = T.tensor3('img') self.inputs = [cap, img] # go through sequence feat = self.proj_mlp.compute(img) init_e = feat.mean(axis=1) init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1) (state, self.p, loss, self.alpha), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None, None], non_sequences=[feat]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.init_mlp, self.proj_mlp, self.attention, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # these functions and variables are used in test stage self._init_func = None self._step_func = None self._proj_func = None self._feat_shared = theano.shared(np.zeros((1, npatch, nimg)).astype(theano.config.floatX)) def compute(self, state, w_idx, feat): # word embedding word_vec = self.embedding.compute(w_idx) # split states e_tm1, c_tm1, h_tm1 = split_state(state, scheme=[(1, self.config['na']), (2, self.config['nh'])]) # attention e_t, alpha = self.attention.compute(feat, T.concatenate([e_tm1, h_tm1, word_vec], axis=1)) # lstm step e_w = T.concatenate([e_t, word_vec], axis=-1) c_t, h_t = self.lstm.compute(e_w, c_tm1, h_tm1) # (mb,nh) # merge state new_state = T.concatenate([e_t, c_t, h_t], axis=-1) # predict word probability p = self.pred_mlp.compute(T.concatenate([e_t, h_t, word_vec], axis=-1)) return new_state, p, alpha def scan_func(self, w_tm1, w_t, state, feat): # update state new_state, p, alpha = self.compute(state, w_tm1, feat) # cross-entropy loss loss = T.nnet.categorical_crossentropy(p, w_t) return new_state, p, loss, alpha def init_func(self, img_value): if self._proj_func is None: img = T.tensor3() self._proj_func = theano.function([img], self.proj_mlp.compute(img)) if self._init_func is None: init_e = self._feat_shared.mean(axis=1) init_state = T.concatenate([init_e, self.init_mlp.compute(init_e)], axis=-1) self._init_func = theano.function([], init_state) self._feat_shared.set_value(self._proj_func(img_value)) return self._init_func() def step_func(self, state_value, w_value): if self._step_func is None: w = T.ivector() state = T.matrix() new_state, p, _ = self.compute(state, w, self._feat_shared) self._step_func = 
theano.function([state, w], [new_state, T.log(p)]) return self._step_func(state_value, w_value) def save_to_dir(self, save_dir, idx): save_file = osp.join(save_dir, self.name+'.h5.'+str(idx)) for l in self.layers: l.save_weights(save_file) with h5py.File(save_file) as f: for k, v in self.config.items(): f.attrs[k] = v def load_weights(self, model_file): for l in self.layers: l.load_weights(model_file)
def __init__(self, name='gnic', nimg=2048, nh=512, nw=512, nout=8843, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] nh = f.attrs['nh'] nw = f.attrs['nw'] nout = f.attrs['nout'] self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name + '@embedding') # initialization mlp layer self.proj_mlp = MLP(layer_sizes=[nimg, 2 * nh], output_type='tanh', name=self.name + '@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=nw, dim_h=nh, name=self.name + '@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[nh + nw, nout], output_type='softmax', name=self.name + '@pred_mlp') # inputs cap = T.imatrix('cap') img = T.matrix('img') self.inputs = [cap, img] # go through sequence init_state = self.proj_mlp.compute(img) (state, self.p, loss), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # these functions are used in test stage self._init_func = None self._step_func = None
class Model(object): """ scene-specific contexts """ def __init__(self, name='ss', nimg=2048, nh=512, nw=512, nout=8843, ns=80, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] nh = f.attrs['nh'] nw = f.attrs['nw'] ns = f.attrs['ns'] nout = f.attrs['nout'] self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout, 'ns': ns} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name+'@embedding') # initialization mlp layer self.proj_mlp = MLP(layer_sizes=[nimg, 2*nh], output_type='tanh', name=self.name+'@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=nw+ns, dim_h=nh, name=self.name+'@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[nh+nw, nout], output_type='softmax', name=self.name+'@pred_mlp') # inputs cap = T.imatrix('cap') img = T.matrix('img') scene = T.matrix('scene') self.inputs = [cap, img, scene] # go through sequence init_state = self.proj_mlp.compute(img) (state, self.p, loss), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None], non_sequences=[scene]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # initialization for test stage self._init_func = None self._step_func = None self._scene_shared = theano.shared(np.zeros((1, ns)).astype(theano.config.floatX)) def compute(self, state, w_idx, scene): # word embedding word_vec = self.embedding.compute(w_idx) # split states c_tm1, h_tm1 = split_state(state, scheme=[(2, self.config['nh'])]) # lstm step w_s = T.concatenate([word_vec, scene], axis=1) c_t, h_t = self.lstm.compute(w_s, c_tm1, h_tm1) # merge state new_state = T.concatenate([c_t, h_t], axis=-1) # add w_{t-1} as feature h_and_w = T.concatenate([h_t, word_vec], axis=-1) # predict probability p = self.pred_mlp.compute(h_and_w) return new_state, p def scan_func(self, w_tm1, w_t, state, scene): # update state new_state, p = self.compute(state, w_tm1, scene) # cross-entropy loss loss = T.nnet.categorical_crossentropy(p, w_t) return new_state, p, loss def init_func(self, img_value, scene_value): if self._init_func is None: img = T.matrix() init_state = self.proj_mlp.compute(img) self._init_func = theano.function([img], init_state) self._scene_shared.set_value(scene_value) return self._init_func(img_value) def step_func(self, state_value, w_value): if self._step_func is None: w = T.ivector() state = T.matrix() new_state, p = self.compute(state, w, self._scene_shared) self._step_func = theano.function([state, w], [new_state, T.log(p)]) return self._step_func(state_value, w_value) def save_to_dir(self, save_dir, idx): save_file = osp.join(save_dir, self.name+'.h5.'+str(idx)) for l in self.layers: l.save_weights(save_file) with h5py.File(save_file) as f: for k, v in self.config.items(): f.attrs[k] = v def load_weights(self, model_file): for l in self.layers: l.load_weights(model_file)
class Model(object): """ an re-implementation of google NIC system, used as the baseline in our paper """ def __init__(self, name='gnic', nimg=2048, nh=512, nw=512, nout=8843, model_file=None): self.name = name if model_file is not None: with h5py.File(model_file, 'r') as f: nimg = f.attrs['nimg'] nh = f.attrs['nh'] nw = f.attrs['nw'] nout = f.attrs['nout'] self.config = {'nimg': nimg, 'nh': nh, 'nw': nw, 'nout': nout} # word embedding layer self.embedding = Embedding(n_emb=nout, dim_emb=nw, name=self.name + '@embedding') # initialization mlp layer self.proj_mlp = MLP(layer_sizes=[nimg, 2 * nh], output_type='tanh', name=self.name + '@proj_mlp') # lstm self.lstm = BasicLSTM(dim_x=nw, dim_h=nh, name=self.name + '@lstm') # prediction mlp self.pred_mlp = MLP(layer_sizes=[nh + nw, nout], output_type='softmax', name=self.name + '@pred_mlp') # inputs cap = T.imatrix('cap') img = T.matrix('img') self.inputs = [cap, img] # go through sequence init_state = self.proj_mlp.compute(img) (state, self.p, loss), _ = theano.scan(fn=self.scan_func, sequences=[cap[0:-1, :], cap[1:, :]], outputs_info=[init_state, None, None]) # loss function loss = T.mean(loss) self.costs = [loss] # layers and parameters self.layers = [self.embedding, self.proj_mlp, self.lstm, self.pred_mlp] self.params = sum([l.params for l in self.layers], []) # load weights from file, if model_file is not None if model_file is not None: self.load_weights(model_file) # these functions are used in test stage self._init_func = None self._step_func = None def compute(self, state, w_idx): # word embedding word_vec = self.embedding.compute(w_idx) # split states c_tm1, h_tm1 = split_state(state, scheme=[(2, self.config['nh'])]) # lstm step c_t, h_t = self.lstm.compute(word_vec, c_tm1, h_tm1) # merge state new_state = T.concatenate([c_t, h_t], axis=-1) # add w_{t-1} as feature h_and_w = T.concatenate([h_t, word_vec], axis=-1) # predict probability p = self.pred_mlp.compute(h_and_w) return new_state, p def scan_func(self, w_tm1, w_t, state): # update state new_state, p = self.compute(state, w_tm1) # cross-entropy loss loss = T.nnet.categorical_crossentropy(p, w_t) return new_state, p, loss def init_func(self, img_value): if self._init_func is None: img = T.matrix() init_state = self.proj_mlp.compute(img) self._init_func = theano.function([img], init_state) return self._init_func(img_value) def step_func(self, state_value, w_value): if self._step_func is None: w = T.ivector() state = T.matrix() new_state, p = self.compute(state, w) self._step_func = theano.function([state, w], [new_state, T.log(p)]) return self._step_func(state_value, w_value) def save_to_dir(self, save_dir, idx): save_file = osp.join(save_dir, self.name + '.h5.' + str(idx)) for l in self.layers: l.save_weights(save_file) with h5py.File(save_file) as f: for k, v in self.config.items(): f.attrs[k] = v def load_weights(self, model_file): for l in self.layers: l.load_weights(model_file)
import numpy as np
from numpy.random import randn
from random import randint

from layers import Lstm, Softmax, Embedding

DELTA = 1e-5
THRESHOLD = 1e-2
EOS = 0
HIDDEN_SIZE = 10

input_layers = [
    Embedding(5, 10),
    Lstm(10, 10),
]
output_layers = [
    Embedding(5, 10),
    Lstm(10, 10, previous=input_layers[1]),
    Softmax(10, 4),
]

X = [randint(0, 4), randint(0, 4)]
Y = [randint(0, 3), randint(0, 3)]


def train():
    # reset state
    for layer in input_layers:
        layer.initSequence()
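# Hedged sketch of the finite-difference gradient check that the DELTA/THRESHOLD
# constants above suggest: perturb each weight entry by DELTA, compare the
# numerical gradient against the analytic one, and flag relative errors above
# THRESHOLD. The ``weight``/``analytic_grad``/``loss_fn`` arguments are
# placeholders for whatever the surrounding test file actually provides.
def check_gradient(weight, analytic_grad, loss_fn):
    ok = True
    it = np.nditer(weight, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        original = weight[idx]
        weight[idx] = original + DELTA
        loss_plus = loss_fn()
        weight[idx] = original - DELTA
        loss_minus = loss_fn()
        weight[idx] = original                      # restore the weight
        numeric = (loss_plus - loss_minus) / (2 * DELTA)
        denom = max(1e-8, abs(numeric) + abs(analytic_grad[idx]))
        if abs(numeric - analytic_grad[idx]) / denom > THRESHOLD:
            ok = False
        it.iternext()
    return ok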