def forward_prop():
    for l, l_info in enumerate(ConvModel.layer_info):
        func_name, func_args = l_info[0], l_info[1:]
        act_in = ConvModel.act_tensor[l]
        act_out, dz_out, maxpool_mask = None, None, None

        # perform layer-wise convolution operation
        if func_name == 'convolve3d':
            z_out_3d = Layers.convolve3d(act_in, ConvModel.w_tensor[l],
                                         ConvModel.b_tensor[l],
                                         func_args[0], func_args[1])
            dz_out = Layers.drelu(np.array(z_out_3d))
            act_out = Layers.relu(np.array(z_out_3d))

        # perform mlp layer-wise forward pass operation
        if func_name == 'fc_layer':
            act_in = np.ravel(act_in)
            z_out = np.dot(act_in, ConvModel.w_tensor[l]) + ConvModel.b_tensor[l]
            act_out = getattr(Layers, func_args[0])(np.array(z_out))
            dz_out = getattr(Layers, 'd' + func_args[0])(np.array(z_out))

        # perform layer-wise pooling operation
        if func_name == 'maxpool3d':
            pool_outputs = Layers.maxpool3d(input=act_in,
                                            filter_dim=func_args[0],
                                            stride=func_args[1])
            act_out = pool_outputs[0]
            maxpool_mask = pool_outputs[1]
            dz_out = Layers.drelu(np.array(act_out))

        ConvModel.act_tensor[l + 1] = np.array(act_out)
        ConvModel.dz_tensor[l] = np.array(dz_out)
        ConvModel.maxpool_masks[l] = np.array(maxpool_mask)
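# Illustration only: a hypothetical ConvModel.layer_info showing the entry format
# forward_prop() expects. The ordering of the numeric arguments is an assumption
# inferred from how func_args is unpacked above, not the original configuration:
# ('convolve3d', a, b) forwards a and b to Layers.convolve3d, ('maxpool3d',
# filter_dim, stride) feeds Layers.maxpool3d, and ('fc_layer', name) selects the
# activation by name (e.g. 'relu', 'softmax').
example_layer_info = [
    ('convolve3d', 1, 3),
    ('maxpool3d', 2, 2),
    ('fc_layer', 'relu'),
    ('fc_layer', 'softmax'),
]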
def ssd_multibox_layer(net, num_classes, sizes, ratios=[1],
                       normalization=-1, bn_normalization=False):
    if normalization > 0:
        net = Layers.l2_normalization(net, scaling=True)
    # Number of anchors
    num_anchors = len(sizes) + len(ratios)
    # Location prediction
    num_loc_pred = num_anchors * 4
    loc_pred = Layers.conv2d(net, net.get_shape()[-1], num_loc_pred, 3, 1,
                             'SAME', 'conv_loc', activation_fn=False)
    loc_pred = Layers.channel_to_last(loc_pred)
    loc_pred = tf.reshape(loc_pred,
                          tensor_shape(loc_pred, 4)[:-1] + [num_anchors, 4])
    # Class prediction
    pass
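# Hypothetical sketch (not in the original snippet) of the class-prediction
# branch that ssd_multibox_layer leaves as `pass`; in SSD it mirrors the location
# branch, predicting num_anchors * num_classes scores per cell. The helper below
# assumes the same Layers.conv2d / channel_to_last / tensor_shape utilities used
# for loc_pred, and its name is an assumption.
def _class_prediction_sketch(net, num_anchors, num_classes):
    num_cls_pred = num_anchors * num_classes
    cls_pred = Layers.conv2d(net, net.get_shape()[-1], num_cls_pred, 3, 1,
                             'SAME', 'conv_cls', activation_fn=False)
    cls_pred = Layers.channel_to_last(cls_pred)
    cls_pred = tf.reshape(cls_pred,
                          tensor_shape(cls_pred, 4)[:-1] + [num_anchors, num_classes])
    return cls_pred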
def __init__(self, d, lr, lambda_z_wu, read_attn, write_attn, do_classify,
             do_reconst):
    self.do_classify = do_classify

    """ flags for each regularizer """
    self.do_reconst = do_reconst
    self.read_attn = read_attn
    self.write_attn = write_attn

    """ dataset information """
    self.set_datainfo(d)

    """ external toolkits """
    self.ls = Layers()
    self.lf = LossFunctions(self.ls, self.d, self.encoder)
    self.ii = ImageInterface(_is_3d, self.read_attn, self.write_attn,
                             GLIMPSE_SIZE_READ, GLIMPSE_SIZE_WRITE, _h, _w, _c)

    # for reference from get_loss_kl_draw()
    self.T = T
    self.L = L
    self.Z_SIZES = Z_SIZES

    """ placeholders defined outside """
    self.lr = lr
    self.lambda_z_wu = lambda_z_wu

    """ sequence of canvases """
    self.cs = [0] * T

    """ initialization """
    self.init_lstms()
    self.init_time_zero()

    """ workaround for variable_scope(reuse=True) """
    self.DO_SHARE = None
def _build_model(self):
    w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
    b_initializer = tf.constant_initializer(0.1)

    with tf.variable_scope('rnn_block'):
        gru_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
            [Layers.dropout_wrapped_gru_cell(self._dropout_keep_prob,
                                             num_units=self.opt["rnn_node_num"],
                                             activation=tf.nn.relu)
             for _ in range(self.opt["rnn_layer_num"])])
        gru_cell_bw = tf.nn.rnn_cell.MultiRNNCell(
            [Layers.dropout_wrapped_gru_cell(self._dropout_keep_prob,
                                             num_units=self.opt["rnn_node_num"],
                                             activation=tf.nn.relu)
             for _ in range(self.opt["rnn_layer_num"])])
        _, h_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_cell_fw,
                                                     cell_bw=gru_cell_bw,
                                                     inputs=self.emb_sent,
                                                     dtype=tf.float32)
        rnn_output = tf.concat([h_state[0][-1], h_state[1][-1]], axis=1)

    with tf.variable_scope('fully_connected_block'):
        fully_connected_layer_1 = tf.layers.dense(
            rnn_output, self.opt["fully_connected_layer_1_node_num"],
            kernel_initializer=w_initializer, bias_initializer=b_initializer,
            activation=tf.nn.relu)
        fully_connected_layer_1 = tf.nn.dropout(fully_connected_layer_1,
                                                self._dropout_keep_prob)
        fully_connected_layer_2 = tf.layers.dense(
            fully_connected_layer_1, self.opt["fully_connected_layer_2_node_num"],
            kernel_initializer=w_initializer, bias_initializer=b_initializer,
            activation=tf.nn.relu)
        fully_connected_layer_2 = tf.nn.dropout(fully_connected_layer_2,
                                                self._dropout_keep_prob)
        fully_connected_layer_3 = tf.layers.dense(
            fully_connected_layer_2, self.opt["fully_connected_layer_3_node_num"],
            kernel_initializer=w_initializer, bias_initializer=b_initializer,
            activation=tf.nn.relu)
        fully_connected_layer_3 = tf.nn.dropout(fully_connected_layer_3,
                                                self._dropout_keep_prob)

    with tf.name_scope('output'):
        output_layer = tf.layers.dense(
            fully_connected_layer_3, self.opt["label_dim"],
            kernel_initializer=w_initializer, bias_initializer=b_initializer,
            activation=tf.nn.relu)
        tf.summary.histogram('logits', output_layer)
        self.output = tf.arg_max(output_layer, 1)

    with tf.name_scope('loss'):
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self._label,
                                                    logits=output_layer))
        tf.summary.scalar("training_loss", self.loss)

    with tf.name_scope('adam_optimizer'):
        self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.loss)
def __init__(self, d, lr, lambda_z_wu, do_classify, use_kl=True):
    """ model architecture """
    self.MLP_SIZES = [512, 256, 256, 128, 128]
    self.Z_SIZES = [64, 32, 32, 32, 32]
    self.L = L = len(self.MLP_SIZES)
    self.do_classify = do_classify

    """ flags for regularizers """
    self.use_kl = use_kl

    """ data and external toolkits """
    self.d = d  # dataset manager
    self.ls = Layers()
    self.lf = LossFunctions(self.ls, d, self.encoder)

    """ placeholders defined outside """
    self.lr = lr
    self.lambda_z_wu = lambda_z_wu

    """ cache for mu and sigma """
    self.e_mus, self.e_logsigmas = [0] * L, [0] * L  # q(z_i+1 | z_i), bottom-up inference as Eq.7-9
    self.p_mus, self.p_logsigmas = [0] * L, [0] * L  # p(z_i | z_i+1), top-down prior as Eq.1-3
    self.d_mus, self.d_logsigmas = [0] * L, [0] * L  # q(z_i | .), bidirectional inference as Eq.17-19
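# For reference: the precision-weighted combination behind Eq.17-19 (Ladder VAE,
# Sønderby et al. 2016), which d_mus / d_logsigmas cache. A minimal NumPy sketch,
# not the Layers.precision_weighted_sampler implementation itself.
def precision_weighted(mu_e, sigma_e, mu_p, sigma_p):
    prec_e, prec_p = 1.0 / sigma_e ** 2, 1.0 / sigma_p ** 2
    var_d = 1.0 / (prec_e + prec_p)                  # combined variance
    mu_d = var_d * (mu_e * prec_e + mu_p * prec_p)   # precision-weighted mean
    return mu_d, np.sqrt(var_d)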
def __init__(self, resource):
    """ data and external toolkits """
    self.d = resource.dh  # dataset manager
    self.ls = Layers()
    self.lf = LossFunctions(self.ls, self.d, self.encoder)

    """ placeholders defined outside """
    if c.DO_TRAIN:
        self.lr = resource.ph['lr']
def residual_net(inputs, scope, reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        res1 = Layers.residual(inputs, 128, 3, 1, 'SAME', 'res1')
        res2 = Layers.residual(res1, 128, 3, 1, 'SAME', 'res2')
        res3 = Layers.residual(res2, 128, 3, 1, 'SAME', 'res3')
        res4 = Layers.residual(res3, 128, 3, 1, 'SAME', 'res4')
        res5 = Layers.residual(res4, 128, 3, 1, 'SAME', 'res5')
        # res6 = Layers.residual(res5, 128, 3, 1, 'SAME', 'res6')
        return res5
def __init__(self, d, lr, lambda_pi_usl, use_pi):
    """ flags for each regularizer """
    self.use_pi = use_pi

    """ data and external toolkits """
    self.d = d  # dataset manager
    self.ls = Layers()
    self.lf = LossFunctions(self.ls, d, self.encoder)

    """ placeholders defined outside """
    self.lr = lr
    self.lambda_pi_usl = lambda_pi_usl
def __init__(self, num_inputs, num_hidden, num_outputs, learn_rate,
             h_w=None, h_b=None, o_w=None, o_b=None):
    self.num_inputs = num_inputs
    self.hidden_layer = Layers(num_hidden, h_b)
    self.output_layer = Layers(num_outputs, o_b)
    self.init_weights_i2h(h_w)
    self.init_weights_h2o(o_w)
    self.learn_rate = learn_rate
def subsampled(inputs, reuse=False):
    # Less border effect
    inputs = Layers.pad(inputs)
    with tf.variable_scope('subsampled', reuse=reuse):
        conv1 = Layers.conv2d(inputs, 3, 32, 9, 1, 'SAME', 'conv1')
        norm1 = Layers.instance_norm(conv1)
        relu1 = Layers.relu(norm1)
        conv2 = Layers.conv2d(relu1, 32, 64, 3, 2, 'SAME', 'conv2')
        norm2 = Layers.instance_norm(conv2)
        relu2 = Layers.relu(norm2)
        conv3 = Layers.conv2d(relu2, 64, 128, 3, 2, 'SAME', 'conv3')
        norm3 = Layers.instance_norm(conv3)
        relu3 = Layers.relu(norm3)
        return relu3
def __init__(self, is_3d, is_read_attention, is_write_attention, read_n,
             write_n, h, w, c):
    """ To manage the do_share flag inside the Layers object,
        ImageInterface has Layers as its own property. """
    self.do_share = False
    self.ls = Layers()
    self.is_3d = is_3d
    self.read_n = read_n
    self.write_n = write_n
    self.h = h
    self.w = w
    self.c = c

    if is_read_attention:
        self.read = self._read_attention
    else:
        self.read = self._read_no_attention

    if is_write_attention:
        self.write = self._write_attention
    else:
        self.write = self._write_no_attention
def __init__(self, variable_list, fully_index, sparse_index, sparse_num,
             input_shape):
    self._variable_list = copy.deepcopy(variable_list)
    self.layers = Layers(
        weight_init=tf.contrib.layers.xavier_initializer(),
        regularizer=tf.contrib.layers.l2_regularizer(FLAGS.regularizer_factor),
        bias_init=tf.constant_initializer(0.0))
    self.fully_index = fully_index
    self.sparse_index = sparse_index
    self.sparse_num = sparse_num
    self.index_fix = [0, self.fully_index, self.sparse_index]
    self._input_shape = input_shape
def upsampling(inputs, reuse=False):
    with tf.variable_scope('upsampling', reuse=reuse):
        deconv1 = Layers.resize_conv2d(inputs, 128, 64, 3, 2, 'SAME', 'deconv1')
        denorm1 = Layers.instance_norm(deconv1)
        derelu1 = Layers.relu(denorm1)
        deconv2 = Layers.resize_conv2d(derelu1, 64, 32, 3, 2, 'SAME', 'deconv2')
        denorm2 = Layers.instance_norm(deconv2)
        derelu2 = Layers.relu(denorm2)
        deconv3 = Layers.resize_conv2d(derelu2, 32, 3, 9, 1, 'SAME', 'deconv3')
        denorm3 = Layers.instance_norm(deconv3)
        detanh3 = tf.nn.tanh(denorm3)
        y = (detanh3 + 1) * 127.5
        # Remove the border effect
        y = Layers.remove_pad(y)
        return y
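# A minimal sketch (not part of the original code) of how the three pieces above
# are typically chained into one image-transform network for fast style transfer:
# downsample, run the residual blocks, then upsample back to image space. The
# function name and the 'residual' scope are assumptions.
def transform_net(images, reuse=False):
    with tf.variable_scope('transform_net', reuse=reuse):
        features = subsampled(images, reuse=reuse)
        features = residual_net(features, 'residual', reuse=reuse)
        return upsampling(features, reuse=reuse)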
def backprop():
    for l in range(len(ConvModel.layer_info) - 1, -1, -1):
        # retrieve dcdact for final layer
        if l == len(ConvModel.layer_info) - 1:
            dact_l = Layers.dact_fc_layer_outer(ConvModel.act_tensor[l + 1],
                                                ConvModel.target)
        # retrieve dcdact for layers that link to fc layers
        elif ConvModel.layer_info[l + 1][0] == 'fc_layer':
            dact_l = Layers.dact_fc_layer_inner(
                ConvModel.w_tensor[l + 1], ConvModel.dz_tensor[l + 1],
                ConvModel.dact_tensor[l + 1]).reshape(ConvModel.dz_tensor[l].shape)
        # retrieve dcdact for layers that link to convolutional layers
        elif ConvModel.layer_info[l + 1][0] == 'convolve3d':
            dact_l = Layers.dact_conv_l(ConvModel.dz_tensor[l],
                                        ConvModel.layer_info[l + 1][2],
                                        ConvModel.layer_info[l + 1][1],
                                        ConvModel.w_tensor[l + 1],
                                        ConvModel.dz_tensor[l + 1],
                                        ConvModel.dact_tensor[l + 1])
        # no backprop operations for layers that link to pooling layers
        else:
            continue

        # retrieve cost derivatives for fully connected layer
        if ConvModel.layer_info[l][0] == 'fc_layer':
            cost_diff = Layers.dcdw_fc(ConvModel.dz_tensor[l],
                                       ConvModel.act_tensor[l], dact_l)
        # retrieve cost derivatives for convolutional layer without maxpooling
        elif ConvModel.layer_info[l][0] == 'convolve3d':
            cost_diff = Layers.dcdw_conv_l(ConvModel.act_tensor[l],
                                           ConvModel.layer_info[l][2],
                                           ConvModel.layer_info[l][1],
                                           ConvModel.w_tensor[l].shape[3],
                                           ConvModel.dz_tensor[l], dact_l)
        # retrieve cost derivatives for convolutional layer with maxpooling
        else:
            dact_l = Layers.apply_maxpool_mask(dact_l,
                                               ConvModel.layer_info[l][1],
                                               ConvModel.layer_info[l][2],
                                               ConvModel.maxpool_masks[l])
            cost_diff = Layers.dcdw_conv_l(ConvModel.act_tensor[l - 1],
                                           ConvModel.layer_info[l - 1][2],
                                           ConvModel.layer_info[l - 1][1],
                                           ConvModel.w_tensor[l - 1].shape[3],
                                           ConvModel.dz_tensor[l - 1], dact_l)

        # store the cost derivatives
        if ConvModel.layer_info[l][0] == 'maxpool3d':
            ConvModel.dw_tensor[l - 1] = cost_diff[0]
            ConvModel.db_tensor[l - 1] = cost_diff[1]
            ConvModel.dact_tensor[l - 1] = cost_diff[2]
        else:
            ConvModel.dw_tensor[l] = cost_diff[0]
            ConvModel.db_tensor[l] = cost_diff[1]
            ConvModel.dact_tensor[l] = cost_diff[2]
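# A hypothetical parameter update that would follow backprop(): plain SGD over
# the cached gradients. The update rule, the helper name, and the assumption that
# layers without weights leave dw_tensor unset (None) are illustrations only and
# not part of the original ConvModel code.
def sgd_update(learning_rate):
    for l in range(len(ConvModel.layer_info)):
        dw, db = ConvModel.dw_tensor[l], ConvModel.db_tensor[l]
        if dw is None:
            continue  # e.g. pooling layers carry no trainable weights
        ConvModel.w_tensor[l] -= learning_rate * dw
        ConvModel.b_tensor[l] -= learning_rate * db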
class PI(object): def __init__(self, d, lr, lambda_pi_usl, use_pi): """ flags for each regularizor """ self.use_pi = use_pi """ data and external toolkits """ self.d = d # dataset manager self.ls = Layers() self.lf = LossFunctions(self.ls, d, self.encoder) """ placeholders defined outside""" self.lr = lr self.lambda_pi_usl = lambda_pi_usl def encoder(self, x, is_train=True, do_update_bn=True): """ https://arxiv.org/pdf/1610.02242.pdf """ if is_train: h = self.distort(x) h = self.ls.get_corrupted(x, 0.15) else: h = x scope = '1' h = self.ls.conv2d(scope + '_1', h, 128, activation=self.ls.lrelu) h = self.ls.conv2d(scope + '_2', h, 128, activation=self.ls.lrelu) h = self.ls.conv2d(scope + '_3', h, 128, activation=self.ls.lrelu) h = self.ls.max_pool(h) if is_train: h = tf.nn.dropout(h, 0.5) scope = '2' h = self.ls.conv2d(scope + '_1', h, 256, activation=self.ls.lrelu) h = self.ls.conv2d(scope + '_2', h, 256, activation=self.ls.lrelu) h = self.ls.conv2d(scope + '_3', h, 256, activation=self.ls.lrelu) h = self.ls.max_pool(h) if is_train: h = tf.nn.dropout(h, 0.5) scope = '3' h = self.ls.conv2d(scope + '_1', h, 512, activation=self.ls.lrelu) h = self.ls.conv2d(scope + '_2', h, 256, activation=self.ls.lrelu, filter_size=(1, 1)) h = self.ls.conv2d(scope + '_3', h, 128, activation=self.ls.lrelu, filter_size=(1, 1)) h = tf.reduce_mean(h, reduction_indices=[1, 2]) # Global average pooling h = self.ls.dense(scope, h, self.d.l) return h def build_graph_train(self, x_l, y_l, x, is_supervised=True): o = dict() # output loss = 0 logit = self.encoder(x) with tf.variable_scope(tf.get_variable_scope(), reuse=True): logit_l = self.encoder( x_l, is_train=True, do_update_bn=False) # for pyx and vat loss computation """ Classification Loss """ o['Ly'], o['accur'] = self.lf.get_loss_pyx(logit_l, y_l) loss += o['Ly'] """ PI Model Loss """ if self.use_pi: with tf.variable_scope(tf.get_variable_scope(), reuse=True): _, _, o['Lp'] = self.lf.get_loss_pi(x, logit, is_train=True) loss += self.lambda_pi_usl * o['Lp'] else: o['Lp'] = tf.constant(0) """ set losses """ o['loss'] = loss self.o_train = o """ set optimizer """ optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.5) #self.op = optimizer.minimize(loss) grads = optimizer.compute_gradients(loss) for i, (g, v) in enumerate(grads): if g is not None: #g = tf.Print(g, [g], "g %s = "%(v)) grads[i] = (tf.clip_by_norm(g, 5), v) # clip gradients else: print('g is None:', v) v = tf.Print(v, [v], "v = ", summarize=10000) self.op = optimizer.apply_gradients(grads) # return train_op def build_graph_test(self, x_l, y_l): o = dict() # output loss = 0 logit_l = self.encoder( x_l, is_train=False, do_update_bn=False) # for pyx and vat loss computation """ classification loss """ o['Ly'], o['accur'] = self.lf.get_loss_pyx(logit_l, y_l) loss += o['Ly'] """ set losses """ o['loss'] = loss self.o_test = o def distort(self, x): _d = self.d def _distort(a_image): """ bounding_boxes: A Tensor of type float32. 3-D with shape [batch, N, 4] describing the N bounding boxes associated with the image. 
Bounding boxes are supplied and returned as [y_min, x_min, y_max, x_max] """ # shape: [1, 1, 4] bounding_boxes = tf.constant([[[1 / 10, 1 / 10, 9 / 10, 9 / 10]]], dtype=tf.float32) begin, size, _ = tf.image.sample_distorted_bounding_box( (_d.h, _d.w, _d.c), bounding_boxes, min_object_covered=(8.5 / 10.0), aspect_ratio_range=[7.0 / 10.0, 10.0 / 7.0]) a_image = tf.slice(a_image, begin, size) """ for the purpose of distorting not use tf.image.resize_image_with_crop_or_pad under """ a_image = tf.image.resize_images(a_image, [_d.h, _d.w]) """ due to the size of channel returned from tf.image.resize_images is not being given, specify it manually. """ a_image = tf.reshape(a_image, [_d.h, _d.w, _d.c]) return a_image """ process batch times in parallel """ return tf.map_fn(_distort, x)
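# For context: the PI-model consistency term (Laine & Aila, 2017) referenced by
# get_loss_pi above is typically a mean squared error between two
# differently-perturbed forward passes over the same unlabeled batch. A minimal
# sketch, assuming softmax outputs are compared; not the LossFunctions
# implementation itself.
def pi_consistency_loss(logit_a, logit_b):
    return tf.reduce_mean(tf.squared_difference(tf.nn.softmax(logit_a),
                                                tf.nn.softmax(logit_b)))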
def __init__(self, channel=None):
    self.rng_numpy, self.rng_theano = get_two_rngs()
    self.layers = Layers()
    self.predict = Predict()
    self.channel = channel
class LVAE(object): def __init__(self, d, lr, lambda_z_wu, do_classify, use_kl=True): """ model architecture """ self.MLP_SIZES = [512, 256, 256, 128, 128] self.Z_SIZES = [64, 32, 32, 32, 32] self.L = L = len(self.MLP_SIZES) self.do_classify = do_classify """ flags for regularizers """ self.use_kl = use_kl """ data and external toolkits """ self.d = d # dataset manager self.ls = Layers() self.lf = LossFunctions(self.ls, d, self.encoder) """ placeholders defined outside""" self.lr = lr self.lambda_z_wu = lambda_z_wu """ cache for mu and sigma """ self.e_mus, self.e_logsigmas = [0] * L, [ 0 ] * L # q(z_i+1 | z_i), bottom-up inference as Eq.7-9 self.p_mus, self.p_logsigmas = [0] * L, [ 0 ] * L # p(z_i | z_i+1), top-down prior as Eq.1-3 self.d_mus, self.d_logsigmas = [0] * L, [ 0 ] * L # q(z_i | .), bidirectional inference as Eq.17-19 def encoder(self, x, is_train=True, do_update_bn=True): h = x for l in range(self.L): scope = 'Encode_L' + str(l) h = self.ls.dense(scope, h, self.MLP_SIZES[l]) h = self.ls.bn(scope, h, is_train, do_update_bn, name=scope) h = tf.nn.elu(h) """ prepare for bidirectional inference """ _, self.e_mus[l], self.e_logsigmas[l] = self.ls.vae_sampler( scope, h, self.Z_SIZES[l], tf.nn.softplus) # Eq.13-15 #return h return self.e_mus[-1] def decoder(self, is_train=True, do_update_bn=True): for l in range(self.L - 1, -1, -1): scope = 'Decoder_L' + str(l) if l == self.L - 1: """ At the highest latent layer, mu & sigma are identical to those outputed from encoer. And making actual z is not necessary for the highest layer.""" mu, logsigma = self.e_mus[l], self.e_logsigmas[l] self.d_mus[l], self.d_logsigmas[l] = mu, logsigma z = self.ls.sampler(self.d_mus[l], tf.exp(self.d_logsigmas[l])) """ prior of z_L is set as standard Gaussian, N(0,I). 
""" self.p_mus[l], self.p_logsigmas[l] = tf.zeros( (mu.get_shape())), tf.zeros((logsigma.get_shape())) else: """ prior is developed from z of the above layer """ _, self.p_mus[l], self.p_logsigmas[l] = self.ls.vae_sampler( scope, z, self.Z_SIZES[l], tf.nn.softplus) # Eq.13-15 z, self.d_mus[l], self.d_logsigmas[ l] = self.ls.precision_weighted_sampler( scope, (self.e_mus[l], tf.exp(self.e_logsigmas[l])), (self.p_mus[l], tf.exp( self.p_logsigmas[l]))) # Eq.17-19 """ go out to the input space """ _d = self.d x = self.ls.dense('bottom', z, _d.img_size, tf.nn.elu) # reconstructed input if _d.is_3d: x = tf.reshape(x, (-1, _d.h, _d.w, _d.c)) return x def build_graph_train(self, x_l, y_l, x): o = dict() # output loss = 0 logit = self.encoder(x) x_reconst = self.decoder() with tf.variable_scope(tf.get_variable_scope(), reuse=True): logit_l = self.encoder( x_l, is_train=True, do_update_bn=False) # for pyx and vat loss computation """ Classification Loss """ if self.do_classify: o['Ly'], o['accur'] = self.lf.get_loss_pyx(logit_l, y_l) loss += o['Ly'] """ for visualizationc """ o['z'], o['y'] = logit, y_l """ p(x|z) Reconstruction Loss """ o['Lr'] = self.lf.get_loss_pxz(x_reconst, x, 'DiscretizedLogistic') loss += o['Lr'] o['x'] = x o['cs'] = x_reconst """ VAE KL-Divergence Loss """ if self.use_kl: o['KL1'], o['KL2'], o['Lz'] = self.lf.get_loss_kl(self, _lambda=10.0) loss += self.lambda_z_wu * o['Lz'] else: o['KL1'], o['KL2'], o['Lz'] = tf.constant(0), tf.constant( 0), tf.constant(0) """ set losses """ o['loss'] = loss self.o_train = o """ set optimizer """ optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.5) #self.op = optimizer.minimize(loss) grads = optimizer.compute_gradients(loss) for i, (g, v) in enumerate(grads): if g is not None: #g = tf.Print(g, [g], "g %s = "%(v)) grads[i] = (tf.clip_by_norm(g, 5), v) # clip gradients else: print('g is None:', v) v = tf.Print(v, [v], "v = ", summarize=10000) #for v in tf.all_variables(): print("%s : %s" % (v.name,v.get_shape())) self.op = optimizer.apply_gradients(grads) # return train_op def build_graph_test(self, x_l, y_l): o = dict() # output loss = 0 logit_l = self.encoder( x_l, is_train=False, do_update_bn=False) # for pyx and vat loss computation """ classification loss """ if self.do_classify: o['Ly'], o['accur'] = self.lf.get_loss_pyx(logit_l, y_l) loss += o['Ly'] """ for visualizationc """ o['z'], o['y'] = logit_l, y_l """ set losses """ o['loss'] = loss self.o_test = o
MAXLEN = 10
HIDDEN_SIZE = 100
EPOCHS = 500
MIN_COUNT = 1
lr = 0.5
pretrained = False
activation = "linear"

helper = DataHelper()
emb_matrix, inputs, targets = helper.trainset_preparation(
    path_data, EMB_DIM, MAXLEN, BATCH_SIZE, MIN_COUNT, pretrained)
VOCAB_SIZE = len(helper.stoi)
helper.save("./binaries/vocab.pkl")
itos = {i: t for t, i in helper.stoi.items()}
lm = Layers(VOCAB_SIZE, EMB_DIM, HIDDEN_SIZE, emb_matrix, activation)


def generate(sent_input, len_sent):
    sent_input = helper.preprocessing(sent_input)
    token = sent_input.split()
    sequence = helper.transform(token)
    for i in range(len_sent):
        inputs = np.array([sequence])
        prob = lm.forward(inputs)[-1][-1]
        index = np.argmax(prob)
        sequence.append(index)
    print(" ".join([itos[idx] for idx in sequence]))


for e in range(EPOCHS):
class Attention(object): def __init__(self, channel=None): self.rng_numpy, self.rng_theano = get_two_rngs() self.layers = Layers() self.predict = Predict() self.channel = channel def load_params(self, path, params): # load params from disk pp = np.load(path) for kk, vv in params.iteritems(): if kk not in pp: raise Warning('%s is not in the archive'%kk) params[kk] = pp[kk] return params def init_params(self, options): # all parameters params = OrderedDict() # embedding params['Wemb'] = norm_weight(options['n_words'], options['dim_word']) ctx_dim = options['ctx_dim'] # init_state, init_cell params = self.layers.get_layer('ff')[0](params, nin=ctx_dim, nout=options['mu_dim'], prefix='ff_state') params = self.layers.get_layer('ff')[0](params, nin=ctx_dim, nout=options['mu_dim'], prefix='ff_memory') # decoder: LSTM params = self.layers.get_layer('lstm')[0](params, nin=options['dim_word'], dim=options['tu_dim'], prefix='tu_lstm') params = self.layers.get_layer('attend')[0](params, nin=options['tu_dim'], dimctx=ctx_dim, prefix='attend') params = self.layers.get_layer('lstm_concat')[0](options, params, nin=options['tu_dim'], dim=options['mu_dim'], dimctx=ctx_dim, prefix='mu_lstm') # readout params = self.layers.get_layer('ff')[0](params, nin=options['mu_dim'], nout=options['dim_word'], prefix='ff_logit_lstm') if options['ctx2out']: params = self.layers.get_layer('ff')[0](params, nin=ctx_dim, nout=options['dim_word'], prefix='ff_logit_ctx') params = self.layers.get_layer('ff')[0](params, nin=options['dim_word'], nout=options['n_words'], prefix='ff_logit') return params def build_model(self, tparams, options): trng = RandomStreams(1234) use_noise = theano.shared(np.float32(0.)) # description string: #words x #samples x = tensor.matrix('x', dtype='int64') mask = tensor.matrix('mask', dtype='float32') # context: #samples x #annotations x dim ctx = tensor.tensor3('ctx', dtype='float32') mask_ctx = tensor.matrix('mask_ctx', dtype='float32') n_timesteps = x.shape[0] n_samples = x.shape[1] # index into the word embedding matrix, shift it forward in time emb = tparams['Wemb'][x.flatten()].reshape( [n_timesteps, n_samples, options['dim_word']]) emb_shifted = tensor.zeros_like(emb) emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) emb = emb_shifted ctx_ = ctx counts = mask_ctx.sum(-1).dimshuffle(0,'x') ctx_mean = ctx_.sum(1)/counts # initial state/cell init_state = self.layers.get_layer('ff')[1](tparams, ctx_mean, activ='tanh', prefix='ff_state') init_memory = self.layers.get_layer('ff')[1](tparams, ctx_mean, activ='tanh', prefix='ff_memory') # decoder tu_lstm = self.layers.get_layer('lstm')[1](tparams, emb, mask=mask, prefix='tu_lstm') attend = self.layers.get_layer('attend')[1](tparams, tu_lstm[0], ctx_) mu_lstm = self.layers.get_layer('lstm_concat')[1](options, tparams, tu_lstm[0], mask=mask, ctxs=attend[1], one_step=False, init_state=init_state, init_memory=init_memory, trng=trng, use_noise=use_noise, prefix='mu_lstm') proj_h = mu_lstm[0] betas = mu_lstm[2] ctxs = mu_lstm[3] alphas = attend[0] if options['use_dropout']: proj_h = self.layers.dropout_layer(proj_h, use_noise, trng) # compute word probabilities logit = self.layers.get_layer('ff')[1](tparams, proj_h, activ='linear', prefix='ff_logit_lstm') if options['prev2out']: logit += emb if options['ctx2out']: logit += self.layers.get_layer('ff')[1](tparams, ctxs, activ='linear', prefix='ff_logit_ctx') logit = tanh(logit) if options['use_dropout']: logit = self.layers.dropout_layer(logit, use_noise, trng) # (t,m,n_words) logit = 
self.layers.get_layer('ff')[1](tparams, logit, activ='linear', prefix='ff_logit') logit_shp = logit.shape # (t*m, n_words) probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]])) # cost x_flat = x.flatten() # (t*m,) cost = -tensor.log(probs[T.arange(x_flat.shape[0]), x_flat] + 1e-8) cost = cost.reshape([x.shape[0], x.shape[1]]) cost = (cost * mask).sum(0) extra = [probs, alphas, betas] test = [attend[1]] return trng, use_noise, x, mask, ctx, mask_ctx, alphas, cost, extra, test def pred_probs(self, whichset, f_log_probs, verbose=True): probs = [] n_done = 0 NLL = [] L = [] if whichset == 'train': tags = self.engine.train iterator = self.engine.kf_train elif whichset == 'valid': tags = self.engine.valid iterator = self.engine.kf_valid elif whichset == 'test': tags = self.engine.test iterator = self.engine.kf_test else: raise NotImplementedError() n_samples = np.sum([len(index) for index in iterator]) for index in iterator: tag = [tags[i] for i in index] x, mask, ctx, ctx_mask,vid_names = data_engine.prepare_data( self.engine, tag) pred_probs = f_log_probs(x, mask, ctx, ctx_mask) L.append(mask.sum(0).tolist()) NLL.append((-1 * pred_probs).tolist()) probs.append(pred_probs.tolist()) n_done += len(tag) if verbose: sys.stdout.write('\rComputing LL on %d/%d examples'%( n_done, n_samples)) sys.stdout.flush() print probs = flatten_list_of_list(probs) NLL = flatten_list_of_list(NLL) L = flatten_list_of_list(L) perp = 2**(np.sum(NLL) / np.sum(L) / np.log(2)) return -1 * np.mean(probs), perp def train(self, random_seed=1234, reload_=False, verbose=True, debug=True, save_model_dir='', from_dir=None, # dataset dataset='youtube2text', video_feature='googlenet', K=10, OutOf=240, # network dim_word=256, # word vector dimensionality ctx_dim=-1, # context vector dimensionality, auto set tu_dim=512, mu_dim=1024, vu_dim=1024, n_layers_out=1, n_layers_init=1, prev2out=False, ctx2out=False, selector=False, n_words=100000, maxlen=100, # maximum length of the description use_dropout=False, isGlobal=False, # training patience=10, max_epochs=5000, decay_c=0., alpha_c=0., alpha_entropy_r=0., lrate=0.01, optimizer='adadelta', clip_c=2., # minibatch batch_size = 64, valid_batch_size = 64, dispFreq=100, validFreq=10, saveFreq=10, # save the parameters after every saveFreq updates sampleFreq=10, # generate some samples after every sampleFreq updates # metric metric='blue' ): self.rng_numpy, self.rng_theano = get_two_rngs() model_options = locals().copy() if 'self' in model_options: del model_options['self'] model_options = validate_options(model_options) with open('%smodel_options.pkl'%save_model_dir, 'wb') as f: pkl.dump(model_options, f) print 'Loading data' self.engine = data_engine.Movie2Caption('attention', dataset, video_feature, batch_size, valid_batch_size, maxlen, n_words, K, OutOf) model_options['ctx_dim'] = self.engine.ctx_dim print 'init params' t0 = time.time() params = self.init_params(model_options) # reloading if reload_: model_saved = from_dir+'/model_best_so_far.npz' assert os.path.isfile(model_saved) print "Reloading model params..." 
params = load_params(model_saved, params) tparams = init_tparams(params) if verbose: print tparams.keys trng, use_noise, x, mask, ctx, mask_ctx, alphas, cost, extra, test = \ self.build_model(tparams, model_options) if debug: print 'buliding test' test_fun = theano.function([x, mask, ctx, mask_ctx], test, name='f_test', on_unused_input='ignore') print 'buliding sampler' f_init, f_next = self.predict.build_sampler(self.layers, tparams, model_options, use_noise, trng) # before any regularizer print 'building f_log_probs' f_log_probs = theano.function([x, mask, ctx, mask_ctx], -cost, profile=False, on_unused_input='ignore') cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(np.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0.: alpha_c = theano.shared(np.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() cost += alpha_reg if alpha_entropy_r > 0: alpha_entropy_r = theano.shared(np.float32(alpha_entropy_r), name='alpha_entropy_r') alpha_reg_2 = alpha_entropy_r * (-tensor.sum(alphas * tensor.log(alphas+1e-8),axis=-1)).sum(0).mean() cost += alpha_reg_2 else: alpha_reg_2 = tensor.zeros_like(cost) print 'building f_alpha' f_alpha = theano.function([x, mask, ctx, mask_ctx], [alphas, alpha_reg_2], name='f_alpha', on_unused_input='ignore') print 'compute grad' grads = tensor.grad(cost, wrt=itemlist(tparams)) if clip_c > 0.: g2 = 0. for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'build train fns' f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, [x, mask, ctx, mask_ctx], cost, extra + grads) print 'compilation took %.4f sec'%(time.time()-t0) print 'Optimization' history_errs = [] # reload history if reload_: print 'loading history error...' history_errs = np.load( from_dir+'model_best_so_far.npz')['history_errs'].tolist() bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False best_p = unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] print 'Epoch ', eidx for idx in self.engine.kf_train: tags = [self.engine.train[index] for index in idx] n_samples += len(tags) uidx += 1 use_noise.set_value(1.) 
pd_start = time.time() x, mask, ctx, ctx_mask,vid_names = data_engine.prepare_data( self.engine, tags) if debug: datas = test_fun(x, mask, ctx, ctx_mask) for item in datas: print item[0].shape pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() rvals = f_grad_shared(x, mask, ctx, ctx_mask) cost = rvals[0] probs = rvals[1] alphas = rvals[2] betas = rvals[3] grads = rvals[4:] grads, NaN_keys = grad_nan_report(grads, tparams) if len(grads_record) >= 5: del grads_record[0] grads_record.append(grads) if NaN_keys != []: print 'grads contain NaN' import pdb; pdb.set_trace() if np.isnan(cost) or np.isinf(cost): print 'NaN detected in cost' import pdb; pdb.set_trace() # update params f_update(lrate) ud_duration = time.time() - ud_start if eidx == 0: train_error = cost else: train_error = train_error * 0.95 + cost * 0.05 train_costs.append(cost) if np.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, ', Update ', uidx, \ ', Train cost mean so far', train_error, \ ', betas mean', np.round(betas.mean(), 3), \ ', fetching data time spent (sec)', np.round(pd_duration, 3), \ ', update time spent (sec)', np.round(ud_duration, 3) alphas,reg = f_alpha(x,mask,ctx,ctx_mask) print 'alpha ratio %.3f, reg %.3f' % ( alphas.min(-1).mean() / (alphas.max(-1)).mean(), reg) if np.mod(uidx, saveFreq) == 0: pass if np.mod(uidx, sampleFreq) == 0: use_noise.set_value(0.) print '------------- sampling from train ----------' self.predict.sample_execute(self.engine, model_options, tparams, f_init, f_next, x, ctx, ctx_mask, trng,vid_names) print '------------- sampling from valid ----------' idx = self.engine.kf_valid[np.random.randint(1, len(self.engine.kf_valid) - 1)] tags = [self.engine.valid[index] for index in idx] x_s, mask_s, ctx_s, mask_ctx_s,vid_names = data_engine.prepare_data(self.engine, tags) self.predict.sample_execute(self.engine, model_options, tparams, f_init, f_next, x_s, ctx_s, mask_ctx_s, trng, vid_names) # end of sample if validFreq != -1 and np.mod(uidx, validFreq) == 0: t0_valid = time.time() alphas,_ = f_alpha(x, mask, ctx, ctx_mask) ratio = alphas.min(-1).mean()/(alphas.max(-1)).mean() alphas_ratio.append(ratio) np.savetxt(save_model_dir+'alpha_ratio.txt',alphas_ratio) current_params = unzip(tparams) np.savez(save_model_dir+'model_current.npz', history_errs=history_errs, **current_params) use_noise.set_value(0.) train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 if not debug: # first compute train cost if 0: print 'computing cost on trainset' train_err, train_perp = self.pred_probs( 'train', f_log_probs, verbose=model_options['verbose']) else: train_err = 0. train_perp = 0. if 1: print 'validating...' valid_err, valid_perp = self.pred_probs( 'valid', f_log_probs, verbose=model_options['verbose'], ) else: valid_err = 0. valid_perp = 0. if 0: print 'testing...' test_err, test_perp = self.pred_probs( 'test', f_log_probs, verbose=model_options['verbose'] ) else: test_err = 0. test_perp = 0. 
mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score(model_type='attention', model_archive=current_params, options=model_options, engine=self.engine, save_dir=save_model_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_next=f_next, model=self.predict ) valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) history_errs.append([eidx, uidx, train_perp, train_err, valid_perp, valid_err, test_perp, test_err, valid_B1, valid_B2, valid_B3, valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1, test_B2, test_B3, test_B4, test_meteor, test_Rouge, test_Cider]) np.savetxt(save_model_dir+'train_valid_test.txt', history_errs, fmt='%.3f') print 'save validation results to %s'%save_model_dir # save best model according to the best blue or meteor if len(history_errs) > 1 and valid_B4 > np.array(history_errs)[:-1, 11].max(): print 'Saving to %s...'%save_model_dir, np.savez( save_model_dir+'model_best_blue_or_meteor.npz', history_errs=history_errs, **best_p) if len(history_errs) > 1 and valid_err < np.array(history_errs)[:-1, 5].min(): best_p = unzip(tparams) bad_counter = 0 best_valid_err = valid_err uidx_best_valid_err = uidx print 'Saving to %s...'%save_model_dir, np.savez( save_model_dir+'model_best_so_far.npz', history_errs=history_errs, **best_p) with open('%smodel_options.pkl'%save_model_dir, 'wb') as f: pkl.dump(model_options, f) print 'Done' elif len(history_errs) > 1 and valid_err >= np.array(history_errs)[:-1, 5].min(): bad_counter += 1 print 'history best ', np.array(history_errs)[:,6].min() print 'bad_counter ', bad_counter print 'patience ', patience if bad_counter > patience: print 'Early Stop!' estop = True break if test_B4 > 0.48 and test_meteor > 0.32: print 'Saving to %s...' % save_model_dir, numpy.savez( save_model_dir + 'model_' + str(uidx) + '.npz', history_errs=history_errs, **current_params) if self.channel: self.channel.save() print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err print 'valid took %.2f sec'%(time.time() - t0_valid) # end of validatioin if debug: break if estop: break if debug: break # end for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f'%( n_samples, np.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' 
if best_p is not None: zipp(best_p, tparams) print 'stopped at epoch %d, minibatch %d, '\ 'curent Train %.2f, current Valid %.2f, current Test %.2f '%( eidx, uidx, np.mean(train_err), np.mean(valid_err), np.mean(test_err)) params = copy.copy(best_p) np.savez(save_model_dir+'model_best.npz', train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) if history_errs != []: history = np.asarray(history_errs) best_valid_idx = history[:,6].argmin() np.savetxt(save_model_dir+'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] return train_err, valid_err, test_err
def __init__(self, batch: int = None, hours: float = None, width: int = None, height: int = None, level: Levels = None, reset_chance: float = None, failed_actions_chance: float = None, **kwargs) -> None: super().__init__() self.batch: int = batch self.hours: float = hours self.level: Levels = level self.uses = { Levels.Causal1: { LayerType.Blocks, LayerType.Goal, LayerType.Gold, LayerType.Keys, LayerType.Door }, Levels.Causal2: { LayerType.Blocks, LayerType.Goal, LayerType.Diamond1, LayerType.Diamond2, LayerType.Diamond3, LayerType.Diamond4 }, Levels.Causal3: { LayerType.Blocks, LayerType.Goal, LayerType.Gold, LayerType.Bluedoor, LayerType.Bluekeys, LayerType.Reddoor, LayerType.Redkeys }, Levels.Causal4: { LayerType.Blocks, LayerType.Goal, LayerType.Gold, LayerType.Bluedoor, LayerType.Bluekeys, LayerType.Reddoor, LayerType.Redkeys, LayerType.Rock, LayerType.Dirt }, Levels.Rocks: {LayerType.Blocks, LayerType.Goal, LayerType.Rock, LayerType.Dirt}, Levels.Maze: { LayerType.Blocks, LayerType.Goal, LayerType.Gold, LayerType.Door, LayerType.Keys, LayerType.Holder, LayerType.Putter }, Levels.Causal5: { LayerType.Blocks, LayerType.Goal, LayerType.Brown1, LayerType.Brown2, LayerType.Brown3, LayerType.Pink1, LayerType.Pink2, LayerType.Pink3 }, Levels.Coconuts: { LayerType.Blocks, LayerType.Goal, LayerType.Rock, LayerType.Dirt, LayerType.Gold, LayerType.Coconut }, Levels.Causal6: { LayerType.Blocks, LayerType.Goal, LayerType.Greendown, LayerType.Greenup, LayerType.Greenstar, LayerType.Yellowstar, LayerType.Bluestar }, Levels.SuperLevel: { LayerType.Blocks, LayerType.Goal, LayerType.Gold, LayerType.Bluedoor, LayerType.Bluekeys, LayerType.Reddoor, LayerType.Redkeys, LayerType.Rock, LayerType.Dirt, LayerType.Coconut, LayerType.Door, LayerType.Keys }, Levels.SuperLevel2: { LayerType.Blocks, LayerType.Goal, LayerType.Gold, LayerType.Bluedoor, LayerType.Bluekeys, LayerType.Reddoor, LayerType.Redkeys, LayerType.Rock, LayerType.Dirt, LayerType.Coconut }, Levels.MonsterLevel: { LayerType.Blocks, LayerType.Goal, LayerType.Gold, LayerType.Monster, LayerType.Rock }, Levels.Causal7: { LayerType.Blocks, LayerType.Goal, LayerType.Greencross, LayerType.Bluecross, LayerType.Redcross, LayerType.Purplecross }, Levels.CausalSuper: { LayerType.Blocks, LayerType.Goal, LayerType.Super1, LayerType.Super2, LayerType.Super3, LayerType.Super4, LayerType.Super5, LayerType.Super6, LayerType.Super7 } } convert = {(use, [ layer for layer in LayerType if layer.name == name.split('_')[1] ][0]) for name, use in kwargs.items() if name.split('_')[0] == "layer"} self.layers: Layers = Layers( batch, width, height, level, reset_chance, failed_actions_chance, *[ layer for use, layer in convert if use and (layer in self.uses[level]) ]) for i in range(width): for j in range(height): for k in range(batch): self.layers.all_items[k][(i, j)] = 0 self.layers.update(isFirstTime=True)
def train( random_seed=1234, dim_word=256, # word vector dimensionality ctx_dim=-1, # context vector dimensionality, auto set dim=1000, # the number of LSTM units n_layers_out=1, n_layers_init=1, encoder='none', encoder_dim=100, prev2out=False, ctx2out=False, patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., alpha_entropy_r=0., lrate=0.01, selector=False, n_words=100000, maxlen=100, # maximum length of the description optimizer='adadelta', clip_c=2., batch_size=64, valid_batch_size=64, save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/', validFreq=10, saveFreq=10, # save the parameters after every saveFreq updates sampleFreq=10, # generate some samples after every sampleFreq updates metric='blue', dataset='youtube2text', video_feature='googlenet', use_dropout=False, reload_=False, from_dir=None, K=10, OutOf=240, verbose=True, debug=True): rng_numpy, rng_theano = utils.get_two_rngs() model_options = locals().copy() if 'self' in model_options: del model_options['self'] with open('%smodel_options.pkl' % save_model_dir, 'wb') as f: pkl.dump(model_options, f) # instance model layers = Layers() model = Model() print 'Loading data' engine = data_engine.Movie2Caption('attention', dataset, video_feature, batch_size, valid_batch_size, maxlen, n_words, K, OutOf) model_options['ctx_dim'] = engine.ctx_dim model_options['n_words'] = engine.n_words print 'n_words:', model_options['n_words'] # set test values, for debugging idx = engine.kf_train[0] [x_tv, mask_tv, ctx_tv, ctx_mask_tv ] = data_engine.prepare_data(engine, [engine.train[index] for index in idx]) print 'init params' t0 = time.time() params = model.init_params(model_options) # reloading if reload_: model_saved = from_dir + '/model_best_so_far.npz' assert os.path.isfile(model_saved) print "Reloading model params..." params = utils.load_params(model_saved, params) tparams = utils.init_tparams(params) trng, use_noise, \ x, mask, ctx, mask_ctx, \ cost, extra = \ model.build_model(tparams, model_options) alphas = extra[1] betas = extra[2] print 'buliding sampler' f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng) # before any regularizer print 'building f_log_probs' f_log_probs = theano.function([x, mask, ctx, mask_ctx], -cost, profile=False, on_unused_input='ignore') cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1. - alphas.sum(0))**2).sum(-1).mean() cost += alpha_reg if alpha_entropy_r > 0: alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r), name='alpha_entropy_r') alpha_reg_2 = alpha_entropy_r * (-tensor.sum( alphas * tensor.log(alphas + 1e-8), axis=-1)).sum(-1).mean() cost += alpha_reg_2 else: alpha_reg_2 = tensor.zeros_like(cost) print 'building f_alpha' f_alpha = theano.function([x, mask, ctx, mask_ctx], [alphas, betas], name='f_alpha', on_unused_input='ignore') print 'compute grad' grads = tensor.grad(cost, wrt=utils.itemlist(tparams)) if clip_c > 0.: g2 = 0. 
for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads lr = tensor.scalar(name='lr') print 'build train fns' f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, [x, mask, ctx, mask_ctx], cost, extra + grads) print 'compilation took %.4f sec' % (time.time() - t0) print 'Optimization' history_errs = [] # reload history if reload_: print 'loading history error...' history_errs = numpy.load( from_dir + 'model_best_so_far.npz')['history_errs'].tolist() bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False best_p = utils.unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] print 'Epoch ', eidx for idx in engine.kf_train: tags = [engine.train[index] for index in idx] n_samples += len(tags) uidx += 1 use_noise.set_value(1.) pd_start = time.time() x, mask, ctx, ctx_mask = data_engine.prepare_data(engine, tags) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue ud_start = time.time() rvals = f_grad_shared(x, mask, ctx, ctx_mask) cost = rvals[0] probs = rvals[1] alphas = rvals[2] betas = rvals[3] grads = rvals[4:] grads, NaN_keys = utils.grad_nan_report(grads, tparams) if len(grads_record) >= 5: del grads_record[0] grads_record.append(grads) if NaN_keys != []: print 'grads contain NaN' import pdb pdb.set_trace() if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected in cost' import pdb pdb.set_trace() # update params f_update(lrate) ud_duration = time.time() - ud_start if eidx == 0: train_error = cost else: train_error = train_error * 0.95 + cost * 0.05 train_costs.append(cost) if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Train cost mean so far', \ train_error, 'fetching data time spent (sec)', pd_duration, \ 'update time spent (sec)', ud_duration, 'save_dir', save_model_dir alphas, betas = f_alpha(x, mask, ctx, ctx_mask) counts = mask.sum(0) betas_mean = (betas * mask).sum(0) / counts betas_mean = betas_mean.mean() print 'alpha ratio %.3f, betas mean %.3f' % ( alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean) l = 0 for vv in x[:, 0]: if vv == 0: break if vv in engine.word_idict: print '(', numpy.round(betas[l, 0], 3), ')', engine.word_idict[vv], else: print '(', numpy.round(betas[l, 0], 3), ')', 'UNK', l += 1 print '(', numpy.round(betas[l, 0], 3), ')' if numpy.mod(uidx, saveFreq) == 0: pass if numpy.mod(uidx, sampleFreq) == 0: use_noise.set_value(0.) 
print '------------- sampling from train ----------' x_s = x mask_s = mask ctx_s = ctx ctx_mask_s = ctx_mask model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, trng) print '------------- sampling from valid ----------' idx = engine.kf_valid[numpy.random.randint( 1, len(engine.kf_valid) - 1)] tags = [engine.valid[index] for index in idx] x_s, mask_s, ctx_s, mask_ctx_s = data_engine.prepare_data( engine, tags) model.sample_execute(engine, model_options, tparams, f_init, f_next, x_s, ctx_s, mask_ctx_s, trng) if validFreq != -1 and numpy.mod(uidx, validFreq) == 0: t0_valid = time.time() alphas, _ = f_alpha(x, mask, ctx, ctx_mask) ratio = alphas.min(-1).mean() / (alphas.max(-1)).mean() alphas_ratio.append(ratio) numpy.savetxt(save_model_dir + 'alpha_ratio.txt', alphas_ratio) current_params = utils.unzip(tparams) numpy.savez(save_model_dir + 'model_current.npz', history_errs=history_errs, **current_params) use_noise.set_value(0.) train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 if not debug: # first compute train cost if 0: print 'computing cost on trainset' train_err, train_perp = model.pred_probs( engine, 'train', f_log_probs, verbose=model_options['verbose']) else: train_err = 0. train_perp = 0. if 1: print 'validating...' valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose'], ) else: valid_err = 0. valid_perp = 0. if 1: print 'testing...' test_err, test_perp = model.pred_probs( engine, 'test', f_log_probs, verbose=model_options['verbose']) else: test_err = 0. test_perp = 0. mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score( model_type='attention', model_archive=current_params, options=model_options, engine=engine, save_dir=save_model_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_next=f_next, model=model ) ''' {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]}, 'alternative_valid': {'Bleu_3': 0.40702270203174923, 'Bleu_4': 0.29276570520368456, 'CIDEr': 0.25247168210607884, 'Bleu_2': 0.529069629270047, 'Bleu_1': 0.6804308797115253, 'ROUGE_L': 0.51083584331688392}, 'meteor': {'test': [-1], 'valid': [0.282787550236724]}} ''' valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) history_errs.append([ eidx, uidx, train_err, train_perp, valid_perp, test_perp, valid_err, test_err, valid_B1, valid_B2, valid_B3, valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1, test_B2, test_B3, test_B4, test_meteor, test_Rouge, test_Cider ]) numpy.savetxt(save_model_dir + 'train_valid_test.txt', history_errs, fmt='%.3f') print 'save validation results to %s' % save_model_dir # save best model according to the best blue or meteor if len(history_errs) > 1 and \ valid_B4 
> numpy.array(history_errs)[:-1,11].max(): print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_best_blue_or_meteor.npz', history_errs=history_errs, **best_p) if len(history_errs) > 1 and \ valid_err < numpy.array(history_errs)[:-1,6].min(): best_p = utils.unzip(tparams) bad_counter = 0 best_valid_err = valid_err uidx_best_valid_err = uidx print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_best_so_far.npz', history_errs=history_errs, **best_p) with open('%smodel_options.pkl' % save_model_dir, 'wb') as f: pkl.dump(model_options, f) print 'Done' elif len(history_errs) > 1 and \ valid_err >= numpy.array(history_errs)[:-1,6].min(): bad_counter += 1 print 'history best ', numpy.array(history_errs)[:, 6].min() print 'bad_counter ', bad_counter print 'patience ', patience if bad_counter > patience: print 'Early Stop!' estop = True break if test_B4 > 0.52 and test_meteor > 0.32: print 'Saving to %s...' % save_model_dir, numpy.savez(save_model_dir + 'model_' + str(uidx) + '.npz', history_errs=history_errs, **current_params) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err print 'valid took %.2f sec' % (time.time() - t0_valid) # end of validatioin if debug: break if estop: break if debug: break # end for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f' % ( n_samples, numpy.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' if best_p is not None: utils.zipp(best_p, tparams) use_noise.set_value(0.) valid_err = 0 test_err = 0 if not debug: #if valid: valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose']) #if test: #test_err, test_perp = self.pred_probs( # 'test', f_log_probs, # verbose=model_options['verbose']) print 'stopped at epoch %d, minibatch %d, '\ 'curent Train %.2f, current Valid %.2f, current Test %.2f '%( eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err)) params = copy.copy(best_p) numpy.savez(save_model_dir + 'model_best.npz', train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) if history_errs != []: history = numpy.asarray(history_errs) best_valid_idx = history[:, 6].argmin() numpy.savetxt(save_model_dir + 'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] return train_err, valid_err, test_err
def train(random_seed=1234, dim_word=256, # word vector dimensionality ctx_dim=-1, # context vector dimensionality, auto set dim=1000, # the number of LSTM units n_layers_out=1, n_layers_init=1, encoder='none', encoder_dim=100, prev2out=False, ctx2out=False, patience=10, max_epochs=5000, dispFreq=100, decay_c=0., alpha_c=0., alpha_entropy_r=0., lrate=0.01, selector=False, n_words=100000, maxlen=100, # maximum length of the description optimizer='adadelta', clip_c=2., batch_size = 64, valid_batch_size = 64, save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/', validFreq=10, saveFreq=10, # save the parameters after every saveFreq updates sampleFreq=10, # generate some samples after every sampleFreq updates metric='blue', dataset='youtube2text', video_feature='googlenet', use_dropout=False, reload_=False, from_dir=None, K1=10, K2=10, OutOf=240, verbose=True, debug=True ): rng_numpy, rng_theano = utils.get_two_rngs() model_options = locals().copy() model_options_c = locals().copy() if 'self' in model_options: del model_options['self'] with open('model_files/model_options.pkl', 'wb') as f: pkl.dump(model_options, f) with open('model_files/model_options_c3d.pkl', 'wb') as f: pkl.dump(model_options_c, f) # instance model layers = Layers() model = Model() model_c = Model() print 'Loading data' engine = data_engine.Movie2Caption('attention', dataset, video_feature, batch_size, valid_batch_size, maxlen, n_words, K1, K2, OutOf) model_options['ctx_dim'] = engine.ctx_dim model_options_c['ctx_dim'] = engine.ctx_dim_c model_options['n_words'] = engine.n_words model_options_c['n_words'] = engine.n_words print 'n_words:', model_options['n_words'] print model_options_c['dim'],model_options_c['ctx_dim'] # set test values, for debugging idx = engine.kf_train[0] [x_tv, mask_tv, ctx_tv, ctx_mask_tv, ctx_tv_c, ctx_mask_tv_c] = data_engine.prepare_data( engine, [engine.train[index] for index in idx]) print 'init params' t0 = time.time() params = model.init_params(model_options) params_c = model_c.init_params(model_options_c) # reloading model_saved = 'model_files/model_resnet.npz' model_saved_c = 'model_files/model_c3d.npz' assert os.path.isfile(model_saved) print "Reloading model params..." 
params = utils.load_params(model_saved, params) params_c = utils.load_params(model_saved_c, params_c) tparams = utils.init_tparams(params) tparams_c = utils.init_tparams(params_c) trng, use_noise, \ x, mask, ctx, mask_ctx, \ cost, extra = \ model.build_model(tparams, model_options) alphas = extra[1] betas = extra[2] trng_c, use_noise_c, \ x_c, mask_c, ctx_c, mask_ctx_c, \ cost_c, extra_c = \ model_c.build_model(tparams_c, model_options_c) alphas_c = extra_c[1] betas_c = extra_c[2] print 'buliding sampler' f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng) f_init_c, f_next_c = model_c.build_sampler(tparams_c, model_options_c, use_noise_c, trng_c) # before any regularizer print 'building f_log_probs' f_log_probs = theano.function([x, mask, ctx, mask_ctx], -cost, profile=False, on_unused_input='ignore') f_log_probs_c = theano.function([x_c, mask_c, ctx_c, mask_ctx_c], -cost_c, profile=False, on_unused_input='ignore') bad_counter = 0 processes = None queue = None rqueue = None shared_params = None uidx = 0 uidx_best_blue = 0 uidx_best_valid_err = 0 estop = False best_p = utils.unzip(tparams) best_blue_valid = 0 best_valid_err = 999 alphas_ratio = [] for eidx in xrange(max_epochs): n_samples = 0 train_costs = [] grads_record = [] print 'Epoch ', eidx for idx in engine.kf_train: tags = [engine.train[index] for index in idx] n_samples += len(tags) use_noise.set_value(1.) pd_start = time.time() x, mask, ctx, ctx_mask, ctx_c, ctx_mask_c = data_engine.prepare_data( engine, tags) #print 'x:',x.shape,'ctx:',ctx.shape,'ctx_c:',ctx_c.shape pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue if numpy.mod(uidx, saveFreq) == 0: pass if numpy.mod(uidx, sampleFreq) == 0: use_noise.set_value(0.) print '------------- sampling from train ----------' x_s = x mask_s = mask ctx_s = ctx ctx_s_c = ctx_c ctx_mask_s = ctx_mask ctx_mask_s_c = ctx_mask_c model.sample_execute_ensemble(engine, model_options,model_options_c, tparams,tparams_c, f_init,f_init_c, f_next,f_next_c, x_s, ctx_s, ctx_mask_s, ctx_s_c, ctx_mask_s_c, trng) print '------------- sampling from valid ----------' idx = engine.kf_valid[numpy.random.randint(1, len(engine.kf_valid) - 1)] tags = [engine.valid[index] for index in idx] x_s, mask_s, ctx_s, mask_ctx_s, ctx_s_c,mask_ctx_s_c = data_engine.prepare_data(engine, tags) model.sample_execute_ensemble(engine, model_options,model_options_c, tparams,tparams_c, f_init, f_init_c, f_next, f_next_c, x_s, ctx_s, mask_ctx_s, ctx_s_c, mask_ctx_s_c, trng) if validFreq != -1 and numpy.mod(uidx, validFreq) == 0: current_params = utils.unzip(tparams) use_noise.set_value(0.) 
train_err = -1 train_perp = -1 valid_err = -1 valid_perp = -1 test_err = -1 test_perp = -1 mean_ranking = 0 blue_t0 = time.time() scores, processes, queue, rqueue, shared_params = \ metrics.compute_score_ensemble( model_type='attention', model_archive=current_params, options=model_options, options_c=model_options_c, engine=engine, save_dir=save_model_dir, beam=5, n_process=5, whichset='both', on_cpu=False, processes=processes, queue=queue, rqueue=rqueue, shared_params=shared_params, metric=metric, one_time=False, f_init=f_init, f_init_c=f_init_c, f_next=f_next, f_next_c= f_next_c, model=model ) ''' {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]}, 'alternative_valid': {'Bleu_3': 0.40702270203174923, 'Bleu_4': 0.29276570520368456, 'CIDEr': 0.25247168210607884, 'Bleu_2': 0.529069629270047, 'Bleu_1': 0.6804308797115253, 'ROUGE_L': 0.51083584331688392}, 'meteor': {'test': [-1], 'valid': [0.282787550236724]}} ''' valid_B1 = scores['valid']['Bleu_1'] valid_B2 = scores['valid']['Bleu_2'] valid_B3 = scores['valid']['Bleu_3'] valid_B4 = scores['valid']['Bleu_4'] valid_Rouge = scores['valid']['ROUGE_L'] valid_Cider = scores['valid']['CIDEr'] valid_meteor = scores['valid']['METEOR'] test_B1 = scores['test']['Bleu_1'] test_B2 = scores['test']['Bleu_2'] test_B3 = scores['test']['Bleu_3'] test_B4 = scores['test']['Bleu_4'] test_Rouge = scores['test']['ROUGE_L'] test_Cider = scores['test']['CIDEr'] test_meteor = scores['test']['METEOR'] print 'computing meteor/blue score used %.4f sec, '\ 'blue score: %.1f, meteor score: %.1f'%( time.time()-blue_t0, valid_B4, valid_meteor) if test_B4>0.52 and test_meteor>0.32: print 'Saving to %s...'%save_model_dir, numpy.savez( save_model_dir+'model_'+str(uidx)+'.npz', **current_params) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \ 'best valid err so far',best_valid_err print 'valid took %.2f sec'%(time.time() - t0_valid) # end of validatioin sys.exit() if debug: break if estop: break if debug: break # end for loop over minibatches print 'This epoch has seen %d samples, train cost %.2f'%( n_samples, numpy.mean(train_costs)) # end for loop over epochs print 'Optimization ended.' if best_p is not None: utils.zipp(best_p, tparams) use_noise.set_value(0.) valid_err = 0 test_err = 0 if not debug: #if valid: valid_err, valid_perp = model.pred_probs( engine, 'valid', f_log_probs, verbose=model_options['verbose']) #if test: #test_err, test_perp = self.pred_probs( # 'test', f_log_probs, # verbose=model_options['verbose']) print 'stopped at epoch %d, minibatch %d, '\ 'curent Train %.2f, current Valid %.2f, current Test %.2f '%( eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err)) params = copy.copy(best_p) numpy.savez(save_model_dir+'model_best.npz', train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) if history_errs != []: history = numpy.asarray(history_errs) best_valid_idx = history[:,6].argmin() numpy.savetxt(save_model_dir+'train_valid_test.txt', history, fmt='%.4f') print 'final best exp ', history[best_valid_idx] return train_err, valid_err, test_err
class Model(object): def __init__(self): self.layers = Layers() def init_params(self, options): # all parameters params = OrderedDict() # embedding params['Wemb'] = utils.norm_weight(options['vocab_size'], options['word_dim']) # LSTM initial states params = self.layers.get_layer('ff')[0](options, params, prefix='ff_state', nin=options['ctx_dim'], nout=options['lstm_dim']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_memory', nin=options['ctx_dim'], nout=options['lstm_dim']) # decoder: LSTM params = self.layers.get_layer('lstm_cond')[0]( options, params, prefix='bo_lstm', nin=options['word_dim'], dim=options['lstm_dim'], dimctx=options['ctx_dim']) params = self.layers.get_layer('lstm')[0](params, nin=options['lstm_dim'], dim=options['lstm_dim'], prefix='to_lstm') # readout params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit_bo', nin=options['lstm_dim'], nout=options['word_dim']) if options['ctx2out']: params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit_ctx', nin=options['ctx_dim'], nout=options['word_dim']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit_to', nin=options['lstm_dim'], nout=options['word_dim']) # MLP params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['word_dim'], nout=options['vocab_size']) return params def build_model(self, tfparams, options, x, mask, ctx, ctx_mask): use_noise = tf.Variable(False, dtype=tf.bool, trainable=False, name="use_noise") x_shape = tf.shape(x) n_timesteps = x_shape[0] n_samples = x_shape[1] # get word embeddings emb = tf.nn.embedding_lookup( tfparams['Wemb'], x, name="inputs_emb_lookup") # (num_steps,64,512) emb_shape = tf.shape(emb) indices = tf.expand_dims(tf.range(1, emb_shape[0]), axis=1) emb_shifted = tf.scatter_nd(indices, emb[:-1], emb_shape) emb = emb_shifted # count num_frames==28 with tf.name_scope("ctx_mean"): with tf.name_scope("counts"): counts = tf.expand_dims( tf.reduce_sum(ctx_mask, axis=-1, name="reduce_sum_ctx_mask"), 1) # (64,1) ctx_ = ctx ctx0 = ctx_ # (64,28,2048) ctx_mean = tf.reduce_sum( ctx0, axis=1, name="reduce_sum_ctx" ) / counts #mean pooling of {vi} # (64,2048) # initial state/cell with tf.name_scope("init_state"): init_state = self.layers.get_layer('ff')[1]( tfparams, ctx_mean, options, prefix='ff_state', activ='tanh') # (64,512) with tf.name_scope("init_memory"): init_memory = self.layers.get_layer('ff')[1]( tfparams, ctx_mean, options, prefix='ff_memory', activ='tanh') # (64,512) # hstltm = self.layers.build_hlstm(['bo_lstm','to_lstm'], inputs, n_timesteps, init_state, init_memory) with tf.name_scope("bo_lstm"): bo_lstm = self.layers.get_layer('lstm_cond')[1]( tfparams, emb, options, prefix='bo_lstm', mask=mask, context=ctx0, one_step=False, init_state=init_state, init_memory=init_memory, use_noise=use_noise) with tf.name_scope("to_lstm"): to_lstm = self.layers.get_layer('lstm')[1](tfparams, bo_lstm[0], mask=mask, one_step=False, prefix='to_lstm') bo_lstm_h = bo_lstm[0] # (t,64,512) to_lstm_h = to_lstm[0] # (t,64,512) alphas = bo_lstm[2] # (t,64,28) ctxs = bo_lstm[3] # (t,64,2048) betas = bo_lstm[4] # (t,64,) if options['use_dropout']: bo_lstm_h = self.layers.dropout_layer(bo_lstm_h, use_noise) to_lstm_h = self.layers.dropout_layer(to_lstm_h, use_noise) # compute word probabilities logit = self.layers.get_layer('ff')[1]( tfparams, bo_lstm_h, options, prefix='ff_logit_bo', activ='linear') # (t,64,512)*(512,512) = (t,64,512) if options['prev2out']: logit += emb if options['ctx2out']: 
to_lstm_h *= (1 - betas[:, :, None]) # (t,64,512)*(t,64,1) ctxs_beta = self.layers.get_layer('ff')[1]( tfparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') # (t,64,2048)*(2048,512) = (t,64,512) ctxs_beta += self.layers.get_layer('ff')[1]( tfparams, to_lstm_h, options, prefix='ff_logit_to', activ='linear' ) # (t,64,512)+((t,64,512)*(512,512)) = (t,64,512) logit += ctxs_beta logit = utils.tanh(logit) # (t,64,512) if options['use_dropout']: logit = self.layers.dropout_layer(logit, use_noise) # (t,m,n_words) logit = self.layers.get_layer('ff')[1]( tfparams, logit, options, prefix='ff_logit', activ='linear') # (t,64,512)*(512,vocab_size) = (t,64,vocab_size) logit_shape = tf.shape(logit) # (t*m, n_words) probs = tf.nn.softmax( tf.reshape(logit, [logit_shape[0] * logit_shape[1], logit_shape[2] ])) # (t*64, vocab_size) # cost x_flat = tf.reshape(x, [x_shape[0] * x_shape[1]]) # (t*m,) x_flat_shape = tf.shape(x_flat) gather_indices = tf.stack([tf.range(x_flat_shape[0]), x_flat], axis=1) # (t*m,2) cost = -tf.log( tf.gather_nd(probs, gather_indices) + 1e-8) # (t*m,) : pick probs of each word in each timestep cost = tf.reshape(cost, [x_shape[0], x_shape[1]]) # (t,m) cost = tf.reduce_sum( (cost * mask), axis=0 ) # (m,) : sum across all timesteps for each element in batch extra = [probs, alphas, betas] return use_noise, cost, extra def build_sampler(self, tfparams, options, use_noise, ctx0, ctx_mask, x, bo_init_state_sampler, to_init_state_sampler, bo_init_memory_sampler, to_init_memory_sampler, mode=None): # ctx: # frames x ctx_dim ctx_ = ctx0 counts = tf.reduce_sum(ctx_mask, axis=-1) # scalar ctx = ctx_ ctx_mean = tf.reduce_sum(ctx, axis=0) / counts # (2048,) ctx = tf.expand_dims(ctx, 0) # (1,28,2048) # initial state/cell bo_init_state = self.layers.get_layer('ff')[1](tfparams, ctx_mean, options, prefix='ff_state', activ='tanh') # (512,) bo_init_memory = self.layers.get_layer('ff')[1](tfparams, ctx_mean, options, prefix='ff_memory', activ='tanh') # (512,) to_init_state = tf.zeros( shape=(options['lstm_dim'], ), dtype=tf.float32) # DOUBT : constant or not? 
# (512,) to_init_memory = tf.zeros(shape=(options['lstm_dim'], ), dtype=tf.float32) # (512,) init_state = [bo_init_state, to_init_state] init_memory = [bo_init_memory, to_init_memory] print 'building f_init...', f_init = [ctx0] + init_state + init_memory print 'done' init_state = [bo_init_state_sampler, to_init_state_sampler] init_memory = [bo_init_memory_sampler, to_init_memory_sampler] # # if it's the first word, embedding should be all zero emb = tf.cond( tf.reduce_any(x[:, None] < 0), lambda: tf.zeros( shape=(1, tfparams['Wemb'].shape[1]), dtype=tf.float32), lambda: tf.nn.embedding_lookup(tfparams['Wemb'], x)) # (m,512) bo_lstm = self.layers.get_layer('lstm_cond')[1]( tfparams, emb, options, prefix='bo_lstm', mask=None, context=ctx, one_step=True, init_state=init_state[0], init_memory=init_memory[0], use_noise=use_noise, mode=mode) to_lstm = self.layers.get_layer('lstm')[1](tfparams, bo_lstm[0], mask=None, one_step=True, init_state=init_state[1], init_memory=init_memory[1], prefix='to_lstm') next_state = [bo_lstm[0], to_lstm[0]] next_memory = [bo_lstm[1], to_lstm[0]] bo_lstm_h = bo_lstm[0] # (1,512) to_lstm_h = to_lstm[0] # (1,512) alphas = bo_lstm[2] # (1,28) ctxs = bo_lstm[3] # (1,2048) betas = bo_lstm[4] # (1,) if options['use_dropout']: bo_lstm_h = self.layers.dropout_layer(bo_lstm_h, use_noise) to_lstm_h = self.layers.dropout_layer(to_lstm_h, use_noise) # compute word probabilities logit = self.layers.get_layer('ff')[1]( tfparams, bo_lstm_h, options, prefix='ff_logit_bo', activ='linear') # (1,512)*(512,512) = (1,512) if options['prev2out']: logit += emb if options['ctx2out']: to_lstm_h *= (1 - betas[:, None]) # (1,512)*(1,1) = (1,512) ctxs_beta = self.layers.get_layer('ff')[1]( tfparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') # (1,2048)*(2048,512) = (1,512) ctxs_beta += self.layers.get_layer('ff')[1]( tfparams, to_lstm_h, options, prefix='ff_logit_to', activ='linear') # (1,512)+((1,512)*(512,512)) = (1,512) logit += ctxs_beta logit = utils.tanh(logit) # (1,512) if options['use_dropout']: logit = self.layers.dropout_layer(logit, use_noise) # (1,n_words) logit = self.layers.get_layer('ff')[1]( tfparams, logit, options, prefix='ff_logit', activ='linear') # (1,512)*(512,vocab_size) = (1,vocab_size) next_probs = tf.nn.softmax(logit) # next_sample = trng.multinomial(pvals=next_probs).argmax(1) # INCOMPLETE , DOUBT : why is multinomial needed? 
next_sample = tf.multinomial( next_probs, 1) # draw samples with given probabilities (1,1) next_sample_shape = tf.shape(next_sample) next_sample = tf.reshape(next_sample, [next_sample_shape[0]]) # next word probability print 'building f_next...', f_next = [next_probs, next_sample] + next_state + next_memory print 'done' return f_init, f_next def gen_sample(self, sess, tfparams, f_init, f_next, ctx0, ctx_mask, options, k=1, maxlen=30, stochastic=False, restrict_voc=False): ''' ctx0: (28,2048) (f, dim_ctx) ctx_mask: (28,) (f, ) restrict_voc: set the probability of outofvoc words with 0, renormalize ''' if k > 1: assert not stochastic, 'Beam search does not support stochastic sampling' sample = [] sample_score = [] if stochastic: sample_score = 0 live_k = 1 dead_k = 0 hyp_samples = [[]] * live_k hyp_scores = np.zeros(live_k).astype('float32') hyp_states = [] hyp_memories = [] # [(28,2048),(512,),(512,),(512,),(512,)] rval = sess.run(f_init, feed_dict={ "ctx_sampler:0": ctx0, "ctx_mask_sampler:0": ctx_mask }) ctx0 = rval[0] next_state = [] next_memory = [] n_layers_lstm = 2 for lidx in xrange(n_layers_lstm): next_state.append(rval[1 + lidx]) next_state[-1] = next_state[-1].reshape( [live_k, next_state[-1].shape[0]]) for lidx in xrange(n_layers_lstm): next_memory.append(rval[1 + n_layers_lstm + lidx]) next_memory[-1] = next_memory[-1].reshape( [live_k, next_memory[-1].shape[0]]) next_w = -1 * np.ones((1, )).astype('int32') for ii in xrange(maxlen): # return [(1, vocab_size), (1,), (1, 512), (1, 512), (1, 512), (1, 512)] # next_w: vector (1,) # ctx: matrix (28, 2048) # ctx_mask: vector (28,) # next_state: [matrix] [(1, 512), (1, 512)] # next_memory: [matrix] [(1, 512), (1, 512)] rval = sess.run(f_next, feed_dict={ "x_sampler:0": next_w, "ctx_sampler:0": ctx0, "ctx_mask_sampler:0": ctx_mask, 'bo_init_state_sampler:0': next_state[0], 'to_init_state_sampler:0': next_state[1], 'bo_init_memory_sampler:0': next_memory[0], 'to_init_memory_sampler:0': next_memory[1] }) next_p = rval[0] if restrict_voc: raise NotImplementedError() next_w = rval[1] # already argmax sorted next_state = [] for lidx in xrange(n_layers_lstm): next_state.append(rval[2 + lidx]) next_memory = [] for lidx in xrange(n_layers_lstm): next_memory.append(rval[2 + n_layers_lstm + lidx]) if stochastic: sample.append(next_w[0]) # take the most likely one sample_score += next_p[0, next_w[0]] if next_w[0] == 0: break else: # the first run is (1,vocab_size) cand_scores = hyp_scores[:, None] - np.log(next_p) cand_flat = cand_scores.flatten() ranks_flat = cand_flat.argsort()[:(k - dead_k)] voc_size = next_p.shape[1] trans_indices = ranks_flat / voc_size # index of row word_indices = ranks_flat % voc_size # index of col costs = cand_flat[ranks_flat] new_hyp_samples = [] new_hyp_scores = np.zeros(k - dead_k).astype('float32') new_hyp_states = [] for lidx in xrange(n_layers_lstm): new_hyp_states.append([]) new_hyp_memories = [] for lidx in xrange(n_layers_lstm): new_hyp_memories.append([]) for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): new_hyp_samples.append(hyp_samples[ti] + [wi]) new_hyp_scores[idx] = copy.copy(costs[idx]) for lidx in xrange(n_layers_lstm): new_hyp_states[lidx].append( copy.copy(next_state[lidx][ti])) for lidx in xrange(n_layers_lstm): new_hyp_memories[lidx].append( copy.copy(next_memory[lidx][ti])) # check the finished samples new_live_k = 0 hyp_samples = [] hyp_scores = [] hyp_states = [] for lidx in xrange(n_layers_lstm): hyp_states.append([]) hyp_memories = [] for lidx in xrange(n_layers_lstm): 
hyp_memories.append([]) for idx in xrange(len(new_hyp_samples)): if new_hyp_samples[idx][-1] == 0: sample.append(new_hyp_samples[idx]) sample_score.append(new_hyp_scores[idx]) dead_k += 1 else: new_live_k += 1 hyp_samples.append(new_hyp_samples[idx]) hyp_scores.append(new_hyp_scores[idx]) for lidx in xrange(n_layers_lstm): hyp_states[lidx].append(new_hyp_states[lidx][idx]) for lidx in xrange(n_layers_lstm): hyp_memories[lidx].append( new_hyp_memories[lidx][idx]) hyp_scores = np.array(hyp_scores) live_k = new_live_k if new_live_k < 1: break if dead_k >= k: break next_w = np.array([w[-1] for w in hyp_samples]) next_state = [] for lidx in xrange(n_layers_lstm): next_state.append(np.array(hyp_states[lidx])) next_memory = [] for lidx in xrange(n_layers_lstm): next_memory.append(np.array(hyp_memories[lidx])) if not stochastic: # dump every remaining one if live_k > 0: for idx in xrange(live_k): sample.append(hyp_samples[idx]) sample_score.append(hyp_scores[idx]) return sample, sample_score, next_state, next_memory def pred_probs(self, sess, engine, whichset, f_log_probs, verbose=True): probs = [] n_done = 0 NLL = [] L = [] if whichset == 'train': tags = engine.train_data_ids iterator = engine.kf_train elif whichset == 'val': tags = engine.val_data_ids iterator = engine.kf_val elif whichset == 'test': tags = engine.test_data_ids iterator = engine.kf_test else: raise NotImplementedError() n_samples = np.sum([len(index) for index in iterator]) for index in iterator: tag = [tags[i] for i in index] x, mask, ctx, ctx_mask = prepare_data(engine, tag, mode=whichset) pred_probs = sess.run(f_log_probs, feed_dict={ "word_seq_x:0": x, "word_seq_mask:0": mask, "ctx:0": ctx, "ctx_mask:0": ctx_mask }) L.append(mask.sum(0).tolist()) NLL.append((-1 * pred_probs).tolist()) probs.append(pred_probs.tolist()) n_done += len(tag) if verbose: sys.stdout.write('\rComputing LL on %d/%d examples' % (n_done, n_samples)) sys.stdout.flush() print "" probs = utils.flatten_list_of_list(probs) NLL = utils.flatten_list_of_list(NLL) L = utils.flatten_list_of_list(L) perp = 2**(np.sum(NLL) / np.sum(L) / np.log(2)) return -1 * np.mean(probs), perp def sample_execute(self, sess, engine, options, tfparams, f_init, f_next, x, ctx, ctx_mask): stochastic = not options['beam_search'] if stochastic: beam = 1 else: beam = 5 # x = (t,64) # ctx = (64,28,2048) # ctx_mask = (64,28) for jj in xrange(np.minimum(10, x.shape[1])): sample, score, _, _ = self.gen_sample(sess, tfparams, f_init, f_next, ctx[jj], ctx_mask[jj], options, k=beam, maxlen=30, stochastic=stochastic) if not stochastic: best_one = np.argmin(score) sample = sample[best_one] else: sample = sample print 'Truth ', jj, ': ', for vv in x[:, jj]: if vv == 0: break if vv in engine.reverse_vocab: print engine.reverse_vocab[vv], else: print 'UNK', print "" for kk, ss in enumerate([sample]): print 'Sample (', jj, ') ', ': ', for vv in ss: if vv == 0: break if vv in engine.reverse_vocab: print engine.reverse_vocab[vv], else: print 'UNK', print ""
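# Numpy sketch (assumption, not part of the model code) of the two indexing
# tricks used in build_model above: the teacher-forcing shift that feeds the
# previous word's embedding at every timestep (done with tf.scatter_nd), and
# the masked negative log-likelihood that picks the probability of each
# ground-truth word with tf.gather_nd before summing over unmasked timesteps.
import numpy as np

# teacher-forcing shift: timestep 0 becomes all zeros
emb = np.arange(12, dtype=np.float32).reshape(3, 2, 2)   # (t, batch, dim_word)
emb_shifted = np.zeros_like(emb)
emb_shifted[1:] = emb[:-1]

# masked NLL cost on toy data
t, m, vocab = 2, 2, 5
probs = np.full((t * m, vocab), 0.2, dtype=np.float32)   # flattened (t*m, vocab)
x_flat = np.array([1, 3, 0, 2])                          # ground-truth word ids
mask = np.array([[1., 1.], [1., 0.]], dtype=np.float32)  # (t, m), 0 = padding

picked = probs[np.arange(t * m), x_flat]                 # prob of each true word
cost = -np.log(picked + 1e-8).reshape(t, m)
cost = (cost * mask).sum(axis=0)                         # (m,) per-sample cost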
def __init__(self, config):
    self.config = config
    self.layers = Layers(config)
def load_svg(self, filename):
    paths, attributes, svg_attributes = svg2paths2(filename)
    self.layers = Layers()
    self.layers.load_layers(paths, attributes)
class Model(object): def __init__(self): self.layers = Layers() def init_params(self, options): # all parameters params = OrderedDict() # embedding ctx_dim_c = 4096 params['Wemb'] = utils.norm_weight(options['n_words'], options['dim_word']) ctx_dim = options['ctx_dim'] params = self.layers.get_layer('ff')[0](options, params, prefix='ff_state', nin=ctx_dim, nout=options['dim']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_memory', nin=ctx_dim, nout=options['dim']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_state_c', nin=ctx_dim_c, nout=options['dim']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_memory_c', nin=ctx_dim_c, nout=options['dim']) # decoder: LSTM params = self.layers.get_layer('lstm_cond')[0](options, params, prefix='bo_lstm', nin=options['dim_word'], dim=options['dim'], dimctx=ctx_dim) params = self.layers.get_layer('lstm')[0](params, nin=options['dim'], dim=options['dim'], prefix='to_lstm') # readout params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit_bo', nin=options['dim'], nout=options['dim_word']) if options['ctx2out']: params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit_ctx', nin=ctx_dim, nout=options['dim_word']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit_ctx_c', nin=ctx_dim_c, nout=options['dim_word']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit_to', nin=options['dim'], nout=options['dim_word']) params = self.layers.get_layer('ff')[0](options, params, prefix='ff_logit', nin=options['dim_word'], nout=options['n_words']) return params def build_model(self, tparams, options): trng = RandomStreams(1234) use_noise = theano.shared(numpy.float32(0.)) # description string: #words x #samples x = tensor.matrix('x', dtype='int64') mask = tensor.matrix('mask', dtype='float32') # context: #samples x #annotations x dim ctx = tensor.tensor3('ctx', dtype='float32') mask_ctx = tensor.matrix('mask_ctx', dtype='float32') ctx_c = tensor.tensor3('ctx_c', dtype='float32') mask_ctx_c = tensor.matrix('mask_ctx_c', dtype='float32') n_timesteps = x.shape[0] n_samples = x.shape[1] # index into the word embedding matrix, shift it forward in time emb = tparams['Wemb'][x.flatten()].reshape( [n_timesteps, n_samples, options['dim_word']]) emb_shifted = tensor.zeros_like(emb) emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1]) emb = emb_shifted counts = mask_ctx.sum(-1).dimshuffle(0, 'x') ctx_ = ctx ctx_c_ = ctx_c ctx0 = ctx_ ctx_mean = ctx0.sum(1) / counts ctx0_c = ctx_c_ ctx_mean_c = ctx0_c.sum(1) / counts # initial state/cell init_state = self.layers.get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') init_memory = self.layers.get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh') init_state_c = self.layers.get_layer('ff')[1](tparams, ctx_mean_c, options, prefix='ff_state_c', activ='tanh') init_memory_c = self.layers.get_layer('ff')[1](tparams, ctx_mean_c, options, prefix='ff_memory_c', activ='tanh') init_state += init_state_c init_memory += init_memory_c # decoder bo_lstm = self.layers.get_layer('lstm_cond')[1]( tparams, emb, options, prefix='bo_lstm', mask=mask, context=ctx0, context_c=ctx0_c, one_step=False, init_state=init_state, init_memory=init_memory, trng=trng, use_noise=use_noise) to_lstm = self.layers.get_layer('lstm')[1](tparams, bo_lstm[0], mask=mask, one_step=False, prefix='to_lstm') bo_lstm_h = bo_lstm[0] to_lstm_h = to_lstm[0] alphas = bo_lstm[2] 
alphas_c = bo_lstm[3] ctxs = bo_lstm[4] ctxs_c = bo_lstm[5] weight = bo_lstm[6] if options['use_dropout']: bo_lstm_h = self.layers.dropout_layer(bo_lstm_h, use_noise, trng) to_lstm_h = self.layers.dropout_layer(to_lstm_h, use_noise, trng) # compute word probabilities logit = self.layers.get_layer('ff')[1](tparams, bo_lstm_h, options, prefix='ff_logit_bo', activ='linear') if options['prev2out']: logit += emb if options['ctx2out']: betas = weight[:, :, 2] #betas = betas.reshape([betas.shape[1],betas.shape[2]]) to_lstm_h *= betas[:, :, None] ctxs_beta = self.layers.get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') ctxs_beta_c = self.layers.get_layer('ff')[1]( tparams, ctxs_c, options, prefix='ff_logit_ctx_c', activ='linear') to_lstm_h = self.layers.get_layer('ff')[1](tparams, to_lstm_h, options, prefix='ff_logit_to', activ='linear') logit = logit + ctxs_beta + ctxs_beta_c + to_lstm_h logit = utils.tanh(logit) if options['use_dropout']: logit = self.layers.dropout_layer(logit, use_noise, trng) # (t,m,n_words) logit = self.layers.get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') logit_shp = logit.shape # (t*m, n_words) probs = tensor.nnet.softmax( logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]])) # cost x_flat = x.flatten() # (t*m,) cost = -tensor.log(probs[tensor.arange(x_flat.shape[0]), x_flat] + 1e-8) cost = cost.reshape([x.shape[0], x.shape[1]]) cost = (cost * mask).sum(0) extra = [probs, alphas, alphas_c, weight[:, :, 0], weight[:, :, 1]] return trng, use_noise, x, mask, ctx, mask_ctx, ctx_c, mask_ctx_c, cost, extra def build_sampler(self, tparams, options, use_noise, trng, mode=None): # context: #annotations x dim ctx0 = tensor.matrix('ctx_sampler', dtype='float32') # ctx0.tag.test_value = numpy.random.uniform(size=(50,1024)).astype('float32') ctx_mask = tensor.vector('ctx_mask', dtype='float32') # ctx_mask.tag.test_value = numpy.random.binomial(n=1,p=0.5,size=(50,)).astype('float32') ctx0_c = tensor.matrix('ctx_sampler_c', dtype='float32') # ctx0.tag.test_value = numpy.random.uniform(size=(50,1024)).astype('float32') ctx_mask_c = tensor.vector('ctx_mask_c', dtype='float32') ctx_ = ctx0 counts = ctx_mask.sum(-1) ctx = ctx_ ctx_mean = ctx.sum(0) / counts ctx_c_ = ctx0_c counts_c = ctx_mask_c.sum(-1) ctx_c = ctx_c_ ctx_mean_c = ctx_c.sum(0) / counts_c # ctx_mean = ctx.mean(0) ctx = ctx.dimshuffle('x', 0, 1) # initial state/cell bo_init_state = self.layers.get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh') bo_init_memory = self.layers.get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh') bo_init_state_c = self.layers.get_layer('ff')[1](tparams, ctx_mean_c, options, prefix='ff_state_c', activ='tanh') bo_init_memory_c = self.layers.get_layer('ff')[1](tparams, ctx_mean_c, options, prefix='ff_memory_c', activ='tanh') bo_init_state += bo_init_state_c bo_init_memory += bo_init_memory_c to_init_state = tensor.alloc(0., options['dim']) to_init_memory = tensor.alloc(0., options['dim']) init_state = [bo_init_state, to_init_state] init_memory = [bo_init_memory, to_init_memory] print 'Building f_init...', f_init = theano.function([ctx0, ctx_mask, ctx0_c, ctx_mask_c], [ctx0] + init_state + init_memory, name='f_init', on_unused_input='ignore', profile=False, mode=mode) print 'Done' x = tensor.vector('x_sampler', dtype='int64') init_state = [ tensor.matrix('bo_init_state', dtype='float32'), tensor.matrix('to_init_state', dtype='float32') ] init_memory = [ 
tensor.matrix('bo_init_memory', dtype='float32'), tensor.matrix('to_init_memory', dtype='float32') ] # if it's the first word, emb should be all zero emb = tensor.switch(x[:, None] < 0, tensor.alloc(0., 1, tparams['Wemb'].shape[1]), tparams['Wemb'][x]) bo_lstm = self.layers.get_layer('lstm_cond')[1]( tparams, emb, options, prefix='bo_lstm', mask=None, context=ctx, context_c=ctx_c, one_step=True, init_state=init_state[0], init_memory=init_memory[0], trng=trng, use_noise=use_noise, mode=mode) to_lstm = self.layers.get_layer('lstm')[1](tparams, bo_lstm[0], mask=None, one_step=True, init_state=init_state[1], init_memory=init_memory[1], prefix='to_lstm') next_state = [bo_lstm[0], to_lstm[0]] next_memory = [bo_lstm[1], to_lstm[0]] bo_lstm_h = bo_lstm[0] to_lstm_h = to_lstm[0] alphas = bo_lstm[2] alphas_c = bo_lstm[3] ctxs = bo_lstm[4] ctxs_c = bo_lstm[5] weight = bo_lstm[6] if options['use_dropout']: bo_lstm_h = self.layers.dropout_layer(bo_lstm_h, use_noise, trng) to_lstm_h = self.layers.dropout_layer(to_lstm_h, use_noise, trng) logit = self.layers.get_layer('ff')[1](tparams, bo_lstm_h, options, prefix='ff_logit_bo', activ='linear') if options['prev2out']: logit += emb if options['ctx2out']: betas = weight[:, 2] # betas = betas.reshape([betas.shape[1],betas.shape[2]]) to_lstm_h *= betas[:, None] ctxs_beta = self.layers.get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear') ctxs_beta_c = self.layers.get_layer('ff')[1]( tparams, ctxs_c, options, prefix='ff_logit_ctx_c', activ='linear') to_lstm_h = self.layers.get_layer('ff')[1](tparams, to_lstm_h, options, prefix='ff_logit_to', activ='linear') logit = logit + ctxs_beta + ctxs_beta_c + to_lstm_h logit = utils.tanh(logit) if options['use_dropout']: logit = self.layers.dropout_layer(logit, use_noise, trng) logit = self.layers.get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear') logit_shp = logit.shape next_probs = tensor.nnet.softmax(logit) next_sample = trng.multinomial(pvals=next_probs).argmax(1) # next word probability print 'building f_next...' 
f_next = theano.function( [x, ctx0, ctx_mask, ctx0_c, ctx_mask_c] + init_state + init_memory, [next_probs, next_sample] + next_state + next_memory, name='f_next', profile=False, mode=mode, on_unused_input='ignore') print 'Done' return f_init, f_next def gen_sample(self, tparams, f_init, f_next, ctx0, ctx0_c, ctx_mask, ctx_mask_c, options, trng=None, k=1, maxlen=30, stochastic=False, restrict_voc=False): ''' ctx0: (26,1024) ctx_mask: (26,) restrict_voc: set the probability of outofvoc words with 0, renormalize ''' if k > 1: assert not stochastic, 'Beam search does not support stochastic sampling' sample = [] sample_score = [] if stochastic: sample_score = 0 live_k = 1 dead_k = 0 hyp_samples = [[]] * live_k hyp_scores = numpy.zeros(live_k).astype('float32') hyp_states = [] hyp_memories = [] # [(26,1024),(512,),(512,)] rval = f_init(ctx0, ctx_mask, ctx0_c, ctx_mask_c) ctx0 = rval[0] next_state = [] next_memory = [] n_layers_lstm = 2 for lidx in xrange(n_layers_lstm): next_state.append(rval[1 + lidx]) next_state[-1] = next_state[-1].reshape( [live_k, next_state[-1].shape[0]]) for lidx in xrange(n_layers_lstm): next_memory.append(rval[1 + n_layers_lstm + lidx]) next_memory[-1] = next_memory[-1].reshape( [live_k, next_memory[-1].shape[0]]) next_w = -1 * numpy.ones((1, )).astype('int64') # next_state: [(1,512)] # next_memory: [(1,512)] for ii in xrange(maxlen): # return [(1, 50000), (1,), (1, 512), (1, 512)] # next_w: vector # ctx: matrix # ctx_mask: vector # next_state: [matrix] # next_memory: [matrix] rval = f_next(*([next_w, ctx0, ctx_mask, ctx0_c, ctx_mask_c] + next_state + next_memory)) next_p = rval[0] if restrict_voc: raise NotImplementedError() next_w = rval[1] # already argmax sorted next_state = [] for lidx in xrange(n_layers_lstm): next_state.append(rval[2 + lidx]) next_memory = [] for lidx in xrange(n_layers_lstm): next_memory.append(rval[2 + n_layers_lstm + lidx]) if stochastic: sample.append(next_w[0]) # take the most likely one sample_score += next_p[0, next_w[0]] if next_w[0] == 0: break else: # the first run is (1,50000) cand_scores = hyp_scores[:, None] - numpy.log(next_p) cand_flat = cand_scores.flatten() ranks_flat = cand_flat.argsort()[:(k - dead_k)] voc_size = next_p.shape[1] trans_indices = ranks_flat / voc_size # index of row word_indices = ranks_flat % voc_size # index of col costs = cand_flat[ranks_flat] new_hyp_samples = [] new_hyp_scores = numpy.zeros(k - dead_k).astype('float32') new_hyp_states = [] for lidx in xrange(n_layers_lstm): new_hyp_states.append([]) new_hyp_memories = [] for lidx in xrange(n_layers_lstm): new_hyp_memories.append([]) for idx, [ti, wi] in enumerate(zip(trans_indices, word_indices)): new_hyp_samples.append(hyp_samples[ti] + [wi]) new_hyp_scores[idx] = copy.copy(costs[idx]) for lidx in xrange(n_layers_lstm): new_hyp_states[lidx].append( copy.copy(next_state[lidx][ti])) for lidx in xrange(n_layers_lstm): new_hyp_memories[lidx].append( copy.copy(next_memory[lidx][ti])) # check the finished samples new_live_k = 0 hyp_samples = [] hyp_scores = [] hyp_states = [] for lidx in xrange(n_layers_lstm): hyp_states.append([]) hyp_memories = [] for lidx in xrange(n_layers_lstm): hyp_memories.append([]) for idx in xrange(len(new_hyp_samples)): if new_hyp_samples[idx][-1] == 0: sample.append(new_hyp_samples[idx]) sample_score.append(new_hyp_scores[idx]) dead_k += 1 else: new_live_k += 1 hyp_samples.append(new_hyp_samples[idx]) hyp_scores.append(new_hyp_scores[idx]) for lidx in xrange(n_layers_lstm): hyp_states[lidx].append(new_hyp_states[lidx][idx]) for lidx 
in xrange(n_layers_lstm): hyp_memories[lidx].append( new_hyp_memories[lidx][idx]) hyp_scores = numpy.array(hyp_scores) live_k = new_live_k if new_live_k < 1: break if dead_k >= k: break next_w = numpy.array([w[-1] for w in hyp_samples]) next_state = [] for lidx in xrange(n_layers_lstm): next_state.append(numpy.array(hyp_states[lidx])) next_memory = [] for lidx in xrange(n_layers_lstm): next_memory.append(numpy.array(hyp_memories[lidx])) if not stochastic: # dump every remaining one if live_k > 0: for idx in xrange(live_k): sample.append(hyp_samples[idx]) sample_score.append(hyp_scores[idx]) return sample, sample_score, next_state, next_memory def pred_probs(self, engine, whichset, f_log_probs, verbose=True): probs = [] n_done = 0 NLL = [] L = [] if whichset == 'train': tags = engine.train iterator = engine.kf_train elif whichset == 'valid': tags = engine.valid iterator = engine.kf_valid elif whichset == 'test': tags = engine.test iterator = engine.kf_test else: raise NotImplementedError() n_samples = numpy.sum([len(index) for index in iterator]) for index in iterator: tag = [tags[i] for i in index] x, mask, ctx, ctx_mask, ctx_c, ctx_mask_c = prepare_data( engine, tag) pred_probs = f_log_probs(x, mask, ctx, ctx_mask, ctx_c, ctx_mask_c) L.append(mask.sum(0).tolist()) NLL.append((-1 * pred_probs).tolist()) probs.append(pred_probs.tolist()) n_done += len(tag) if verbose: sys.stdout.write('\rComputing LL on %d/%d examples' % (n_done, n_samples)) sys.stdout.flush() print probs = utils.flatten_list_of_list(probs) NLL = utils.flatten_list_of_list(NLL) L = utils.flatten_list_of_list(L) perp = 2**(numpy.sum(NLL) / numpy.sum(L) / numpy.log(2)) return -1 * numpy.mean(probs), perp def sample_execute(self, engine, options, tparams, f_init, f_next, x, ctx, ctx_c, ctx_mask, ctx_mask_c, trng): stochastic = False for jj in xrange(numpy.minimum(10, x.shape[1])): sample, score, _, _ = self.gen_sample(tparams, f_init, f_next, ctx[jj], ctx_c[jj], ctx_mask[jj], ctx_mask_c[jj], options, trng=trng, k=5, maxlen=30, stochastic=stochastic) if not stochastic: best_one = numpy.argmin(score) sample = sample[best_one] else: sample = sample print 'Truth ', jj, ': ', for vv in x[:, jj]: if vv == 0: break if vv in engine.word_idict: print engine.word_idict[vv], else: print 'UNK', print for kk, ss in enumerate([sample]): print 'Sample (', jj, ') ', ': ', for vv in ss: if vv == 0: break if vv in engine.word_idict: print engine.word_idict[vv], else: print 'UNK', print
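# Numpy sketch (assumption) of two pieces of the Theano model above. First, one
# beam-expansion step from gen_sample: every live hypothesis is extended by
# every word, scores accumulate as negative log-probabilities, and only the k
# best flattened candidates survive; the row index recovers the parent
# hypothesis and the column index the chosen word. Second, the perplexity
# formula from pred_probs, which is just exp(total NLL / total word count).
import numpy as np

# one beam-expansion step
k = 2
hyp_scores = np.array([0.5, 1.0], dtype=np.float32)        # 2 live hypotheses
next_p = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.8, 0.1]], dtype=np.float32)     # (live_k, vocab)
cand_scores = hyp_scores[:, None] - np.log(next_p)
cand_flat = cand_scores.flatten()
ranks_flat = cand_flat.argsort()[:k]
trans_indices = ranks_flat // next_p.shape[1]              # parent hypothesis
word_indices = ranks_flat % next_p.shape[1]                # chosen word id
costs = cand_flat[ranks_flat]

# perplexity from accumulated per-minibatch NLL and word counts
NLL = [2.0, 4.0]
L = [1, 3]
perp = 2 ** (np.sum(NLL) / np.sum(L) / np.log(2))
assert np.isclose(perp, np.exp(np.sum(NLL) / np.sum(L)))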
class Scene(object): def __init__(self): self.db = Database('flatstack.db') self.db.create_default_tables() self.scene = canvas() self.scene.background = color.gray(0.8) self.scene.forward = vec(0,-0.2,-1) self.scene.fov = 0.2 self.set_range() self.scene.caption = """Right button drag or Ctrl-drag to rotate "camera" to view scene. To zoom, drag with middle button or Alt/Option depressed, or use scroll wheel. On a two-button mouse, middle is left + right. Touch screen: pinch/extend to zoom, swipe or two-finger rotate.\n""" self.clear() def clear(self): self.objects = [] for a in self.scene.objects: a.visible = False del a self.layers = None def set_range(self): #Todo: range to enclose bounding box of all points self.scene.range = 100 def rate(self, sceneRate): rate(sceneRate) def apply(self): for l in self.layers.layers: extr = extrusion(path=[vec(0,0,l.depth), vec(0,0,0)], color=color.cyan, shape=[ l.path ], pos=(self.vector_to_vec(l.position) + vec(0,0,l.depth / 2)), angle=l.angle, axis=self.vector_to_vec(l.axis)) if l.showAxis: self.draw_axis(l) if l.showPoints: self.draw_points(l) def draw_axis(self, layer): position = self.vector_to_vec(layer.position) mArrow = arrow(angle=layer.angle, axis=self.vector_to_vec(layer.axis), color=color.orange, length=40, pos=position) rArrow = arrow(axis=vec(1,0,0), color=color.red, length=50, pos=position, shaftwidth=1) gArrow = arrow(axis=vec(0,1,0), color=color.green, length=50, pos=position, shaftwidth=1) bArrow = arrow(axis=vec(0,0,1), color=color.blue, length=50, pos=position, shaftwidth=1) def draw_points(self, layer): position = self.vector_to_vec(layer.position) # print(layer.path) # for p in layer.path: # pBall = sphere(pos=(vec(p[0],p[1], 0) + position), radius=5) for v in layer.volume: p = self.vector_to_vec(v) pBall = sphere(pos=p, radius=5) def load_svg(self, filename): paths, attributes, svg_attributes = svg2paths2(filename) self.layers = Layers() self.layers.load_layers(paths, attributes) # def populate_db(self, filename): # paths, attributes, svg_attributes = svg2paths2(filename) # layers = Layers() # #layers.load_layers(paths, attributes) # #for l in layers.layers: # # print(l) # joints = {} # layerlist = [] # for p, a in zip(paths, attributes): # if 'fsjoint' in a: # if a['fsjoint'] in joints: # joints[a['fsjoint']].append(p) # else: # joints[a['fsjoint']] = [p] # else: # #hacky. gross. # el, ea = Layer(p,a).explode() # self.db.insert_layer(ea) # #self.layers = self.translate_joints(joints, layers) def vector_to_vec(self, inVec): return vec(inVec.to_points()[0], inVec.to_points()[1], inVec.to_points()[2])
class HAN: def __init__(self, config): self.config = config self.layers = Layers(config) def build_HAN_net(self): X_id = self.layers.X_input() senti_Y = self.layers.senti_Y_input() table = self.layers.word_embedding_table() mask = self.layers.padded_word_mask(X_id) X = self.layers.lookup(X_id, table, mask) seq_len = self.layers.sequence_length(X_id) sent_repr_ls = [] for i in range(self.config['model']['sentAtt_num']): name = '_layer%d' % i X = self.layers.biLSTM(X, seq_len, name) graph = tf.get_default_graph() tf.add_to_collection( 'reg', tf.contrib.layers.l2_regularizer( self.config['model']['reg_rate'])(graph.get_tensor_by_name( 'biLSTM%s/bidirectional_rnn/fw/lstm_cell/kernel:0' % name))) tf.add_to_collection( 'reg', tf.contrib.layers.l2_regularizer( self.config['model']['reg_rate'])(graph.get_tensor_by_name( 'biLSTM%s/bidirectional_rnn/bw/lstm_cell/kernel:0' % name))) sent_att = self.layers.sent_attention(X, X_id) # (batch size, max sent len) sent_repr = self.layers.sent_repr(sent_att, X) sent_repr_ls.append(sent_repr) # (batch size, sentAtt_num * max sent len) sent_repr = tf.concat(sent_repr_ls, axis=1) senti_score = self.layers.score(sent_repr) pred = self.layers.senti_prediction(senti_score) loss = self.layers.senti_loss(senti_score, senti_Y) train_step = tf.train.AdamOptimizer( self.config['model']['lr']).minimize(loss) saver = tf.train.Saver(tf.global_variables(), max_to_keep=2) return { 'loss': loss, 'pred': pred, 'graph': tf.get_default_graph(), 'train_step': train_step, 'saver': saver }
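# Sketch (assumption: this step is not shown in the snippet above, and it
# requires TF 1.x with tf.contrib available) of how the L2 penalties collected
# under the 'reg' key are commonly folded into the objective before the
# optimizer is built, using the same tf.contrib.layers API as the HAN code.
import tensorflow as tf

w = tf.Variable(tf.ones([3, 3]), name='toy_kernel')
tf.add_to_collection('reg', tf.contrib.layers.l2_regularizer(0.01)(w))
data_loss = tf.constant(1.0)
reg_terms = tf.get_collection('reg')
total_loss = data_loss + tf.add_n(reg_terms)
train_step = tf.train.AdamOptimizer(1e-3).minimize(total_loss)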
def __init__(self):
    self.layers = Layers()
class Model: def __init__(self, config): self.config = config self.layers = Layers(config) def build_senti_net(self): X_id = self.layers.X_input() senti_Y = self.layers.senti_Y_input() table = self.layers.word_embedding_table() mask = self.layers.padded_word_mask(X_id) X = self.layers.lookup(X_id, table, mask) seq_len = self.layers.sequence_length(X_id) bisru_name = 'share' for i in range(self.config['model']['biSRU']['shared_layers_num']): # (batch size, max sent len, rnn_dim) X = self.layers.biSRU(X, seq_len, name=bisru_name) graph = tf.get_default_graph() tf.add_to_collection( 'reg', tf.contrib.layers.l2_regularizer(self.config['model']['reg_rate'])( graph.get_tensor_by_name( 'biSRU_%s/bidirectional_rnn/fw/sru_cell/kernel:0' % bisru_name))) tf.add_to_collection( 'reg', tf.contrib.layers.l2_regularizer(self.config['model']['reg_rate'])( graph.get_tensor_by_name( 'biSRU_%s/bidirectional_rnn/bw/sru_cell/kernel:0' % bisru_name))) # (batch size, rnn dim) sent_repr_ls = [] for i in range(self.config['model']['biSRU']['separated_layers_num']): bisru_name = 'sep_layer' + str(i) X = self.layers.biSRU(X, seq_len, name=bisru_name) tf.add_to_collection( 'reg', tf.contrib.layers.l2_regularizer( self.config['model']['reg_rate'])(graph.get_tensor_by_name( 'biSRU_%s/bidirectional_rnn/fw/sru_cell/kernel:0' % bisru_name))) tf.add_to_collection( 'reg', tf.contrib.layers.l2_regularizer( self.config['model']['reg_rate'])(graph.get_tensor_by_name( 'biSRU_%s/bidirectional_rnn/bw/sru_cell/kernel:0' % bisru_name))) # (attr num, rnn dim) A = self.layers.attr_matrix(name=bisru_name) # (batch size, attr num, max sent len) att = self.layers.attention(A, X, X_id) # (batch size, attr num, rnn dim) sent_repr = self.layers.sent_repr(att, X) sent_repr_ls.append(sent_repr) # (batch size, attr num, sep bisru layers num * rnn dim) sent_repr = tf.concat(sent_repr_ls, axis=2) senti_score = self.layers.senti_score(sent_repr) pred = self.layers.senti_prediction(senti_score) loss = self.layers.senti_loss(senti_score, senti_Y) train_step = tf.train.AdamOptimizer( self.config['model']['lr']).minimize(loss) saver = tf.train.Saver(tf.global_variables(), max_to_keep=2) return { 'loss': loss, 'pred': pred, 'graph': tf.get_default_graph(), 'train_step': train_step, 'saver': saver }
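# Numpy sketch (assumption) of the representation stacking at the end of
# build_senti_net above: each separated biSRU layer produces a
# (batch, attr num, rnn dim) sentence representation, and the layers are
# concatenated along the last axis before the sentiment score is computed.
import numpy as np

layer_1 = np.zeros((2, 4, 8))                            # (batch, attr num, rnn dim)
layer_2 = np.ones((2, 4, 8))
sent_repr = np.concatenate([layer_1, layer_2], axis=2)   # (2, 4, 16)
assert sent_repr.shape == (2, 4, 16)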