def _body(layer_id, inputs, prev_c, prev_h, anchors, anchors_w_1, arc_seq, entropy, log_prob):
    """tf.while_loop body: sample one cell (2 input indices + 2 ops) of the arc.

    Writes four entries into ``arc_seq`` at offset ``4 * (layer_id - 2)``,
    interleaved as [index_1, op_1, index_2, op_2], and accumulates the
    sampler's log-prob and entropy into the loop-carried accumulators.

    NOTE(review): ``self`` and ``use_bias`` are read from an enclosing scope,
    so this must be defined inside a sampler method (an identical nested copy
    exists in a `_build_sampler` later in this file) -- confirm before using
    it standalone.
    """
    # Anchors we may attend over: every layer with id < layer_id.
    indices = tf.range(0, layer_id, dtype=tf.int32)
    # Layers 0 and 1 are the cell inputs, so cell layer_id starts here.
    start_id = 4 * (layer_id - 2)
    prev_layers = []
    for i in range(2):  # index_1, index_2
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        # Attention over previous anchors: v^T tanh(W1*h_j + W2*h_t).
        query = anchors_w_1.gather(indices)
        query = tf.reshape(query, [layer_id, self.lstm_size])
        query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2))
        query = tf.matmul(query, self.v_attn)
        logits = tf.reshape(query, [1, layer_id])
        if self.temperature is not None:
            logits /= self.temperature
        if self.tanh_constant is not None:
            logits = self.tanh_constant * tf.tanh(logits)
        # Sample which previous layer feeds input i of this cell.
        index = tf.multinomial(logits, 1)
        index = tf.to_int32(index)
        index = tf.reshape(index, [1])
        arc_seq = arc_seq.write(start_id + 2 * i, index)
        # Cross-entropy at the sampled label == -log pi(index).
        curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=index)
        log_prob += curr_log_prob
        curr_ent = tf.stop_gradient(tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.nn.softmax(logits)))
        entropy += curr_ent
        # reduce_sum collapses the shape-[1] tensor to the scalar read index.
        prev_layers.append(anchors.read(tf.reduce_sum(index)))
        inputs = prev_layers[-1]
    for i in range(2):  # op_1, op_2
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        logits = tf.matmul(next_h[-1], self.w_soft) + self.b_soft
        if self.temperature is not None:
            logits /= self.temperature
        if self.tanh_constant is not None:
            op_tanh = self.tanh_constant / self.op_tanh_reduce
            logits = op_tanh * tf.tanh(logits)
        if use_bias:
            logits += self.b_soft_no_learn
        op_id = tf.multinomial(logits, 1)
        op_id = tf.to_int32(op_id)
        op_id = tf.reshape(op_id, [1])
        arc_seq = arc_seq.write(start_id + 2 * i + 1, op_id)
        curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=op_id)
        log_prob += curr_log_prob
        curr_ent = tf.stop_gradient(tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.nn.softmax(logits)))
        entropy += curr_ent
        inputs = tf.nn.embedding_lookup(self.w_emb, op_id)
    # One more LSTM step on the last op embedding produces this cell's anchor.
    next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
    anchors = anchors.write(layer_id, next_h[-1])
    anchors_w_1 = anchors_w_1.write(layer_id, tf.matmul(next_h[-1], self.w_attn_1))
    # Restart the next cell from the learned "go" embedding.
    inputs = self.g_emb
    return (layer_id + 1, inputs, next_c, next_h, anchors, anchors_w_1, arc_seq, entropy, log_prob)
def _build_sampler(self):
    """Build the sampler ops and the log_prob ops.

    Samples one config id per (layer, branch) step with the stacked-LSTM
    controller and records each sample's -log pi (the sparse softmax
    cross-entropy at the sampled label).

    Sets: self.sample_arc, self.sample_log_probs, self.ppl, self.all_h.
    """
    arc_seq = []
    sample_log_probs = []
    all_h = []

    # sampler ops
    inputs = self.g_emb
    # Fixed: Python-2-only `xrange` -> `range`, consistent with the other
    # samplers in this file.
    prev_c = [
        tf.zeros([1, self.lstm_size], dtype=tf.float32)
        for _ in range(self.lstm_num_layers)
    ]
    prev_h = [
        tf.zeros([1, self.lstm_size], dtype=tf.float32)
        for _ in range(self.lstm_num_layers)
    ]
    for layer_id in range(self.num_layers):
        for branch_id in range(self.num_branches):
            next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
            # Fixed: the LSTM state was never advanced (`prev_c`/`prev_h`
            # stayed at zeros), making the controller feedforward; every
            # other sampler in this file carries the state forward.
            prev_c, prev_h = next_c, next_h
            all_h.append(tf.stop_gradient(next_h[-1]))
            logits = tf.matmul(next_h[-1], self.w_soft)
            if self.temperature is not None:
                logits /= self.temperature
            if self.tanh_constant is not None:
                logits = self.tanh_constant * tf.tanh(logits)
            # Fixed: replaced deprecated tf.multinomial (the old call carried
            # a "# Deprecated" note) with the categorical sampler already used
            # elsewhere in this file; sampling directly as int32 subsumes the
            # old tf.cast.
            config_id = tf.compat.v1.random.categorical(logits, 1, dtype=tf.int32)
            config_id = tf.reshape(config_id, [1])
            arc_seq.append(config_id)
            # -log pi(config_id), accumulated for REINFORCE.
            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=config_id
            )
            sample_log_probs.append(log_prob)
            inputs = tf.nn.embedding_lookup(self.w_emb, config_id)

    arc_seq = tf.concat(arc_seq, axis=0)
    self.sample_arc = arc_seq

    self.sample_log_probs = tf.concat(sample_log_probs, axis=0)
    # Perplexity of the sampled architecture sequence.
    self.ppl = tf.exp(
        tf.reduce_sum(self.sample_log_probs)
        / tf.cast(self.num_layers * self.num_branches, tf.float32)
    )
    self.all_h = all_h
def _build_sampler(self):
    """Build the sampler ops and the log_prob ops.

    RHN-style sampler: for each of ``rhn_depth`` steps, sample (for
    layer_id > 0) which earlier layer to connect from via attention, then
    sample an activation function id.  Accumulates log-probs and a
    gradient-stopped entropy term.

    Sets: self.sample_arc, self.sample_log_probs, self.ppl,
    self.sample_entropy, self.all_h.
    """
    arc_seq = []
    sample_log_probs = []
    sample_entropy = []
    all_h = []
    all_h_w = []

    # sampler ops
    inputs = self.g_emb
    prev_c, prev_h = [], []
    for _ in range(self.lstm_num_layers):
        prev_c.append(tf.zeros([1, self.lstm_size], dtype=tf.float32))
        prev_h.append(tf.zeros([1, self.lstm_size], dtype=tf.float32))

    # used = tf.zeros([self.rhn_depth, 2], dtype=tf.int32)
    for layer_id in range(self.rhn_depth):
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        all_h.append(next_h[-1])
        # Pre-projected anchor for the attention query below.
        all_h_w.append(tf.matmul(next_h[-1], self.attn_w_1))

        if layer_id > 0:
            # Attention over all previous hidden states:
            # v^T tanh(W1*h_j + W2*h_t), one logit per previous layer.
            query = tf.matmul(next_h[-1], self.attn_w_2)
            query = query + tf.concat(all_h_w[:-1], axis=0)
            query = tf.tanh(query)
            logits = tf.matmul(query, self.attn_v)
            logits = tf.reshape(logits, [1, layer_id])
            if self.temperature is not None:
                logits /= self.temperature
            if self.tanh_constant is not None:
                logits = self.tanh_constant * tf.tanh(logits)
            # Quadratic distance penalty: biases sampling toward recent
            # layers (the /6.0 scale appears empirical -- TODO confirm).
            diff = tf.to_float(layer_id - tf.range(0, layer_id))**2
            logits -= tf.reshape(diff, [1, layer_id]) / 6.0
            skip_index = tf.compat.v1.random.categorical(logits, 1, dtype=tf.int32)
            skip_index = tf.reshape(skip_index, [1])
            arc_seq.append(skip_index)

            # -log pi(skip_index).
            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=skip_index)
            sample_log_probs.append(log_prob)

            # Entropy of the sampled action: p * -log p, gradient-stopped.
            entropy = log_prob * tf.exp(-log_prob)
            sample_entropy.append(tf.stop_gradient(entropy))

            # Next input: the chosen layer's hidden state, damped by how far
            # back the connection reaches.
            inputs = tf.nn.embedding_lookup(tf.concat(all_h[:-1], axis=0), skip_index)
            inputs /= (0.1 + tf.to_float(layer_id - skip_index))
        else:
            inputs = self.g_emb

        # Second LSTM step: sample the activation-function id.
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        logits = tf.matmul(next_h[-1], self.w_soft)
        if self.temperature is not None:
            logits /= self.temperature
        if self.tanh_constant is not None:
            logits = self.tanh_constant * tf.tanh(logits)
        func = tf.compat.v1.random.categorical(logits, 1, dtype=tf.int32)
        func = tf.reshape(func, [1])
        arc_seq.append(func)
        log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=func)
        sample_log_probs.append(log_prob)
        entropy = log_prob * tf.exp(-log_prob)
        sample_entropy.append(tf.stop_gradient(entropy))
        inputs = tf.nn.embedding_lookup(self.w_emb, func)

    arc_seq = tf.concat(arc_seq, axis=0)
    self.sample_arc = arc_seq

    self.sample_log_probs = tf.concat(sample_log_probs, axis=0)
    self.ppl = tf.exp(tf.reduce_mean(self.sample_log_probs))

    sample_entropy = tf.concat(sample_entropy, axis=0)
    self.sample_entropy = tf.reduce_sum(sample_entropy)

    self.all_h = all_h
def _build_sampler(self):
    """Build the sampler ops and the log_prob ops.

    Macro-search controller: per layer, sample a branch/op decision (either
    a single whole-channel id, or a (start, count) channel split per branch)
    and, for layer_id > 0, a binary skip-connection vector over all previous
    layers via attention.

    Sets: self.sample_arc, self.sample_entropy, self.sample_log_prob,
    self.skip_count, self.skip_penaltys.
    """
    # Fixed: Python-2 `print` statements (a syntax error under Python 3) and
    # `xrange` -> print() / range, matching the Python-3 code elsewhere in
    # this file.  All sampling logic is unchanged.
    print("-" * 80)
    print("Build controller sampler")
    anchors = []
    anchors_w_1 = []
    arc_seq = []
    entropys = []
    log_probs = []
    skip_count = []
    skip_penaltys = []

    prev_c = [tf.zeros([1, self.lstm_size], tf.float32)
              for _ in range(self.lstm_num_layers)]
    prev_h = [tf.zeros([1, self.lstm_size], tf.float32)
              for _ in range(self.lstm_num_layers)]
    inputs = self.g_emb
    # Target Bernoulli(skip_target) distribution for the skip-penalty KL.
    skip_targets = tf.constant([1.0 - self.skip_target, self.skip_target],
                               dtype=tf.float32)
    for layer_id in range(self.num_layers):
        if self.search_whole_channels:
            next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
            prev_c, prev_h = next_c, next_h
            logit = tf.matmul(next_h[-1], self.w_soft)
            if self.temperature is not None:
                logit /= self.temperature
            if self.tanh_constant is not None:
                logit = self.tanh_constant * tf.tanh(logit)
            if self.search_for == "macro" or self.search_for == "branch":
                branch_id = tf.multinomial(logit, 1)
                branch_id = tf.to_int32(branch_id)
                branch_id = tf.reshape(branch_id, [1])
            elif self.search_for == "connection":
                branch_id = tf.constant([0], dtype=tf.int32)
            else:
                raise ValueError("Unknown search_for {}".format(self.search_for))
            arc_seq.append(branch_id)
            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logit, labels=branch_id)
            log_probs.append(log_prob)
            # Entropy of the sampled action: p * -log p, gradient-stopped.
            entropy = tf.stop_gradient(log_prob * tf.exp(-log_prob))
            entropys.append(entropy)
            inputs = tf.nn.embedding_lookup(self.w_emb, branch_id)
        else:
            for branch_id in range(self.num_branches):
                # Sample the channel-range start for this branch.
                next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
                prev_c, prev_h = next_c, next_h
                logit = tf.matmul(next_h[-1], self.w_soft["start"][branch_id])
                if self.temperature is not None:
                    logit /= self.temperature
                if self.tanh_constant is not None:
                    logit = self.tanh_constant * tf.tanh(logit)
                start = tf.multinomial(logit, 1)
                start = tf.to_int32(start)
                start = tf.reshape(start, [1])
                arc_seq.append(start)
                log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logit, labels=start)
                log_probs.append(log_prob)
                entropy = tf.stop_gradient(log_prob * tf.exp(-log_prob))
                entropys.append(entropy)
                inputs = tf.nn.embedding_lookup(self.w_emb["start"][branch_id], start)

                # Sample the channel count; mask counts exceeding the range.
                next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
                prev_c, prev_h = next_c, next_h
                logit = tf.matmul(next_h[-1], self.w_soft["count"][branch_id])
                if self.temperature is not None:
                    logit /= self.temperature
                if self.tanh_constant is not None:
                    logit = self.tanh_constant * tf.tanh(logit)
                mask = tf.range(0, limit=self.out_filters-1, delta=1, dtype=tf.int32)
                mask = tf.reshape(mask, [1, self.out_filters - 1])
                mask = tf.less_equal(mask, self.out_filters-1 - start)
                logit = tf.where(mask, x=logit, y=tf.fill(tf.shape(logit), -np.inf))
                count = tf.multinomial(logit, 1)
                count = tf.to_int32(count)
                count = tf.reshape(count, [1])
                # arc stores count+1 so the recorded count is at least 1.
                arc_seq.append(count + 1)
                log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logit, labels=count)
                log_probs.append(log_prob)
                entropy = tf.stop_gradient(log_prob * tf.exp(-log_prob))
                entropys.append(entropy)
                inputs = tf.nn.embedding_lookup(self.w_emb["count"][branch_id], count)

        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h

        if layer_id > 0:
            # Skip connections: per previous layer j, a 2-way logit
            # [-q_j, q_j] with q = v^T tanh(W1*h_j + W2*h_t).
            query = tf.concat(anchors_w_1, axis=0)
            query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2))
            query = tf.matmul(query, self.v_attn)
            logit = tf.concat([-query, query], axis=1)
            if self.temperature is not None:
                logit /= self.temperature
            if self.tanh_constant is not None:
                logit = self.tanh_constant * tf.tanh(logit)
            skip = tf.multinomial(logit, 1)
            skip = tf.to_int32(skip)
            skip = tf.reshape(skip, [layer_id])
            arc_seq.append(skip)

            # KL(skip_prob || skip_targets) penalizes drifting away from the
            # desired skip-connection rate.
            skip_prob = tf.sigmoid(logit)
            kl = skip_prob * tf.log(skip_prob / skip_targets)
            kl = tf.reduce_sum(kl)
            skip_penaltys.append(kl)

            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logit, labels=skip)
            log_probs.append(tf.reduce_sum(log_prob, keep_dims=True))

            entropy = tf.stop_gradient(
                tf.reduce_sum(log_prob * tf.exp(-log_prob), keep_dims=True))
            entropys.append(entropy)

            # Next input: average of the anchors of the skipped-in layers.
            skip = tf.to_float(skip)
            skip = tf.reshape(skip, [1, layer_id])
            skip_count.append(tf.reduce_sum(skip))
            inputs = tf.matmul(skip, tf.concat(anchors, axis=0))
            inputs /= (1.0 + tf.reduce_sum(skip))
        else:
            inputs = self.g_emb

        anchors.append(next_h[-1])
        anchors_w_1.append(tf.matmul(next_h[-1], self.w_attn_1))

    arc_seq = tf.concat(arc_seq, axis=0)
    self.sample_arc = tf.reshape(arc_seq, [-1])

    entropys = tf.stack(entropys)
    self.sample_entropy = tf.reduce_sum(entropys)

    log_probs = tf.stack(log_probs)
    self.sample_log_prob = tf.reduce_sum(log_probs)

    skip_count = tf.stack(skip_count)
    self.skip_count = tf.reduce_sum(skip_count)

    skip_penaltys = tf.stack(skip_penaltys)
    self.skip_penaltys = tf.reduce_mean(skip_penaltys)
def _build_sampler(self, prev_c=None, prev_h=None, use_bias=False):
    """Build the sampler ops and the log_prob ops.

    Micro (cell) search: for each of ``num_cells`` cells, sample two input
    indices and two op ids using a tf.while_loop whose body is ``_body``.

    Args:
      prev_c, prev_h: optional initial LSTM states; when both are None they
        are initialized to zeros.
      use_bias: if True, add ``b_soft_no_learn`` to the op logits.

    Returns:
      (arc_seq, entropy, log_prob, last_c, last_h).
    """
    print("-" * 80)
    print("Build controller sampler")
    # Anchors for the cell's 2 inputs + num_cells sampled layers; they are
    # read repeatedly by the attention, hence clear_after_read=False.
    anchors = tf.TensorArray(tf.float32, size=self.num_cells + 2, clear_after_read=False)
    anchors_w_1 = tf.TensorArray(tf.float32, size=self.num_cells + 2, clear_after_read=False)
    # 4 decisions per cell: index_1, op_1, index_2, op_2.
    arc_seq = tf.TensorArray(tf.int32, size=self.num_cells * 4)
    if prev_c is None:
        assert prev_h is None, "prev_c and prev_h must both be None"
        prev_c = [
            tf.zeros([1, self.lstm_size], tf.float32)
            for _ in range(self.lstm_num_layers)
        ]
        prev_h = [
            tf.zeros([1, self.lstm_size], tf.float32)
            for _ in range(self.lstm_num_layers)
        ]
    inputs = self.g_emb

    # Layers 0 and 1 are the cell's two external inputs: zero anchors, but
    # real attention keys.
    for layer_id in range(2):
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        anchors = anchors.write(layer_id, tf.zeros_like(next_h[-1]))
        anchors_w_1 = anchors_w_1.write(
            layer_id, tf.matmul(next_h[-1], self.w_attn_1))

    def _condition(layer_id, *args):
        # Iterate cells layer_id = 2 .. num_cells + 1.
        return tf.less(layer_id, self.num_cells + 2)

    def _body(layer_id, inputs, prev_c, prev_h, anchors, anchors_w_1,
              arc_seq, entropy, log_prob):
        """Sample one cell: two input indices, then two op ids."""
        indices = tf.range(0, layer_id, dtype=tf.int32)
        start_id = 4 * (layer_id - 2)
        prev_layers = []
        for i in range(2):  # index_1, index_2
            next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
            prev_c, prev_h = next_c, next_h
            # Attention over previous anchors: v^T tanh(W1*h_j + W2*h_t).
            query = anchors_w_1.gather(indices)
            query = tf.reshape(query, [layer_id, self.lstm_size])
            query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2))
            query = tf.matmul(query, self.v_attn)
            logits = tf.reshape(query, [1, layer_id])
            if self.temperature is not None:
                logits /= self.temperature
            if self.tanh_constant is not None:
                logits = self.tanh_constant * tf.tanh(logits)
            index = tf.multinomial(logits, 1)
            index = tf.to_int32(index)
            index = tf.reshape(index, [1])
            arc_seq = arc_seq.write(start_id + 2 * i, index)
            # -log pi(index).
            curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=index)
            log_prob += curr_log_prob
            curr_ent = tf.stop_gradient(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits, labels=tf.nn.softmax(logits)))
            entropy += curr_ent
            # reduce_sum collapses the shape-[1] tensor to a scalar index.
            prev_layers.append(anchors.read(tf.reduce_sum(index)))
            inputs = prev_layers[-1]
        for i in range(2):  # op_1, op_2
            next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
            prev_c, prev_h = next_c, next_h
            logits = tf.matmul(next_h[-1], self.w_soft) + self.b_soft
            if self.temperature is not None:
                logits /= self.temperature
            if self.tanh_constant is not None:
                op_tanh = self.tanh_constant / self.op_tanh_reduce
                logits = op_tanh * tf.tanh(logits)
            if use_bias:
                logits += self.b_soft_no_learn
            op_id = tf.multinomial(logits, 1)
            op_id = tf.to_int32(op_id)
            op_id = tf.reshape(op_id, [1])
            arc_seq = arc_seq.write(start_id + 2 * i + 1, op_id)
            curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=op_id)
            log_prob += curr_log_prob
            curr_ent = tf.stop_gradient(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits, labels=tf.nn.softmax(logits)))
            entropy += curr_ent
            inputs = tf.nn.embedding_lookup(self.w_emb, op_id)
        # One more LSTM step produces this cell's anchor.
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        anchors = anchors.write(layer_id, next_h[-1])
        anchors_w_1 = anchors_w_1.write(
            layer_id, tf.matmul(next_h[-1], self.w_attn_1))
        # Restart the next cell from the "go" embedding.
        inputs = self.g_emb
        return (layer_id + 1, inputs, next_c, next_h, anchors, anchors_w_1,
                arc_seq, entropy, log_prob)

    loop_vars = [
        tf.constant(2, dtype=tf.int32, name="layer_id"),
        inputs,
        prev_c,
        prev_h,
        anchors,
        anchors_w_1,
        arc_seq,
        tf.constant([0.0], dtype=tf.float32, name="entropy"),
        tf.constant([0.0], dtype=tf.float32, name="log_prob"),
    ]

    # parallel_iterations=1: iterations are sequentially dependent (LSTM
    # state and TensorArray writes).
    loop_outputs = tf.while_loop(_condition, _body, loop_vars,
                                 parallel_iterations=1)

    arc_seq = loop_outputs[-3].stack()
    arc_seq = tf.reshape(arc_seq, [-1])
    entropy = tf.reduce_sum(loop_outputs[-2])
    log_prob = tf.reduce_sum(loop_outputs[-1])

    last_c = loop_outputs[-7]
    last_h = loop_outputs[-6]

    return arc_seq, entropy, log_prob, last_c, last_h
def _body(layer_id, inputs, prev_c, prev_h, anchors, anchors_w_1, arc_seq,
          entropy, log_prob):
    """tf.while_loop body: sample one cell's two input indices and two ops.

    NOTE(review): reads ``self`` and ``use_bias`` from the enclosing scope,
    so it must be nested inside a sampler method.
    """
    indices = tf.range(0, layer_id, dtype=tf.int32)
    start_id = 4 * (layer_id - 2)
    prev_layers = []
    for i in range(2):  # index_1, index_2
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        query = anchors_w_1.gather(
            indices)  # gather the rows of anchors_w_1 at `indices` into one tensor
        query = tf.reshape(query, [layer_id, self.lstm_size])
        query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2))
        query = tf.matmul(query, self.v_attn)
        logits = tf.reshape(query, [1, layer_id])  # attention logits over previous layers
        if self.temperature is not None:
            logits /= self.temperature
        if self.tanh_constant is not None:
            logits = self.tanh_constant * tf.tanh(logits)
        '''
        tf.multinomial(logits, num_samples):
        `logits` holds one row of (relative log-)preferences; element j gives
        the probability of drawing index j.  With logits [0.6, 0.4], index 0
        is drawn with probability 0.6 and index 1 with probability 0.4.
        `num_samples` is the number of draws.  For example:
        tf.multinomial(tf.log([[0.1]]), 3) always yields [0, 0, 0];
        tf.multinomial(tf.log([[0.1, 0.6]]), 3) may yield [0, 0, 0] or
        [0, 1, 1], among other outcomes.
        '''
        index = tf.multinomial(logits, 1)
        index = tf.to_int32(index)
        index = tf.reshape(index, [1])  # the sampled value of index_i
        arc_seq = arc_seq.write(start_id + 2 * i, index)  # record the sampled index in arc_seq
        curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=index)
        log_prob += curr_log_prob
        curr_ent = tf.stop_gradient(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.nn.softmax(logits)))
        entropy += curr_ent
        prev_layers.append(anchors.read(tf.reduce_sum(index)))
        inputs = prev_layers[-1]
    for i in range(2):  # op_1, op_2
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        logits = tf.matmul(next_h[-1], self.w_soft) + self.b_soft
        if self.temperature is not None:
            logits /= self.temperature
        if self.tanh_constant is not None:
            op_tanh = self.tanh_constant / self.op_tanh_reduce
            logits = op_tanh * tf.tanh(logits)
        if use_bias:
            logits += self.b_soft_no_learn
        op_id = tf.multinomial(logits, 1)
        op_id = tf.to_int32(op_id)
        op_id = tf.reshape(op_id, [1])
        arc_seq = arc_seq.write(start_id + 2 * i + 1, op_id)
        curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=op_id)
        log_prob += curr_log_prob
        curr_ent = tf.stop_gradient(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.nn.softmax(logits)))
        entropy += curr_ent
        inputs = tf.nn.embedding_lookup(self.w_emb, op_id)
    # One more LSTM step produces this cell's anchor.
    next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
    anchors = anchors.write(layer_id, next_h[-1])
    anchors_w_1 = anchors_w_1.write(
        layer_id, tf.matmul(next_h[-1], self.w_attn_1))
    # Restart the next cell from the "go" embedding.
    inputs = self.g_emb
    return (layer_id + 1, inputs, next_c, next_h, anchors, anchors_w_1,
            arc_seq, entropy, log_prob)
def _build_sampler(self):
    """Build the sampler ops and the log_prob ops.

    Simplified macro controller: only ``search_whole_channels=True`` is
    supported; the skip-connection sampling of the full version is kept below
    as commented-out reference code.

    Sets: self.sample_arc, self.sample_entropy, self.sample_log_prob.
    """
    # Fixed: Python-2 `print` statements (a syntax error under Python 3) and
    # `xrange` -> print() / range, matching the Python-3 code elsewhere in
    # this file.  All sampling logic is unchanged.
    print("-" * 80)
    print("Build controller sampler")
    #anchors = []
    #anchors_w_1 = []
    arc_seq = []
    entropys = []
    log_probs = []
    #skip_count = []
    #skip_penaltys = []

    #initialize the first c and h as zero
    prev_c = [
        tf.zeros([1, self.lstm_size], tf.float32)
        for _ in range(self.lstm_num_layers)
    ]
    prev_h = [
        tf.zeros([1, self.lstm_size], tf.float32)
        for _ in range(self.lstm_num_layers)
    ]
    #shape(self.g_emb) = [1, self.lstm_size]
    inputs = self.g_emb  #the first input = self.g_emb
    #skip_targets = tf.constant([1.0 - self.skip_target, self.skip_target],
    #                           dtype=tf.float32)
    for layer_id in range(self.num_layers):
        if self.search_whole_channels:
            #the shapes of c and h are both [1,self.lstm_size]
            next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
            prev_c, prev_h = next_c, next_h
            #logit is the result of lstm after softmax
            logit = tf.matmul(next_h[-1], self.w_soft)
            if self.temperature is not None:
                logit /= self.temperature
            if self.tanh_constant is not None:
                logit = self.tanh_constant * tf.tanh(logit)
            if self.search_for == "macro" or self.search_for == "branch":
                branch_id = tf.multinomial(logit, 1)
                branch_id = tf.to_int32(branch_id)
                branch_id = tf.reshape(branch_id, [1])
            elif self.search_for == "connection":
                branch_id = tf.constant([0], dtype=tf.int32)
            else:
                raise ValueError("Unknown search_for {}".format(
                    self.search_for))
            arc_seq.append(branch_id)
            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logit, labels=branch_id)
            log_probs.append(log_prob)
            # Entropy of the sampled action: p * -log p, gradient-stopped.
            entropy = tf.stop_gradient(log_prob * tf.exp(-log_prob))
            entropys.append(entropy)
            inputs = tf.nn.embedding_lookup(self.w_emb, branch_id)
        else:
            raise ValueError(
                "Just consider the situation when self.search_whole_channels=true"
            )

        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        # Disabled skip-connection sampling, kept for reference:
        """
        if layer_id > 0:
          query = tf.concat(anchors_w_1, axis=0)
          query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2))
          query = tf.matmul(query, self.v_attn)
          logit = tf.concat([-query, query], axis=1)
          if self.temperature is not None:
            logit /= self.temperature
          if self.tanh_constant is not None:
            logit = self.tanh_constant * tf.tanh(logit)
          skip = tf.multinomial(logit, 1)
          skip = tf.to_int32(skip)
          skip = tf.reshape(skip, [layer_id])
          arc_seq.append(skip)
          skip_prob = tf.sigmoid(logit)
          kl = skip_prob * tf.log(skip_prob / skip_targets)
          kl = tf.reduce_sum(kl)
          skip_penaltys.append(kl)
          log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit, labels=skip)
          log_probs.append(tf.reduce_sum(log_prob, keep_dims=True))
          entropy = tf.stop_gradient(
            tf.reduce_sum(log_prob * tf.exp(-log_prob), keep_dims=True))
          entropys.append(entropy)
          skip = tf.to_float(skip)
          skip = tf.reshape(skip, [1, layer_id])
          skip_count.append(tf.reduce_sum(skip))
          inputs = tf.matmul(skip, tf.concat(anchors, axis=0))
          inputs /= (1.0 + tf.reduce_sum(skip))
        else:
          inputs = self.g_emb
        """
        #anchors.append(next_h[-1])
        #anchors_w_1.append(tf.matmul(next_h[-1], self.w_attn_1))

    arc_seq = tf.concat(arc_seq, axis=0)
    self.sample_arc = tf.reshape(arc_seq, [-1])

    entropys = tf.stack(entropys)
    self.sample_entropy = tf.reduce_sum(entropys)

    log_probs = tf.stack(log_probs)
    self.sample_log_prob = tf.reduce_sum(log_probs)
def _build_sampler(self, prev_c=None, prev_h=None):
    """Build the sampler ops and the log_prob ops.

    Single-input cell sampler: each of ``num_cells`` cells samples one input
    index and one op id (2 entries per cell in ``arc_seq``).

    Args:
      prev_c, prev_h: optional initial LSTM states; zeros when omitted.

    Returns:
      (arc_seq, entropy, log_prob, last_c, last_h).
    """
    anchors = tf.TensorArray(tf.float32,
                             size=self.num_cells + 1,
                             clear_after_read=False)
    anchors_w_1 = tf.TensorArray(tf.float32,
                                 size=self.num_cells + 1,
                                 clear_after_read=False)
    # 2 decisions per cell: index, op.
    arc_seq = tf.TensorArray(tf.int32, size=self.num_cells * 2)
    if prev_c is None or prev_h is None:
        prev_c = [
            tf.zeros([1, self.lstm_size], tf.float32)
            for _ in range(self.lstm_num_layers)
        ]
        prev_h = [
            tf.zeros([1, self.lstm_size], tf.float32)
            for _ in range(self.lstm_num_layers)
        ]
    inputs = self.g_emb

    # Layer 0 is the cell input: zero anchor, real attention key.
    next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
    prev_c, prev_h = next_c, next_h
    anchors = anchors.write(0, tf.zeros_like(next_h[-1]))
    anchors_w_1 = anchors_w_1.write(0, tf.matmul(next_h[-1], self.w_attn_1))

    def _condition(layer_id, *args):
        # BUG FIX: was `self.num_cells + 2`, which ran layer_id up to
        # num_cells + 1 and wrote arc_seq[2 * num_cells + 1] -- out of range
        # for a TensorArray of fixed size num_cells * 2 (and one past the
        # anchors arrays).  Cells are layer_id = 1 .. num_cells.
        return tf.less(layer_id, self.num_cells + 1)

    def _body(layer_id, inputs, prev_c, prev_h, anchors, anchors_w_1,
              arc_seq, entropy, log_prob):
        """Sample one cell: its input index, then its op id."""
        indices = tf.range(0, layer_id, dtype=tf.int32)
        start_id = 2 * (layer_id - 1)
        prev_layers = []

        # index
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        # Attention over previous anchors: v^T tanh(W1*h_j + W2*h_t).
        query = anchors_w_1.gather(indices)
        query = tf.reshape(query, [layer_id, self.lstm_size])
        query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2))
        query = tf.matmul(query, self.v_attn)
        logits = tf.reshape(query, [1, layer_id])
        if self.temperature:
            logits /= self.temperature
        if self.tanh_constant:
            logits = self.tanh_constant * tf.tanh(logits)
        index = tf.multinomial(logits, 1)
        index = tf.to_int32(index)
        index = tf.reshape(index, [1])
        arc_seq = arc_seq.write(start_id, index)
        curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=index)
        log_prob += curr_log_prob
        curr_ent = tf.stop_gradient(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.nn.softmax(logits)))
        entropy += curr_ent
        prev_layers.append(anchors.read(tf.reduce_sum(index)))
        inputs = prev_layers[-1]

        # op
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        logits = tf.matmul(next_h[-1], self.w_soft) + self.b_soft
        if self.temperature:
            logits /= self.temperature
        if self.tanh_constant:
            op_tanh = self.tanh_constant / self.op_tanh_reduce
            logits = op_tanh * tf.tanh(logits)
        op_id = tf.multinomial(logits, 1)
        op_id = tf.to_int32(op_id)
        op_id = tf.reshape(op_id, [1])
        arc_seq = arc_seq.write(start_id + 1, op_id)
        curr_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=op_id)
        log_prob += curr_log_prob
        curr_ent = tf.stop_gradient(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=tf.nn.softmax(logits)))
        entropy += curr_ent
        inputs = tf.nn.embedding_lookup(self.w_emb, op_id)

        # BUG FIX: this cell's anchor was never written, so the attention
        # above read uninitialized TensorArray slots for every layer_id > 1.
        # Mirror the 4-decision sampler in this file: one more LSTM step on
        # the op embedding yields the anchor, then the next cell restarts
        # from the "go" embedding.
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        anchors = anchors.write(layer_id, next_h[-1])
        anchors_w_1 = anchors_w_1.write(
            layer_id, tf.matmul(next_h[-1], self.w_attn_1))
        inputs = self.g_emb

        return (layer_id + 1, inputs, next_c, next_h, anchors, anchors_w_1,
                arc_seq, entropy, log_prob)

    loop_vars = [
        tf.constant(1, dtype=tf.int32, name='layer_id'),
        inputs,
        prev_c,
        prev_h,
        anchors,
        anchors_w_1,
        arc_seq,
        tf.constant([0.0], dtype=tf.float32, name='entropy'),
        tf.constant([0.0], dtype=tf.float32, name='log_prob')
    ]

    # Iterations are sequentially dependent (LSTM state, TensorArrays).
    loop_outputs = tf.while_loop(_condition, _body, loop_vars,
                                 parallel_iterations=1)

    arc_seq = loop_outputs[-3].stack()
    arc_seq = tf.reshape(arc_seq, [-1])
    entropy = tf.reduce_sum(loop_outputs[-2])
    log_prob = tf.reduce_sum(loop_outputs[-1])

    last_c = loop_outputs[2]
    last_h = loop_outputs[3]

    return arc_seq, entropy, log_prob, last_c, last_h
def _build_trainer(self):
    """Build the trainer ops that score a *given* architecture.

    Unlike the samplers, the architecture is fed in through the
    ``self.input_arc`` placeholders instead of being sampled; the controller
    only computes log-probs / entropys / skip penalties for it.

    Sets: self.total_arc_len, self.input_arc, self.onehot_entropy,
    self.onehot_log_prob, self.onehot_skip_count, self.onehot_skip_penaltys.
    """
    print("-" * 80)
    print("Build controller trainer")
    anchors = []
    anchors_w_1 = []

    ops_each_layer = 2 if self.search_count else 1
    # Per layer: its op decisions plus one skip decision per previous layer.
    total_arc_len = sum(
        [ops_each_layer] +
        [ops_each_layer + i for i in range(1, self.num_layers)])
    self.total_arc_len = total_arc_len
    # One scalar int32 placeholder per arc entry.
    self.input_arc = [
        tf.placeholder(shape=(), dtype=tf.int32, name='arc_{}'.format(i))
        for i in range(total_arc_len)
    ]

    entropys = []
    log_probs = []
    skip_count = []
    skip_penaltys = []
    masks = []

    prev_c = [tf.zeros([1, self.lstm_size], tf.float32)
              for _ in range(self.lstm_num_layers)]
    prev_h = [tf.zeros([1, self.lstm_size], tf.float32)
              for _ in range(self.lstm_num_layers)]
    inputs = self.g_emb
    # Target Bernoulli(skip_target) distribution for the skip-penalty KL.
    skip_targets = tf.constant([1.0 - self.skip_target, self.skip_target],
                               dtype=tf.float32)

    arc_pointer = 0  # index of the current layer's first entry in input_arc
    for layer_id in range(self.num_layers):
        ###
        ### for each layer, sample num_branches operations
        ###
        #for branch_id in range(self.num_branches):
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        logit = tf.matmul(next_h[-1], self.w_soft["start"][layer_id])  # out_filter x 1
        if self.temperature is not None:
            logit /= self.temperature
        if self.tanh_constant is not None:
            logit = self.tanh_constant * tf.tanh(logit)
        # start: a random number from 0 to out_filters[i]
        start = self.input_arc[arc_pointer]
        start = tf.reshape(start, [1])
        # -log pi(start) under the controller's current policy.
        log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit, labels=start)
        log_probs.append(log_prob)
        entropy = tf.stop_gradient(log_prob * tf.exp(-log_prob))
        entropys.append(entropy)
        # inputs: get a row slice of [out_filter[i], lstm_size]
        #inputs = tf.nn.embedding_lookup(self.w_emb["start"][branch_id], start)
        inputs = tf.nn.embedding_lookup(self.w_emb["start"][layer_id], start)
        next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
        prev_c, prev_h = next_c, next_h
        if self.search_count:
            #logit = tf.matmul(next_h[-1], self.w_soft["count"][branch_id])
            logit = tf.matmul(next_h[-1], self.w_soft["count"][layer_id])
            if self.temperature is not None:
                logit /= self.temperature
            if self.tanh_constant is not None:
                logit = self.tanh_constant * tf.tanh(logit)
            # mask: a boolean list of length out_filter[i]-1
            # that is true for all <=out_filter[i]-start elements
            mask = tf.range(0, limit=self.out_filters[layer_id]-1, delta=1,
                            dtype=tf.int32)
            mask = tf.reshape(mask, [1, self.out_filters[layer_id] - 1])
            mask = tf.less_equal(mask, self.out_filters[layer_id]-1 - start)
            masks.append([mask, start])
            # tf.where: for index of false in mask, x will be replaced with y
            logit = tf.where(mask, x=logit,
                             y=tf.fill(tf.shape(logit), -np.inf))
            # logit: >out_filter[i]-start will be masked to 0
            # e.g.: if start is 3 and out_filter[i] is 10, then 8,9 will be masked to 0
            count = self.input_arc[arc_pointer+1]
            count = tf.reshape(count, [1])
            # arcs store count+1 (see the sampler); undo the offset here.
            count = count - 1
            #arc_seq.append(count + 1)
            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logit, labels=count)
            log_probs.append(log_prob)
            entropy = tf.stop_gradient(log_prob * tf.exp(-log_prob))
            entropys.append(entropy)
            # inputs: get a row slice of [out_filter[i]-1, lstm_size]
            #inputs = tf.nn.embedding_lookup(self.w_emb["count"][branch_id], count)
            inputs = tf.nn.embedding_lookup(self.w_emb["count"][layer_id], count)
            # NOTE(review): this extra LSTM step is placed inside the
            # `search_count` branch -- the only reading under which it is
            # needed; confirm against the original formatting.
            next_c, next_h = stack_lstm(inputs, prev_c, prev_h, self.w_lstm)
            prev_c, prev_h = next_c, next_h
        ###
        ### sample the connections, unless the first layer
        ### the number `skip` of each layer grows as layer_id grows
        ###
        if layer_id > 0:
            query = tf.concat(anchors_w_1, axis=0)  # layer_id x lstm_size
            # w_attn_2: lstm_size x lstm_size
            query = tf.tanh(query + tf.matmul(next_h[-1], self.w_attn_2))
            # query: layer_id x lstm_size
            ## P(Layer j is an input to layer i) = sigmoid(v^T %*% tanh(W_prev * h_j + W_curr * h_i))
            query = tf.matmul(query, self.v_attn)
            # query: layer_id x 1
            logit = tf.concat([-query, query], axis=1)
            # logit: layer_id x 2
            if self.temperature is not None:
                logit /= self.temperature
            if self.tanh_constant is not None:
                logit = self.tanh_constant * tf.tanh(logit)

            # Skip decisions for this layer come after its op entries.
            skip = self.input_arc[(arc_pointer+ops_each_layer):
                                  (arc_pointer+ops_each_layer + layer_id)]
            #print(layer_id, (arc_pointer+2), (arc_pointer+2 + layer_id), skip)
            skip = tf.reshape(skip, [layer_id])

            # KL(skip_prob || skip_targets) penalizes drifting away from the
            # desired skip-connection rate.
            skip_prob = tf.sigmoid(logit)
            kl = skip_prob * tf.log(skip_prob / skip_targets)
            kl = tf.reduce_sum(kl)
            skip_penaltys.append(kl)

            log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logit, labels=skip)
            log_probs.append(tf.reshape(tf.reduce_sum(log_prob), [-1]))

            entropy = tf.stop_gradient(
                tf.reshape(tf.reduce_sum(log_prob * tf.exp(-log_prob)), [-1]))
            entropys.append(entropy)

            # Next input: average of the anchors of the skipped-in layers.
            skip = tf.to_float(skip)
            skip = tf.reshape(skip, [1, layer_id])
            skip_count.append(tf.reduce_sum(skip))

            inputs = tf.matmul(skip, tf.concat(anchors, axis=0))
            inputs /= (1.0 + tf.reduce_sum(skip))
        else:
            inputs = self.g_emb

        anchors.append(next_h[-1])
        # next_h: 1 x lstm_size
        # anchors_w_1: 1 x lstm_size
        anchors_w_1.append(tf.matmul(next_h[-1], self.w_attn_1))
        arc_pointer += ops_each_layer + layer_id

    entropys = tf.stack(entropys)
    self.onehot_entropy = tf.reduce_sum(entropys)

    log_probs = tf.stack(log_probs)
    self.onehot_log_prob = tf.reduce_sum(log_probs)

    skip_count = tf.stack(skip_count)
    self.onehot_skip_count = tf.reduce_sum(skip_count)

    skip_penaltys = tf.stack(skip_penaltys)
    self.onehot_skip_penaltys = tf.reduce_mean(skip_penaltys)