def test_ceil(self):
    program = Program()
    with program_guard(program):
        input = layers.data(name="input", shape=[16], dtype="float32")
        out = layers.ceil(input, name='ceil')
        self.assertIsNotNone(out)
    print(str(program))
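For reference, a minimal standalone sketch (not part of the test suite above; the variable names `main_prog`, `x`, and `ceil_out` are illustrative) of actually running such a program with an executor to observe the element-wise rounding that `layers.ceil` performs:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers

main_prog = fluid.Program()
with fluid.program_guard(main_prog):
    input = layers.data(name="input", shape=[16], dtype="float32")
    ceil_out = layers.ceil(input)  # element-wise ceiling

exe = fluid.Executor(fluid.CPUPlace())
x = np.random.uniform(-3, 3, size=(2, 16)).astype("float32")
result, = exe.run(main_prog, feed={"input": x}, fetch_list=[ceil_out])
# each entry of `result` equals np.ceil of the corresponding entry of `x`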
def forward(self, x, alpha=1.0, target=None):
    """
    Compute the length of mel from the encoder output using TransformerTTS attention.

    Args:
        x (Variable): shape(B, T, C), dtype float32, the encoder output.
        alpha (float32, optional): the hyperparameter that determines the length of
            the expanded mel sequence, thereby controlling the voice speed.
            Defaults to 1.0.
        target (Variable, optional): shape(B, T_text), dtype int64, the duration of
            each phoneme computed by a pretrained TransformerTTS. Defaults to None.

    Returns:
        output (Variable): shape(B, T, C), the output after expansion.
        duration_predictor_output (Variable): shape(B, T, C), the output of the
            duration predictor.
    """
    duration_predictor_output = self.duration_predictor(x)
    if fluid.framework._dygraph_tracer()._train_mode:
        output = self.LR(x, target)
        return output, duration_predictor_output
    else:
        duration_predictor_output = duration_predictor_output * alpha
        duration_predictor_output = layers.ceil(duration_predictor_output)
        output = self.LR(x, duration_predictor_output)
        mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1)).astype(np.int64)
        mel_pos = layers.unsqueeze(mel_pos, [0])
        return output, mel_pos
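Illustrative only (plain numpy stand-in, values invented): how the inference branch above turns predicted durations into integer frame counts by scaling with alpha and rounding up with ceil, so every phoneme is expanded to at least one mel frame:

import numpy as np

predicted_durations = np.array([[2.3, 0.7, 4.1]], dtype="float32")  # (B, T_text)
alpha = 1.2                                                          # >1 slows speech, <1 speeds it up
frames = np.ceil(predicted_durations * alpha).astype("int64")
# frames == [[3, 1, 5]]; each phoneme gets a whole number of mel frames, never zero here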
def communicate_avg_loss():
    communicate()
    self._generate_avg_loss(main_block, loss, avg_loss)
    next_local_steps = layers.cast(
        layers.ceil(
            layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) *
                        float(init_k_steps))),
        dtype='int64')
    max_local_steps = layers.fill_constant(shape=[1], dtype='int64', value=16)
    min_local_steps = layers.fill_constant(shape=[1], dtype='int64', value=1)
    next_local_steps = layers.elementwise_min(next_local_steps,
                                              max_local_steps)
    next_local_steps = layers.elementwise_max(next_local_steps,
                                              min_local_steps)
    layers.assign(next_local_steps, k_steps)
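A scalar sketch (function and argument names assumed, not from the source) of the adaptive step rule the ops above build, i.e. ceil(sqrt(lr_0 * loss_t / (lr_t * loss_0) * k_0)) clipped to [1, 16]:

import math

def next_k_steps(lr_0, loss_0, lr_t, loss_t, k_0):
    # same expression as the layers.ceil(layers.sqrt(...)) graph above, on scalars
    k = math.ceil(math.sqrt(lr_0 * loss_t / (lr_t * loss_0) * k_0))
    return min(max(k, 1), 16)

# e.g. with an unchanged learning rate and a loss that halved:
# next_k_steps(0.1, 2.0, 0.1, 1.0, 4) == 2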
def __compute_graph_bias(q, graph_attn_mask, pos_win):
    """
    :param q: (batch_size, n_heads, query_len, dim_per_head)
    :param graph_attn_mask: (batch_size, n_head, key_s_len, key_s_len)
    :param pos_win:
    :return:
    """
    # (batch_size, n_heads, query_len, dim_per_head)
    pos_v = layers.fc(input=q,
                      size=d_value,
                      num_flatten_dims=3,
                      param_attr=fluid.ParamAttr(
                          name=name + '_pos_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_pos_fc.b_0')

    # (batch_size, n_heads, query_len, 1)
    pos_s = layers.fc(input=layers.tanh(pos_v),
                      size=1,
                      num_flatten_dims=3,
                      param_attr=fluid.ParamAttr(
                          name=name + '_pos_score_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=False)

    # (batch_size, n_heads, query_len, 1)
    pos = layers.sigmoid(pos_s) * (key_s_len - 1)

    # (batch_size, n_heads, query_len, 1)
    pos_up = layers.cast(layers.ceil(pos), dtype='int64')
    # print("pos_up.shape = %s" % str(pos_up.shape))
    pos_down = layers.cast(layers.floor(pos), dtype='int64')
    # print("pos_down.shape = %s" % str(pos_down.shape))

    batch_ind = layers.range(0, layers.cast(batch_size, dtype='int64'), 1,
                             'int64')
    # print("batch_ind.shape = %s" % str(batch_ind.shape))
    batch_ind = layers.unsqueeze(batch_ind, axes=[1, 2, 3])  # (batch_size, 1, 1, 1)
    batch_ind = layers.expand(
        batch_ind,
        expand_times=[1, n_head, query_len, 1])  # (batch_size, n_heads, query_len, 1)
    # print("batch_ind.shape = %s" % str(batch_ind.shape))

    head_ind = layers.range(0, n_head, 1, 'int64')
    # print("head_ind.shape = %s" % str(head_ind.shape))
    head_ind = layers.unsqueeze(head_ind, axes=[0, 2, 3])  # (1, n_heads, 1, 1)
    head_ind = layers.expand(head_ind,
                             expand_times=[batch_size, 1, query_len, 1])
    # print("head_ind.shape = %s" % str(head_ind.shape))

    query_ind = layers.range(0, layers.cast(query_len, dtype='int64'), 1,
                             'int64')
    # print("query_ind.shape = %s" % str(query_ind.shape))
    query_ind = layers.unsqueeze(query_ind, axes=[0, 1, 3])  # (1, 1, query_len, 1)
    query_ind = layers.expand(query_ind,
                              expand_times=[batch_size, n_head, 1, 1])
    # print("query_ind.shape = %s" % str(query_ind.shape))

    # (batch_size, n_heads, query_len, 4)
    pos_up_ind = layers.concat(
        input=[batch_ind, head_ind, query_ind, pos_up], axis=-1)
    # print("pos_up_ind.shape = %s" % str(pos_up_ind.shape))
    pos_up_ind.stop_gradient = True
    pos_down_ind = layers.concat(
        input=[batch_ind, head_ind, query_ind, pos_down], axis=-1)
    # print("pos_down_ind.shape = %s" % str(pos_down_ind.shape))
    pos_down_ind.stop_gradient = True

    # (batch_size, n_heads, query_len, key_s_len, key_s_len)
    graph_attn_mask = layers.unsqueeze(graph_attn_mask, axes=[2])
    # print("graph_attn_mask.shape = %s" % str(graph_attn_mask.shape))
    graph_attn_mask = layers.expand(graph_attn_mask,
                                    expand_times=[1, 1, query_len, 1, 1])
    # print("graph_attn_mask.shape = %s" % str(graph_attn_mask.shape))

    # (batch_size, n_heads, query_len, key_s_len)
    graph_attn_mask_up = layers.gather_nd(input=graph_attn_mask,
                                          index=pos_up_ind)
    graph_attn_mask_down = layers.gather_nd(input=graph_attn_mask,
                                            index=pos_down_ind)
    # print("graph_attn_mask_up.shape = %s" % str(graph_attn_mask_up.shape))
    # print("graph_attn_mask_down.shape = %s" % str(graph_attn_mask_down.shape))
    # print("pos_up.shape = %s" % str(pos_up.shape))
    # print("pos_down.shape = %s" % str(pos_down.shape))

    # linearly combine up and down (batch_size, n_heads, query_len, key_s_len)
    graph_attn_mask_select = \
        graph_attn_mask_up * (1.0 - (layers.cast(pos_up, dtype='float32') - pos)) + \
        graph_attn_mask_down * (1.0 - (pos - layers.cast(pos_down, dtype='float32')))
    # print("graph_attn_mask_select.shape = %s" % str(graph_attn_mask_select.shape))

    # re-weight the attention score with gaussian weights
    gaussian_w = (
        -0.5 * graph_attn_mask_select * graph_attn_mask_select) / (
            (0.5 * pos_win)**2)  # [batch, n_heads, query_len, key_s_len]
    # print("gaussian_w.shape = %s" % str(gaussian_w.shape))
    return gaussian_w
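A standalone numpy sketch (toy values, names invented) of the two steps above that use ceil/floor: interpolating the gathered mask between the two integer neighbours of a fractional position, then turning the result into the gaussian weight used, per the comment above, to re-weight the attention score:

import numpy as np

pos = np.array([2.3])                                   # predicted fractional key position
pos_up, pos_down = np.ceil(pos), np.floor(pos)          # integer neighbours
mask_up, mask_down = np.array([1.5]), np.array([0.5])   # gathered mask values (toy)
mask_select = mask_up * (1.0 - (pos_up - pos)) + \
              mask_down * (1.0 - (pos - pos_down))      # linear interpolation
pos_win = 4.0
gaussian_w = (-0.5 * mask_select ** 2) / ((0.5 * pos_win) ** 2)
# mask_select == 0.8, gaussian_w == -0.08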
def communicate():
    sub_block = default_main_program().current_block()
    ring_id = -1
    for param, snapshot in p2s:
        sub_block.append_op(type='elementwise_sub',
                            inputs={'X': [snapshot],
                                    'Y': [param]},
                            outputs={'Out': [param]},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})
        sub_block.append_op(type='c_sync_calc_stream',
                            inputs={'X': param},
                            outputs={'Out': param},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})
        ring_id = (ring_id + 1) % self.nrings
        sub_block.append_op(type='c_allreduce_sum',
                            inputs={'X': [param]},
                            outputs={'Out': [param]},
                            attrs={'ring_id': ring_id,
                                   OP_ROLE_KEY: OpRole.Optimize})
    for ring_id in range(self.nrings):
        sub_block.append_op(type='c_sync_comm_stream',
                            inputs={'X': param},
                            outputs={'Out': param},
                            attrs={'ring_id': ring_id,
                                   OP_ROLE_KEY: OpRole.Optimize})
    for param, snapshot in p2s:
        sub_block.append_op(type='scale',
                            inputs={'X': [param]},
                            outputs={'Out': [param]},
                            attrs={'scale': 1.0 / self.role_maker.worker_num(),
                                   OP_ROLE_KEY: OpRole.Optimize})
        sub_block.append_op(type='elementwise_sub',
                            inputs={'X': [snapshot],
                                    'Y': [param]},
                            outputs={'Out': [param]},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})
        sub_block.append_op(type='assign',
                            inputs={'X': [param]},
                            outputs={'Out': [snapshot]},
                            attrs={OP_ROLE_KEY: OpRole.Optimize})
    if auto_steps:
        next_local_steps = layers.cast(
            layers.ceil(
                layers.sqrt(lr_0 * loss / (global_lr * loss_0) *
                            float(init_k_steps))),
            dtype='int64')
        max_local_steps = layers.fill_constant(shape=[1],
                                               dtype='int64',
                                               value=16)
        next_local_steps = layers.elementwise_min(next_local_steps,
                                                  max_local_steps)
        layers.assign(next_local_steps, k_steps)
        layers.assign(step, last_step)
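A numpy sketch (toy values, not from the source) of what the appended ops compute per parameter: subtracting the local value from the shared snapshot, allreduce-summing and scaling the deltas, then subtracting the mean delta back from the snapshot, which yields the average of the workers' local parameters:

import numpy as np

snapshot = np.array([1.0, 1.0])                                # value after the last sync, identical on all workers
local_params = [np.array([0.8, 1.4]), np.array([1.2, 1.0])]    # two workers' locally updated parameters

deltas = [snapshot - p for p in local_params]                  # elementwise_sub
mean_delta = sum(deltas) / len(local_params)                   # c_allreduce_sum + scale
new_param = snapshot - mean_delta                              # second elementwise_sub
# new_param == [1.0, 1.2], the elementwise mean of the workers' parameters,
# which is then assigned back into the snapshot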
def topk_pool(gw, score, graph_id, ratio):
    """Implementation of topk pooling, where k means the pooling ratio.

    Args:
        gw: Graph wrapper object.
        score: The attention score of all nodes, which is used to select
            important nodes.
        graph_id: The graphs that the nodes belong to.
        ratio: The pooling ratio of nodes we want to select.

    Return:
        perm: The index of nodes we choose.
        ratio_length: The selected node numbers of each graph.
    """
    graph_lod = gw.graph_lod
    graph_nodes = gw.num_nodes
    num_graph = gw.num_graph

    num_nodes = L.ones(shape=[graph_nodes], dtype="float32")
    num_nodes = L.lod_reset(num_nodes, graph_lod)
    num_nodes_per_graph = L.sequence_pool(num_nodes, pool_type='sum')
    max_num_nodes = L.reduce_max(num_nodes_per_graph, dim=0)
    max_num_nodes = L.cast(max_num_nodes, dtype="int32")

    index = L.arange(0, gw.num_nodes, dtype="int64")
    offset = L.gather(graph_lod, graph_id, overwrite=False)
    index = (index - offset) + (graph_id * max_num_nodes)
    index.stop_gradient = True

    # padding
    dense_score = L.fill_constant(shape=[num_graph * max_num_nodes],
                                  dtype="float32",
                                  value=-999999)
    index = L.reshape(index, shape=[-1])
    dense_score = L.scatter(dense_score, index, updates=score)
    num_graph = L.cast(num_graph, dtype="int32")
    dense_score = L.reshape(dense_score, shape=[num_graph, max_num_nodes])

    # record the sorted index
    _, sort_index = L.argsort(dense_score, axis=-1, descending=True)

    # recover the index range
    graph_lod = graph_lod[:-1]
    graph_lod = L.reshape(graph_lod, shape=[-1, 1])
    graph_lod = L.cast(graph_lod, dtype="int64")
    sort_index = L.elementwise_add(sort_index, graph_lod, axis=-1)
    sort_index = L.reshape(sort_index, shape=[-1, 1])

    # use sequence_slice to choose the selected node index
    pad_lod = L.arange(0, (num_graph + 1) * max_num_nodes,
                       step=max_num_nodes,
                       dtype="int32")
    sort_index = L.lod_reset(sort_index, pad_lod)
    ratio_length = L.ceil(num_nodes_per_graph * ratio)
    ratio_length = L.cast(ratio_length, dtype="int64")
    ratio_length = L.reshape(ratio_length, shape=[-1, 1])
    offset = L.zeros(shape=[num_graph, 1], dtype="int64")
    choose_index = L.sequence_slice(input=sort_index,
                                    offset=offset,
                                    length=ratio_length)
    perm = L.reshape(choose_index, shape=[-1])
    return perm, ratio_length
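A toy numpy sketch (invented node counts) of the ratio-to-k computation that feeds sequence_slice above: ceil guarantees each graph keeps at least one node even for small ratios:

import numpy as np

num_nodes_per_graph = np.array([5.0, 8.0, 3.0])   # node counts of three graphs
ratio = 0.5
ratio_length = np.ceil(num_nodes_per_graph * ratio).astype("int64")
# ratio_length == [3, 4, 2]: each graph keeps ceil(ratio * n) of its highest-scoring nodes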