Example 1
    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        [[
          0 L*L -inf
          -inf -inf
        ]]maxLen*maxLen
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        ############################
        # added: re-mask the softmax output so that masked (padded) positions
        # end up with exactly zero attention weight
        layers.Print(attn_bias, message="attn_bias:")

        attn_mask = attn_bias == 0
        attn_mask = layers.cast(attn_mask, 'float64')
        layers.Print(weights)
        weights = layers.elementwise_mul(attn_mask, weights)
        layers.Print(weights)

        #         weights = layers.elementwise_mul(weights, attn_mask)
        ############################
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out
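
The re-masking above relies on the standard additive-bias trick. A minimal NumPy sketch (not part of the Paddle code; names and shapes are illustrative) of the same idea: positions whose bias is a large negative number get essentially zero weight from the softmax, and the extra elementwise_mul then pins them to exactly zero.

import numpy as np

def masked_attention(q, k, v, attn_bias):
    # q, k, v: (seq_len, d_key); attn_bias: (seq_len, seq_len), 0 for visible
    # positions and a large negative value for masked ones.
    d_key = q.shape[-1]
    scores = q @ k.T * d_key ** -0.5 + attn_bias
    scores = scores - scores.max(axis=-1, keepdims=True)      # numerical stability
    weights = np.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return weights @ v

q = k = v = np.random.rand(4, 8)
bias = np.zeros((4, 4))
bias[:, 2:] = -1e9       # mask the last two key positions
out = masked_attention(q, k, v, bias)   # attention ignores columns 2 and 3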
Example 2
 def build_network(self, only_forward, **kargs):
     x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
     x.stop_gradient = False
     layers.Print(input=x, **kargs)
     loss = layers.mean(x)
     append_backward(loss=loss)
     return loss
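
The snippet above only builds the program; layers.Print produces output when the program is actually run. A sketch of a standalone driver in the same fluid 1.x style (the place, the programs and the fake LoD input below are assumptions, not part of the original test):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers
from paddle.fluid.backward import append_backward

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
    x.stop_gradient = False
    layers.Print(input=x, message='x')
    loss = layers.mean(x)
    append_backward(loss=loss)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
# two sequences of lengths 3 and 7 -> a (10, 3) LoD tensor
x_tensor = fluid.create_lod_tensor(
    np.random.rand(10, 3).astype('float32'), [[3, 7]], place)
exe.run(main_prog, feed={'x': x_tensor}, fetch_list=[loss], return_numpy=False)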
Example 3
def static_func(x):
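    # Static-graph form of "append x to a list three times": the Python list and
    # while loop are expressed as a LoDTensorArray plus fluid.layers.while_loop
    # (compare the imperative version in Example 5).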
    x = fluid.layers.assign(x)
    iter_num = fluid.layers.fill_constant(shape=[1], value=3, dtype='int32')
    a = fluid.layers.create_array(dtype='float32')
    i = 0
    a = fluid.dygraph.dygraph_to_static.variable_trans_func.to_static_variable(
        a)
    i = fluid.dygraph.dygraph_to_static.variable_trans_func.to_static_variable(
        i)
    iter_num = (fluid.dygraph.dygraph_to_static.variable_trans_func.
                to_static_variable(iter_num))
    x = fluid.dygraph.dygraph_to_static.variable_trans_func.to_static_variable(
        x)

    def while_condition_0(a, i, iter_num, x):
        return i < iter_num

    def while_body_0(a, i, iter_num, x):
        fluid.layers.array_write(x=x, i=fluid.layers.array_length(a), array=a)
        i += 1
        return a, i, iter_num, x

    a, i, iter_num, x = fluid.layers.while_loop(while_condition_0,
                                                while_body_0,
                                                [a, i, iter_num, x])
    length = layers.array_length(a)
    layers.Print(length)
    return a[0]
Example 4
    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        layers.Print(weights)

        layers.Print(weights)
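        # NOTE: `dropout_seed` below is not defined in this excerpt; it is
        # assumed to come from the enclosing scope.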
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     seed=dropout_seed,
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out
Example 5
def dygraph_func(x):
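    # Imperative (dygraph) counterpart of Example 3: a plain Python list and
    # while loop, intended to be rewritten by the dygraph-to-static transformer
    # into create_array / while_loop / array_length.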
    x = fluid.dygraph.to_variable(x)
    iter_num = fluid.layers.fill_constant(shape=[1], value=3, dtype="int32")
    a = []
    i = 0
    while i < iter_num:
        a.append(x)
        i += 1
    length = layers.array_length(a)
    layers.Print(length)
    return a[0]
Example 6
def lstmp_encoder(input_seq, gate_size, h_0, c_0, para_name, proj_size, test_mode, args):
    # An LSTM encoder implementation with projection.
    # The linear transformations for the input, output and forget gates and for
    # the cell activation vector have to be done outside of the dynamic LSTM op,
    # so the projection output size is 4 * gate_size.

    if args.para_init:
        init = fluid.initializer.Constant(args.init1)
        init_b = fluid.initializer.Constant(0.0)
    else:
        init = None
        init_b = None
    input_seq = dropout(input_seq, test_mode, args)
    input_proj = layers.fc(input=input_seq,
                           param_attr=fluid.ParamAttr(
                               name=para_name + '_gate_w', initializer=init),
                           size=gate_size * 4,
                           act=None,
                           bias_attr=False)
    if args.debug:
        layers.Print(input_seq, message='input_seq', summarize=10)
        layers.Print(input_proj, message='input_proj', summarize=10)
    hidden, cell = layers.dynamic_lstmp(
        input=input_proj,
        size=gate_size * 4,
        proj_size=proj_size,
        h_0=h_0,
        c_0=c_0,
        use_peepholes=False,
        proj_clip=args.proj_clip,
        cell_clip=args.cell_clip,
        proj_activation="identity",
        param_attr=fluid.ParamAttr(initializer=init),
        bias_attr=fluid.ParamAttr(initializer=init_b))

    return hidden, cell, input_proj
Example 7
    def test_all_parameters(self):
        x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
        x.stop_gradient = False

        for print_tensor_name in [True, False]:
            for print_tensor_type in [True, False]:
                for print_tensor_shape in [True, False]:
                    for print_tensor_lod in [True, False]:
                        layers.Print(
                            input=x,
                            print_tensor_name=print_tensor_name,
                            print_tensor_type=print_tensor_type,
                            print_tensor_shape=print_tensor_shape,
                            print_tensor_lod=print_tensor_lod,
                        )
        loss = layers.mean(x)
        append_backward(loss=loss)
        exe = Executor(self.place)
        outs = exe.run(feed={'x': self.x_tensor},
                       fetch_list=[loss],
                       return_numpy=False)
Example 8
    def run_boxps_preload(self, is_cpu=True):
        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
        emb_x, emb_y = _pull_box_sparse([x, y], size=2)
        emb_xp = _pull_box_sparse(x, size=2)
        layers.Print(emb_xp)
        concat = layers.concat([emb_x, emb_y], axis=1)
        fc = layers.fc(input=concat,
                       name="fc",
                       size=1,
                       num_flatten_dims=1,
                       bias_attr=False)
        loss = layers.reduce_mean(fc)
        layers.Print(loss)
        place = fluid.CPUPlace(
        ) if is_cpu or not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
        batch_size = 2

        def binary_print(slot, fout):
            fout.write(str(len(slot)) + " ")
            for e in slot:
                fout.write(str(e) + " ")

        batch1 = np.ones(
            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
        filelist = []
        place_str = "cpu" if is_cpu else "gpu"
        for i in range(2):
            filelist.append("test_hdfs_" + place_str + "_" + str(i))
        for f in filelist:
            with open(f, "w") as fout:
                for ins in batch1:
                    for slot in ins:
                        binary_print(slot, fout)
                fout.write("\n")

        def create_dataset():
            dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
            dataset.set_use_var([x, y])
            dataset.set_batch_size(2)
            dataset.set_thread(1)
            dataset.set_filelist(filelist)
            return dataset

        datasets = []
        datasets.append(create_dataset())
        datasets.append(create_dataset())
        optimizer.minimize(loss)
        exe.run(fluid.default_startup_program())
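        # Overlap I/O with compute: train on datasets[0] while datasets[1]
        # preloads in the background, then swap after end_pass().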
        datasets[0].load_into_memory()
        datasets[0].begin_pass()
        datasets[1].preload_into_memory()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=datasets[0],
                               print_period=1)
        datasets[0].end_pass()
        datasets[1].wait_preload_done()
        datasets[1].begin_pass()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=datasets[1],
                               print_period=1)
        datasets[1].end_pass()
        for f in filelist:
            os.remove(f)
Example 9
        def beam_search():
            """Beam search function"""

            max_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.max_out_len,
                                           force_cpu=True)
            min_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.min_out_len)
            neg_inf = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=-INF)
            step_idx = layers.fill_constant(shape=[1],
                                            dtype=start_tokens.dtype,
                                            value=0,
                                            force_cpu=True)
            step_next_idx = layers.fill_constant(shape=[1],
                                                 dtype=start_tokens.dtype,
                                                 value=1,
                                                 force_cpu=True)
            cond = layers.less_than(x=step_idx,
                                    y=max_len)  # default force_cpu=True
            while_op = layers.While(cond)
            # array states will be stored for each step.
            ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                     step_idx)
            scores = layers.array_write(init_scores, step_idx)
            # cell states will be overwritten at each step.
            # caches hold the states of previous steps for decoder self-attention,
            # plus the static encoder output projections for encoder-decoder
            # attention, to avoid redundant computation.
            caches = [
                {
                    "k":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "v":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "static_k_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_v_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_k_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype),
                    "static_v_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype)
                } for i in range(self._dec_n_layer)
            ]

            trigram_blocking = TrigramBlocking(start_tokens,
                                               self.tokenizer,
                                               use_fp16=self._use_fp16,
                                               beam_size=self.beam_size)

            with while_op.block():
                pre_ids = layers.array_read(array=ids, i=step_idx)
                pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
                # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
                # in-place reshape here, which actually changes the shape of pre_ids.
                pre_scores = layers.array_read(array=scores, i=step_idx)
                # gather cell states corresponding to selected parent
                pre_src_words_attn_bias = layers.gather(
                    tgt_src_words_attn_bias, index=parent_idx)
                pre_src_sents_attn_bias = layers.gather(
                    tgt_src_sents_attn_bias, index=parent_idx)
                pre_graph_attn_bias = layers.gather(graph_attn_bias,
                                                    index=parent_idx)
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_src_sents_attn_bias,  # can't use a LoD tensor here
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

                logits = self.decode(
                    dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                               pre_src_sents_attn_bias, pre_graph_attn_bias),
                    enc_words_output=enc_words_output,
                    enc_sents_output=enc_sents_output,
                    caches=caches,
                    gather_idx=parent_idx)

                # prevent generating the end token when the length is less than min_out_len
                eos_index = layers.fill_constant(
                    shape=[layers.shape(logits)[0]],
                    dtype='int64',
                    value=self.eos_idx)
                eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
                less_cond = layers.cast(layers.less_than(x=step_idx,
                                                         y=min_len),
                                        dtype='float32')
                less_val = layers.elementwise_mul(less_cond, neg_inf)
                eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
                revised_logits = layers.elementwise_add(logits,
                                                        eos_val,
                                                        axis=0)

                # top-k reduction across beams; also includes special handling of
                # finished beams and finished sentences (batch reduction)
                topk_scores, topk_indices = layers.topk(
                    input=layers.softmax(revised_logits), k=self.beam_size)

                # Roll back the previous scores for the length penalty: `scores`
                # stores length-penalized values, so before applying this
                # timestep's penalty we undo the previous one and work with the
                # un-penalized score.
                # Safe for step_idx == 0 (initialization), since the previous
                # score is 0 there.
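                # length penalty: lp(t) = ((5 + t) / 6) ** self.len_penalty (GNMT-style)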
                pre_timestep_length_penalty = fluid.layers.pow(
                    ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) /
                     6.0), self.len_penalty)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                    pre_scores, pre_timestep_length_penalty)

                # calc trigram-blocking delta scores for current alive sequence
                if self.block_trigram:
                    trigram_blocking.update_seq(pre_ids, parent_idx)
                    trigram_blocking.expand_cand_seq(topk_indices)
                    fluid.layers.py_func(
                        func=trigram_blocking.blocking_forward,
                        x=[
                            trigram_blocking.cand_seq,
                            trigram_blocking.id2is_full_token
                        ],
                        out=trigram_blocking.delta_score_out,
                        backward_func=None)
                    layers.Print(trigram_blocking.delta_score_out,
                                 summarize=100,
                                 message="trigram_blocking.delta_score_out")
                    pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                        x=trigram_blocking.delta_score_out,
                        y=pre_scores_wo_len_penalty,
                        axis=0)
                # => [N, topk]

                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores),
                    y=pre_scores_wo_len_penalty,
                    axis=0)

                cur_timestep_length_penalty = layers.pow(
                    ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) /
                     6.0), self.len_penalty)
                curr_scores = layers.elementwise_div(
                    accu_scores, cur_timestep_length_penalty)

                # beam_search op uses lod to differentiate branches.
                curr_scores = layers.lod_reset(curr_scores, pre_ids)
                topk_indices = layers.lod_reset(topk_indices, pre_ids)
                selected_ids, selected_scores, gather_idx = layers.beam_search(
                    pre_ids=pre_ids,
                    pre_scores=pre_scores,
                    ids=topk_indices,
                    scores=curr_scores,
                    beam_size=self.beam_size,
                    end_id=self.eos_idx,
                    return_parent_idx=True)

                layers.increment(x=step_idx, value=1.0, in_place=True)
                layers.increment(x=step_next_idx, value=1.0, in_place=True)
                # cell states(caches) have been updated in wrap_decoder,
                # only need to update beam search states here.
                layers.array_write(selected_ids, i=step_idx, array=ids)
                layers.array_write(selected_scores, i=step_idx, array=scores)
                layers.assign(gather_idx, parent_idx)
                layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
                layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
                layers.assign(pre_graph_attn_bias, graph_attn_bias)

                length_cond = layers.less_than(x=step_idx, y=max_len)
                finish_cond = layers.logical_not(
                    layers.is_empty(x=selected_ids))
                layers.logical_and(x=length_cond, y=finish_cond, out=cond)

            finished_ids, finished_scores = layers.beam_search_decode(
                ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

            return finished_ids, finished_scores
Example 10
fc0 = fluid.layers.fc(image,
                      size=3,
                      act=None,
                      bias_attr=False,
                      param_attr=fluid.initializer.Constant(value=2.0))
fc1 = fluid.layers.fc(fc0,
                      size=3,
                      act=None,
                      bias_attr=False,
                      param_attr=fluid.initializer.TruncatedNormal(loc=0.0,
                                                                   scale=0.02,
                                                                   seed=0))
#fc1 = fluid.layers.fc(fc0, size=cls_num, act='relu', bias_attr=False, param_attr=fluid.initializer.Constant(value=2.0))

if ASCEND == False:
    fc0 = layers.Print(fc0, message="fc0")
    fc1 = layers.Print(fc1, message="fc1")

# CLASS_NUM = 10
# fc1 = fluid.layers.fc(fc0, size=CLASS_NUM, bias_attr=False,param_attr=fluid.initializer.Constant(value=2.0))
# layers.Print(fc1)
cross_entropy = fluid.layers.softmax_with_cross_entropy(fc1, label)
if ASCEND == False:
    cross_entropy = layers.Print(cross_entropy, message="cross_entropy")
cost = fluid.layers.reduce_sum(cross_entropy)
#cost = fluid.layers.log(cost)
#cost = fluid.layers.tanh(cost)
#cost = fluid.layers.pow(cost, 2)
#cost = fluid.layers.sqrt(cost)
#cost = fluid.layers.mean(cost)
if ASCEND == False:
Example 11
                              dtype="float64")
        attn_bias = fluid.layers.data(name='attn_bias',
                                      shape=[None, n_head, max_len, max_len],
                                      dtype="float64")
        #     layers.Print(attn_bias)
        attn_bias1 = fluid.layers.data(name='attn_bias1',
                                       shape=[None, n_head, max_len, max_len],
                                       dtype="float64")

        output = multi_head_attention(q, k, v, attn_bias, d_model, d_model,
                                      d_model, n_head)
        output1 = multi_head_attention(q, k, v, attn_bias1, d_model, d_model,
                                       d_model, n_head)

        soft_max = layers.softmax(attn_bias1)
        layers.Print(soft_max)
    #     layers.Print(output)

    # work
    INF = -2 ** 32 + 1  # NOTE: the original used '^', which is XOR in Python; '**' (power) is intended
    input_data = np.random.rand(batch_size, max_len, d_model)
    attn_data = np.zeros((batch_size, n_head, max_len))
    attn_data[:, :, 4:] = -INF
    attn_data = np.zeros(
        (batch_size, n_head, max_len, max_len)) + np.expand_dims(attn_data,
                                                                 axis=2)

    attn_data1 = np.zeros((batch_size, n_head, max_len))
    attn_data1[:, :, 4:] = -INF
    a = np.expand_dims(attn_data1, axis=2)
    b = np.expand_dims(attn_data1, axis=3)
Example 13
def bidaf(embedding_dim, encoder_size, decoder_size, source_dict_dim,
          target_dict_dim, max_length, args):
    def bi_lstm_encoder(input_seq, gate_size):
        # A bi-directional lstm encoder implementation.
        # Linear transformation part for input gate, output gate, forget gate
        # and cell activation vectors need be done outside of dynamic_lstm.
        # So the output size is 4 times of gate_size.
        input_forward_proj = layers.fc(input=input_seq,
                                             size=gate_size * 4,
                                             act='tanh',
                                             bias_attr=False)
        forward, _ = layers.dynamic_lstm(
            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
        input_reversed_proj = layers.fc(input=input_seq,
                                              size=gate_size * 4,
                                              act='tanh',
                                              bias_attr=False)
        reversed, _ = layers.dynamic_lstm(
            input=input_reversed_proj,
            size=gate_size * 4,
            is_reverse=True,
            use_peepholes=False)
        encoder_out = layers.concat(input=[forward, reversed], axis = 1)
        return encoder_out

    def encoder(input_name):
        input_ids = layers.data(
            name=input_name, shape=[1], dtype='int64', lod_level=1)
        input_embedding = layers.embedding(
            input=input_ids,
            size=[source_dict_dim, embedding_dim],
            dtype='float32',
            is_sparse=True)
        encoder_out = bi_lstm_encoder(input_seq=input_embedding, gate_size=embedding_dim)
        return encoder_out

    def attn_flow(q_enc, p_enc, p_ids_name):
        tag = p_ids_name + "::"
        drnn = layers.DynamicRNN()
        with drnn.block():
            h_cur = drnn.step_input(p_enc)
            u_all = drnn.static_input(q_enc)
            h_expd = layers.sequence_expand(x=h_cur, y=u_all)
            s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
            s_t1 = layers.reduce_sum(input=s_t_, dim=1)
            s_t = layers.sequence_softmax(input=s_t1)
            u_expr = layers.elementwise_mul(x=u_all, y=s_t, axis=0)
            u_expr = layers.sequence_pool(input=u_expr, pool_type='sum')

            if args.debug == True:
                '''
                layers.Print(h_expd, message='h_expd')
                layers.Print(h_cur, message='h_cur')
                layers.Print(u_all, message='u_all')
                layers.Print(s_t, message='s_t')
                layers.Print(s_t_, message='s_t_')
                layers.Print(u_expr, message='u_expr')
                '''
            drnn.output(u_expr)

        U_expr = drnn()

        drnn2 = layers.DynamicRNN()
        with drnn2.block():
            h_cur = drnn2.step_input(p_enc)
            u_all = drnn2.static_input(q_enc)
            h_expd = layers.sequence_expand(x=h_cur, y=u_all)
            s_t_ = layers.elementwise_mul(x=u_all, y=h_expd, axis=0)
            s_t2 = layers.reduce_sum(input=s_t_, dim=1, keep_dim=True)
            b_t = layers.sequence_pool(input=s_t2, pool_type='max')

            if args.debug == True:
                '''
                layers.Print(s_t2, message='s_t2')
                layers.Print(b_t, message='b_t')
                '''
            drnn2.output(b_t)
        b = drnn2()
        b_norm = layers.sequence_softmax(input=b)
        h_expr = layers.elementwise_mul(x=p_enc, y=b_norm, axis=0)
        h_expr = layers.sequence_pool(input=h_expr, pool_type='sum')

        H_expr = layers.sequence_expand(x=h_expr, y=p_enc)
        H_expr = layers.lod_reset(x=H_expr, y=p_enc)
        h_u = layers.elementwise_mul(x=H_expr, y=U_expr, axis=0)
        h_h = layers.elementwise_mul(x=H_expr, y=p_enc, axis=0)

        g = layers.concat(input=[H_expr, U_expr, h_u, h_h], axis=1)

        # fusion
        m = bi_lstm_encoder(input_seq=g, gate_size=embedding_dim)
        if args.debug == True:
            layers.Print(U_expr, message=tag + 'U_expr')
            layers.Print(H_expr, message=tag + 'H_expr')
            layers.Print(b, message=tag + 'b')
            layers.Print(b_norm, message=tag + 'b_norm')
            layers.Print(g, message=tag + 'g')
            layers.Print(m, message=tag + 'm')
            layers.Print(h_h, message=tag + 'h_h')
            layers.Print(q_enc, message=tag + 'q_enc')
            layers.Print(p_enc, message=tag + 'p_enc')

        return m, g
    
    def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
        def linear(inputs):
            return layers.fc(input=inputs, size=size, bias_attr=True)

        forget_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        input_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        output_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        cell_tilde = layers.tanh(x=linear([hidden_t_prev, x_t]))

        cell_t = layers.sums(input=[
            layers.elementwise_mul(x=forget_gate, y=cell_t_prev),
            layers.elementwise_mul(x=input_gate, y=cell_tilde)
        ])

        hidden_t = layers.elementwise_mul(x=output_gate, y=layers.tanh(x=cell_t))

        return hidden_t, cell_t
    
    # point network
    def point_network_decoder(p_vec, q_vec, decoder_size):
        random_attn = layers.gaussian_random(shape=[1, decoder_size])
        random_attn = layers.sequence_expand(x=random_attn, y=q_vec)
        random_attn = layers.fc(input=random_attn, size=decoder_size, act=None)
        U = layers.fc(input=q_vec, size=decoder_size, act=None) + random_attn
        U = layers.tanh(U)

        logits = layers.fc(input=U, size=1, act=None)
        scores = layers.sequence_softmax(input=logits)
        pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
        pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

        init_state = layers.fc(input=pooled_vec, size=decoder_size, act=None)

        def custom_dynamic_rnn(p_vec, init_state, decoder_size):
            context = layers.fc(input=p_vec, size=decoder_size, act=None)

            drnn = layers.DynamicRNN()
            with drnn.block():
                H_s = drnn.step_input(p_vec)
                ctx = drnn.static_input(context)

                c_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev = drnn.memory(init=init_state, need_reorder=True)
                m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
                m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)

                Fk = ctx + m_prev1
                Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
                logits = layers.fc(input=Fk, size=1, act=None)

                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
                hidden_t, cell_t = lstm_step(
                    attn_ctx, hidden_t_prev=m_prev1, cell_t_prev=c_prev,
                    size=decoder_size)

                drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
                drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)

                drnn.output(scores)
            beta = drnn()
            return beta

        fw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)
        bw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)

        def sequence_slice(x, index):
            #offset = layers.fill_constant(shape=[1, args.batch_size], value=index, dtype='float32')
            #length = layers.fill_constant(shape=[1, args.batch_size], value=1, dtype='float32')
            #return layers.sequence_slice(x, offset, length)
            idx = layers.fill_constant(shape=[1], value=1, dtype='int32')
            idx.stop_gradient = True
            from paddle.fluid.layers.control_flow import lod_rank_table
            from paddle.fluid.layers.control_flow import lod_tensor_to_array
            from paddle.fluid.layers.control_flow import array_read
            from paddle.fluid.layers.control_flow import array_to_lod_tensor
            table = lod_rank_table(x, level=0)
            table.stop_gradient = True
            array = lod_tensor_to_array(x, table)
            slice_array = array_read(array=array, i=idx)
            return array_to_lod_tensor(slice_array, table)

        start_prob = layers.elementwise_mul(
            x=sequence_slice(fw_outputs, 0), y=sequence_slice(bw_outputs, 1),
            axis=0) / 2
        end_prob = layers.elementwise_mul(
            x=sequence_slice(fw_outputs, 1), y=sequence_slice(bw_outputs, 0),
            axis=0) / 2
        return start_prob, end_prob
 
 
    q_enc = encoder('q_ids')

    if args.single_doc:
        p_enc = encoder('p_ids')
        m, g = attn_flow(q_enc, p_enc, 'p_ids')
        
    else:
        p_ids_names = []
        ms = []
        gs = []
        for i in range(args.doc_num):
            p_ids_name = "pids_%d" % i
            p_ids_names.append(p_ids_name)
            p_enc = encoder(p_ids_name)

            m_i, g_i = attn_flow(q_enc, p_enc, p_ids_name)
            ms.append(m_i)
            gs.append(g_i)
            m = layers.sequence_concat(x=ms, axis=0)
            g = layers.sequence_concat(x=gs, axis=0)
            
    if args.simple_decode:
        m2 = bi_lstm_encoder(input_seq=m, gate_size=embedding_dim)
        
        gm1 = layers.concat(input=[g, m], axis = 1) 
        gm2 = layers.concat(input=[g, m2], axis = 1) 
        start_prob = layers.fc(input=gm1, size=1, act='softmax')
        end_prob = layers.fc(input=gm2, size=1, act='softmax')
    else:
        p_vec = layers.sequence_concat(x=m, axis=0)
        q_vec = bi_lstm_encoder(input_seq=q_enc, gate_size=embedding_dim)
        start_prob, end_prob = point_network_decoder(
            p_vec=p_vec, q_vec=q_vec, decoder_size=decoder_size)

    start_prob = layers.sequence_softmax(start_prob)
    end_prob = layers.sequence_softmax(end_prob)

    pred = layers.concat(input=[start_prob, end_prob], axis=0)

    start_labels = layers.data(
        name="start_lables", shape=[1], dtype='float32', lod_level=1)

    end_labels = layers.data(
        name="end_lables", shape=[1], dtype='float32', lod_level=1)

    label = layers.concat(input=[start_labels, end_labels], axis=0)
    label.stop_gradient = True

    #compute loss
    cost = layers.cross_entropy(input=pred, label=label, soft_label=True)
    #cost = layers.cross_entropy(input=decode_out, label=end_labels, soft_label=True)
    cost = layers.reduce_sum(cost) / args.batch_size
     
    if args.debug == True:
        # NOTE: the original printed `p1`, which is undefined in this scope;
        # start_prob is the closest matching variable.
        layers.Print(start_prob, message='start_prob')
        layers.Print(pred, message='pred')
        layers.Print(label, message='label')
        layers.Print(start_labels, message='start_labels')
        layers.Print(cost, message='cost')
    
    if args.single_doc:
        feeding_list = ['q_ids',  "start_lables", "end_lables", 'p_ids']
    else:
        feeding_list = ['q_ids',  "start_lables", "end_lables" ] + p_ids_names
    return cost, feeding_list
Example 14
    def run_boxps_preload(self, is_cpu=True, random_with_lineid=False):
        program = fluid.Program()
        with fluid.program_guard(program):
            x = fluid.layers.data(name='x',
                                  shape=[1],
                                  dtype='int64',
                                  lod_level=0)
            y = fluid.layers.data(name='y',
                                  shape=[1],
                                  dtype='int64',
                                  lod_level=0)
            emb_x, emb_y = _pull_box_sparse([x, y], size=2)
            emb_xp = _pull_box_sparse(x, size=2)
            concat = layers.concat([emb_x, emb_y], axis=1)
            fc = layers.fc(input=concat,
                           name="fc",
                           size=1,
                           num_flatten_dims=1,
                           bias_attr=False)
            loss = layers.reduce_mean(fc)
            layers.Print(loss)
            place = fluid.CPUPlace(
            ) if is_cpu or not core.is_compiled_with_cuda(
            ) else fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            batch_size = 100

            def binary_print(slot, fout):
                fout.write(str(len(slot)) + " ")
                for e in slot:
                    fout.write(str(e) + " ")

            batch1 = np.ones(
                (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
            filelist = []
            place_str = "cpu" if is_cpu else "gpu"
            for i in range(2):
                filelist.append("test_hdfs_" + place_str + "_" + str(i))
            for f in filelist:
                with open(f, "w") as fout:
                    for ins in batch1:
                        for slot in ins:
                            binary_print(slot, fout)
                    fout.write("\n")

            def create_dataset():
                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
                dataset.set_date("20190930")
                dataset.set_use_var([x, y])
                dataset.set_batch_size(2)
                dataset.set_thread(1)
                dataset.set_filelist(filelist)
                return dataset

            datasets = []
            datasets.append(create_dataset())
            datasets.append(create_dataset())
            optimizer = fluid.optimizer.SGD(learning_rate=0.5)
            optimizer = fluid.optimizer.PipelineOptimizer(optimizer,
                                                          cut_list=[],
                                                          place_list=[place],
                                                          concurrency_list=[1],
                                                          queue_size=1,
                                                          sync_steps=-1)
            optimizer.minimize(loss)
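            # Dump selected fields/params of the pipeline program to ./dump_log/
            # (removed again at the end of the test).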

            program._pipeline_opt["dump_fields"] = [
                "fc.tmp_0", "fc.tmp_0@GRAD", "hehe"
            ]
            program._pipeline_opt["dump_fields_path"] = "./dump_log/"
            program._pipeline_opt["dump_param"] = ["fc.w_0"]
            program._pipeline_opt["enable_random_dump"] = True
            program._pipeline_opt["dump_interval"] = 10
            program._pipeline_opt["random_with_lineid"] = random_with_lineid

            exe.run(fluid.default_startup_program())
            datasets[0].load_into_memory()
            datasets[0].begin_pass()
            datasets[1].preload_into_memory()
            exe.train_from_dataset(program=fluid.default_main_program(),
                                   dataset=datasets[0],
                                   print_period=1)
            datasets[0].end_pass(True)
            datasets[1].wait_preload_done()
            datasets[1].begin_pass()
            exe.train_from_dataset(program=fluid.default_main_program(),
                                   dataset=datasets[1],
                                   print_period=1,
                                   debug=True)
            datasets[1].end_pass(False)
            for f in filelist:
                os.remove(f)
            if os.path.isdir("dump_log"):
                shutil.rmtree("dump_log")
Example 15
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, emb_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
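        # Stack LSTMP layers; from the second layer on, a residual connection
        # adds the layer input back onto the layer output (rnn_out + rnn_input).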
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden and init_cell:
            h0 = layers.squeeze(
                layers.slice(
                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
            c0 = layers.squeeze(
                layers.slice(
                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter(
        [vocab_size, emb_size], dtype="float32", name="softmax_weight")
    softmax_bias = layers.create_parameter(
        [vocab_size], dtype="float32", name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
        if args.debug:
            layers.Print(loss, message='out_loss', summarize=100)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(
            logits=projection, label=label, soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
Example 16
    def build(self):
        args = self.args
        emb_size = args.embed_size
        proj_size = args.embed_size
        hidden_size = args.hidden_size
        batch_size = args.batch_size
        num_layers = args.num_layers
        num_steps = args.num_steps

        lstm_outputs = []

        x_f = layers.data(name="x", shape=[1], dtype='int64', lod_level=1)
        y_f = layers.data(name="y", shape=[1], dtype='int64', lod_level=1)

        x_b = layers.data(name="x_r", shape=[1], dtype='int64', lod_level=1)
        y_b = layers.data(name="y_r", shape=[1], dtype='int64', lod_level=1)

        init_hiddens_ = layers.data(
            name="init_hiddens", shape=[1], dtype='float32')
        init_cells_ = layers.data(
            name="init_cells", shape=[1], dtype='float32')

        if args.debug:
            layers.Print(init_cells_, message='init_cells_', summarize=10)
            layers.Print(init_hiddens_, message='init_hiddens_', summarize=10)

        init_hiddens = layers.reshape(
            init_hiddens_, shape=[2 * num_layers, -1, proj_size])
        init_cells = layers.reshape(
            init_cells_, shape=[2 * num_layers, -1, hidden_size])

        init_hidden = layers.slice(
            init_hiddens, axes=[0], starts=[0], ends=[num_layers])
        init_cell = layers.slice(
            init_cells, axes=[0], starts=[0], ends=[num_layers])
        init_hidden_r = layers.slice(
            init_hiddens, axes=[0], starts=[num_layers],
            ends=[2 * num_layers])
        init_cell_r = layers.slice(
            init_cells, axes=[0], starts=[num_layers], ends=[2 * num_layers])
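        # The first num_layers slices initialize the forward language model;
        # the second num_layers slices initialize the backward one.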

        if args.use_custom_samples:
            custom_samples = layers.data(
                name="custom_samples",
                shape=[args.n_negative_samples_batch + 1],
                dtype='int64',
                lod_level=1)
            custom_samples_r = layers.data(
                name="custom_samples_r",
                shape=[args.n_negative_samples_batch + 1],
                dtype='int64',
                lod_level=1)
            custom_probabilities = layers.data(
                name="custom_probabilities",
                shape=[args.n_negative_samples_batch + 1],
                dtype='float32',
                lod_level=1)
        else:
            custom_samples = None
            custom_samples_r = None
            custom_probabilities = None

        forward, fw_hiddens, fw_hiddens_ori, fw_cells, fw_projs = encoder(
            x_f,
            y_f,
            self.vocab_size,
            emb_size,
            init_hidden,
            init_cell,
            para_name='fw_',
            custom_samples=custom_samples,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)
        backward, bw_hiddens, bw_hiddens_ori, bw_cells, bw_projs = encoder(
            x_b,
            y_b,
            self.vocab_size,
            emb_size,
            init_hidden_r,
            init_cell_r,
            para_name='bw_',
            custom_samples=custom_samples_r,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)

        losses = layers.concat([forward[-1], backward[-1]])
        self.loss = layers.reduce_mean(losses)
        self.loss.permissions = True
        self.loss.persistable = True

        if args.debug:
            x_emb, projection, loss = forward
            layers.Print(init_cells, message='init_cells', summarize=10)
            layers.Print(init_hiddens, message='init_hiddens', summarize=10)
            layers.Print(init_cell, message='init_cell', summarize=10)
            layers.Print(y_b, message='y_b', summarize=10)
            layers.Print(x_emb, message='x_emb', summarize=10)
            layers.Print(projection, message='projection', summarize=10)
            layers.Print(losses, message='losses', summarize=320)
            layers.Print(self.loss, message='loss', summarize=320)
        self.grad_vars = [x_f, y_f, x_b, y_b, self.loss]
        self.grad_vars_name = ['x', 'y', 'x_r', 'y_r', 'final_loss']
        fw_vars_name = ['x_emb', 'proj', 'loss'] + [
            'init_hidden', 'init_cell'
        ] + ['rnn_out', 'rnn_out2', 'cell', 'cell2', 'xproj', 'xproj2']
        bw_vars_name = ['x_emb_r', 'proj_r', 'loss_r'] + [
            'init_hidden_r', 'init_cell_r'
        ] + [
            'rnn_out_r', 'rnn_out2_r', 'cell_r', 'cell2_r', 'xproj_r',
            'xproj2_r'
        ]
        fw_vars = forward + [init_hidden, init_cell
                             ] + fw_hiddens + fw_cells + fw_projs
        bw_vars = backward + [init_hidden_r, init_cell_r
                              ] + bw_hiddens + bw_cells + bw_projs
        for i in range(len(fw_vars_name)):
            self.grad_vars.append(fw_vars[i])
            self.grad_vars.append(bw_vars[i])
            self.grad_vars_name.append(fw_vars_name[i])
            self.grad_vars_name.append(bw_vars_name[i])
        if args.use_custom_samples:
            self.feed_order = [
                'x', 'y', 'x_r', 'y_r', 'custom_samples', 'custom_samples_r',
                'custom_probabilities'
            ]
        else:
            self.feed_order = ['x', 'y', 'x_r', 'y_r']
        self.last_hidden = [
            fluid.layers.sequence_last_step(input=x)
            for x in fw_hiddens_ori + bw_hiddens_ori
        ]
        self.last_cell = [
            fluid.layers.sequence_last_step(input=x)
            for x in fw_cells + bw_cells
        ]
        self.last_hidden = layers.concat(self.last_hidden, axis=0)
        self.last_hidden.persistable = True
        self.last_cell = layers.concat(self.last_cell, axis=0)
        self.last_cell.persistable = True
        if args.debug:
            layers.Print(self.last_cell, message='last_cell', summarize=10)
            layers.Print(self.last_hidden, message='last_hidden', summarize=10)