Ejemplo n.º 1
0
def infer(args):
    """
    Run the saved inference model over the test data and print tagged text.

    Each batch is decoded with the loaded model. Consecutive tokens are merged
    into one chunk until a tag ending in "-B" or "O" starts the next chunk, and
    every sentence is printed as space-separated "chunk/tag" pairs, one
    sentence per line.

    :param args: Parsed options. The attributes read here are word_dict_path,
                 label_dict_path, word_rep_dict_path, test_data_dir,
                 batch_size and model_path.
    """
    word2id_dict = reader.load_reverse_dict(args.word_dict_path)

    id2label_dict = reader.load_dict(args.label_dict_path)
    label2id_dict = reader.load_reverse_dict(args.label_dict_path)
    q2b_dict = reader.load_dict(args.word_rep_dict_path)
    test_data = paddle.batch(reader.test_reader(args.test_data_dir,
                                                word2id_dict, label2id_dict,
                                                q2b_dict),
                             batch_size=args.batch_size)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(args.model_path, exe)
        for data in test_data():
            # Each sample is indexed as x[0] (ids fed to the model) and
            # x[1] (the tokens used to rebuild the output text).
            word_idx = to_lodtensor([x[0] for x in data], place)
            word_list = [x[1] for x in data]
            (crf_decode, ) = exe.run(inference_program,
                                     feed={"word": word_idx},
                                     fetch_list=fetch_targets,
                                     return_numpy=False)
            # lod_info[k]:lod_info[k + 1] is the row range of sentence k in
            # the flattened decode result.
            lod_info = (crf_decode.lod())[0]
            np_data = np.array(crf_decode)
            assert len(data) == len(lod_info) - 1

            batch_lines = []
            for sen_index in xrange(len(data)):
                sen_begin = lod_info[sen_index]
                sen_end = lod_info[sen_index + 1]
                assert len(data[sen_index][0]) == sen_end - sen_begin

                words = word_list[sen_index]
                chunks = []
                cur_full_word = ""
                cur_full_tag = ""
                for word_index, tag_index in enumerate(
                        xrange(sen_begin, sen_end)):
                    cur_word = words[word_index]
                    cur_tag = id2label_dict[str(np_data[tag_index][0])]
                    if cur_tag.endswith(("-B", "O")):
                        # A new chunk starts here: flush the one being built.
                        if cur_full_word:
                            chunks.append(
                                cur_full_word.encode('utf8') + "/" +
                                cur_full_tag.encode('utf8'))
                        cur_full_word = cur_word
                        cur_full_tag = get_real_tag(cur_tag)
                    else:
                        cur_full_word += cur_word
                # Flush the trailing chunk of the sentence.
                chunks.append(
                    cur_full_word.encode('utf8') + "/" +
                    cur_full_tag.encode('utf8'))
                batch_lines.append(" ".join(chunks).strip())
            print("\n".join(batch_lines).strip())
Ejemplo n.º 2
0
def train(args):
    """
    Train the network.

    One batch is drawn from every corpus reader per step and the batches are
    merged and shuffled into a single training batch. Every
    ``args.save_model_per_batchs`` steps the inference model is saved and the
    train/test chunk metrics are printed. Training returns when the averaged
    test F1 of the newer half of the evaluation window is not better than that
    of the older half, or after ``args.num_iterations`` steps.

    :param args: Parsed options. The attributes read here are model_save_dir,
                 word_dict_path, label_dict_path, word_rep_dict_path,
                 base_learning_rate, corpus_type_list, corpus_proportion_list,
                 traindata_dir, traindata_shuffle_buffer, testdata_dir,
                 batch_size, use_gpu, save_model_per_batchs, eval_window and
                 num_iterations.
    """
    if not os.path.exists(args.model_save_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(args.model_save_dir)

    word2id_dict = reader.load_reverse_dict(args.word_dict_path)
    label2id_dict = reader.load_reverse_dict(args.label_dict_path)
    word_rep_dict = reader.load_dict(args.word_rep_dict_path)
    word_dict_len = max(map(int, word2id_dict.values())) + 1
    label_dict_len = max(map(int, label2id_dict.values())) + 1

    avg_cost, crf_decode, word, target = lex_net(args, word_dict_len,
                                                 label_dict_len)
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_learning_rate)
    sgd_optimizer.minimize(avg_cost)

    # Only the chunk counters are fetched below; the precision/recall/F1
    # outputs of chunk_eval are not used.
    (_, _, _, num_infer_chunks, num_label_chunks,
     num_correct_chunks) = fluid.layers.chunk_eval(
         input=crf_decode,
         label=target,
         chunk_scheme="IOB",
         num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
    chunk_evaluator = fluid.metrics.ChunkEvaluator()
    chunk_evaluator.reset()

    # One batched reader per corpus; each contributes its configured share
    # of args.batch_size to every training step.
    train_reader_list = []
    corpus_num = len(args.corpus_type_list)
    for i in xrange(corpus_num):
        train_reader = paddle.batch(
            paddle.reader.shuffle(reader.file_reader(args.traindata_dir,
                                                     word2id_dict,
                                                     label2id_dict,
                                                     word_rep_dict,
                                                     args.corpus_type_list[i]),
                                  buf_size=args.traindata_shuffle_buffer),
            batch_size=int(args.batch_size * args.corpus_proportion_list[i]))
        train_reader_list.append(train_reader)
    test_reader = paddle.batch(reader.file_reader(args.testdata_dir,
                                                  word2id_dict, label2id_dict,
                                                  word_rep_dict),
                               batch_size=args.batch_size)
    train_reader_itr_list = [
        train_reader() for train_reader in train_reader_list
    ]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, target], place=place)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    batch_id = 0
    start_time = time.time()
    eval_list = []
    iteration = 0
    # "x % n == 1" can never hold for n == 1, so compare against 1 % n: this
    # is the same trigger for n > 1 and fires on every batch for n == 1.
    save_remainder = 1 % args.save_model_per_batchs
    while True:
        full_batch = []
        for i in xrange(corpus_num):
            try:
                cur_batch = next(train_reader_itr_list[i])
            except StopIteration:
                print(args.corpus_type_list[i] +
                      " corpus finish a pass of training")
                # This corpus is exhausted: restart its reader.
                train_reader_itr_list[i] = train_reader_list[i]()
                cur_batch = next(train_reader_itr_list[i])
            full_batch += cur_batch
        random.shuffle(full_batch)

        cost_var, nums_infer, nums_label, nums_correct = exe.run(
            fluid.default_main_program(),
            fetch_list=[
                avg_cost, num_infer_chunks, num_label_chunks,
                num_correct_chunks
            ],
            feed=feeder.feed(full_batch))
        print("batch_id:" + str(batch_id) + ", avg_cost:" + str(cost_var[0]))
        chunk_evaluator.update(nums_infer, nums_label, nums_correct)
        batch_id += 1

        if batch_id % args.save_model_per_batchs == save_remainder:
            save_exe = fluid.Executor(place)
            save_dirname = os.path.join(args.model_save_dir,
                                        "params_batch_%d" % batch_id)
            fluid.io.save_inference_model(save_dirname, ['word'], [crf_decode],
                                          save_exe)
            # A second model that outputs the chunk counters is saved so that
            # test() can evaluate on the test set.
            temp_save_model = os.path.join(args.model_save_dir,
                                           "temp_model_for_test")
            fluid.io.save_inference_model(
                temp_save_model, ['word', 'target'],
                [num_infer_chunks, num_label_chunks, num_correct_chunks],
                save_exe)

            precision, recall, f1_score = chunk_evaluator.eval()
            print("[train] batch_id:" + str(batch_id) + ", precision:" +
                  str(precision) + ", recall:" + str(recall) + ", f1:" +
                  str(f1_score))
            chunk_evaluator.reset()
            p, r, f1 = test(exe, chunk_evaluator, temp_save_model, test_reader,
                            place)
            chunk_evaluator.reset()
            print("[test] batch_id:" + str(batch_id) + ", precision:" +
                  str(p) + ", recall:" + str(r) + ", f1:" + str(f1))
            end_time = time.time()
            print("cur_batch_id:" + str(batch_id) + ", last " +
                  str(args.save_model_per_batchs) + " batchs, time_cost:" +
                  str(end_time - start_time))
            start_time = time.time()

            # Early stopping: keep the last 2 * eval_window test F1 scores
            # and stop once the newer half's average is not above the
            # older half's.
            if len(eval_list) < 2 * args.eval_window:
                eval_list.append(f1)
            else:
                eval_list.pop(0)
                eval_list.append(f1)
                last_avg_f1 = sum(
                    eval_list[0:args.eval_window]) / args.eval_window
                cur_avg_f1 = sum(
                    eval_list[args.eval_window:2 *
                              args.eval_window]) / args.eval_window
                if cur_avg_f1 <= last_avg_f1:
                    return
                else:
                    print("keep training!")
        iteration += 1
        if iteration == args.num_iterations:
            return
Ejemplo n.º 3
0
    results = []
    # get out data from output tensor
    output_names = predictor.get_output_names()
    for i, name in enumerate(output_names):
        output_tensor = predictor.get_output_tensor(name)
        output_data = output_tensor.copy_to_cpu()
        results.append(output_data)
    return results


# Script entry point: load the dictionaries, build the predictor and iterate
# over the test data one sample at a time.
if __name__ == '__main__':

    args = parse_args()
    # Dictionaries mapping tokens / labels to ids, plus the dictionary loaded
    # from word_rep_dict_path that is handed to the file reader.
    word2id_dict = reader.load_reverse_dict(args.word_dict_path)
    label2id_dict = reader.load_reverse_dict(args.label_dict_path)
    word_rep_dict = reader.load_dict(args.word_rep_dict_path)
    # Sizes are derived from the largest id stored in each dictionary.
    word_dict_len = max(map(int, word2id_dict.values())) + 1
    label_dict_len = max(map(int, label2id_dict.values())) + 1

    pred = create_predictor(args)

    # batch_size=1: every batch yielded below holds a single sample.
    test_data = paddle.batch(reader.file_reader(args.testdata_dir,
                                                word2id_dict, label2id_dict,
                                                word_rep_dict),
                             batch_size=1)
    batch_id = 0
    # Inverse lookups (id -> token, id -> label).
    id2word = {v: k for k, v in word2id_dict.items()}
    id2label = {v: k for k, v in label2id_dict.items()}
    for data in test_data():
        batch_id += 1
        # x[0] of every sample is passed to to_lodtensor, whose result is
        # unpacked into the data and its LoD.
        # NOTE(review): the rest of this loop body is not visible here.
        word_data, word_lod = to_lodtensor(list(map(lambda x: x[0], data)))
Ejemplo n.º 4
0
def train(train_data_path,
          test_data_path,
          src_dict_path,
          trg_dict_path,
          enc_conv_blocks,
          dec_conv_blocks,
          emb_dim=256,
          pos_size=200,
          drop_rate=0.,
          use_bn=False,
          batch_size=32,
          num_passes=15):
    """
    Train the convolution sequence-to-sequence model.

    The training cost is printed every 20 batches. At the end of every pass
    the test cost is printed (when a test reader is available) and the
    parameters are saved to ``output/params.pass-<pass_id>.tar.gz``.

    :param train_data_path: The path of the training set.
    :type train_data_path: str
    :param test_data_path: The path of the test set.
    :type test_data_path: str
    :param src_dict_path: The path of the source dictionary.
    :type src_dict_path: str
    :param trg_dict_path: The path of the target dictionary.
    :type trg_dict_path: str
    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
                            the list contains output dimension and context length of the corresponding
                            convolution block.
    :type enc_conv_blocks: list of tuple
    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
                            the list contains output dimension and context length of the corresponding
                            convolution block.
    :type dec_conv_blocks: list of tuple
    :param emb_dim: The dimension of the embedding vector.
    :type emb_dim: int
    :param pos_size: The total number of the position indexes, which means
                     the maximum value of the index is pos_size - 1.
    :type pos_size: int
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :param use_bn: Whether to use batch normalization or not. False is the default value.
    :type use_bn: bool
    :param batch_size: The size of a mini-batch.
    :type batch_size: int
    :param num_passes: The total number of the passes to train.
    :type num_passes: int
    """
    # load dict
    src_dict = reader.load_dict(src_dict_path)
    trg_dict = reader.load_dict(trg_dict_path)
    src_dict_size = len(src_dict)
    trg_dict_size = len(trg_dict)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-3, )

    cost = conv_seq2seq(src_dict_size=src_dict_size,
                        trg_dict_size=trg_dict_size,
                        pos_size=pos_size,
                        emb_dim=emb_dim,
                        enc_conv_blocks=enc_conv_blocks,
                        dec_conv_blocks=dec_conv_blocks,
                        drop_rate=drop_rate,
                        with_bn=use_bn,
                        is_infer=False)

    # create parameters and trainer
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer)

    # Total padding is the sum of (context_len - 1) over the decoder blocks.
    # sum() yields 0 for an empty block list, where reduce() without an
    # initial value would raise.
    padding_num = sum(context_len - 1 for (_, context_len) in dec_conv_blocks)
    train_reader, test_reader = create_reader(padding_num=padding_num,
                                              train_data_path=train_data_path,
                                              test_data_path=test_data_path,
                                              src_dict=src_dict,
                                              trg_dict=trg_dict,
                                              pos_size=pos_size,
                                              batch_size=batch_size)

    # Position of every data layer inside one sample yielded by the readers.
    feeding = {
        'src_word': 0,
        'src_word_pos': 1,
        'trg_word': 2,
        'trg_word_pos': 3,
        'trg_next_word': 4
    }

    # create event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 20 == 0:
                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
                print("[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
                    cur_time, event.pass_id, event.batch_id, event.cost,
                    event.metrics))
                sys.stdout.flush()

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
                result = trainer.test(reader=test_reader, feeding=feeding)
                print("[%s]: Pass: %d, TestCost: %f, %s" % (
                    cur_time, event.pass_id, result.cost, result.metrics))
                sys.stdout.flush()
            # Save the parameters after every pass.
            with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
                           'w') as f:
                trainer.save_parameter_to_tar(f)

    # The event handler writes into 'output', so it must exist before
    # training starts.
    if not os.path.exists('output'):
        os.mkdir('output')

    trainer.train(reader=train_reader,
                  event_handler=event_handler,
                  num_passes=num_passes,
                  feeding=feeding)
Ejemplo n.º 5
0
def infer(infer_data_path,
          src_dict_path,
          trg_dict_path,
          model_path,
          enc_conv_blocks,
          dec_conv_blocks,
          emb_dim=256,
          pos_size=200,
          drop_rate=0.,
          use_bn=False,
          max_len=100,
          batch_size=1,
          beam_size=1,
          is_show_attention=False):
    """
    Inference.

    When is_show_attention is True, only the attention weights of the first
    sample are printed and the function returns. Otherwise every sample is
    passed to beam search.

    :param infer_data_path: The path of the data for inference.
    :type infer_data_path: str
    :param src_dict_path: The path of the source dictionary.
    :type src_dict_path: str
    :param trg_dict_path: The path of the target dictionary.
    :type trg_dict_path: str
    :param model_path: The path of a trained model.
    :type model_path: str
    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
                            the list contains output dimension and context length of the corresponding
                            convolution block.
    :type enc_conv_blocks: list of tuple
    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
                            the list contains output dimension and context length of the corresponding
                            convolution block.
    :type dec_conv_blocks: list of tuple
    :param emb_dim: The dimension of the embedding vector.
    :type emb_dim: int
    :param pos_size: The total number of the position indexes, which means
                     the maximum value of the index is pos_size - 1.
    :type pos_size: int
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :param use_bn: Whether to use batch normalization or not. False is the default value.
    :type use_bn: bool
    :param max_len: The maximum length of the sentence to be generated.
    :type max_len: int
    :param batch_size: The batch size handed to the beam searcher.
    :type batch_size: int
    :param beam_size: The width of beam expansion.
    :type beam_size: int
    :param is_show_attention: Whether to show attention weight or not. False is the default value.
    :type is_show_attention: bool
    """
    # load dict
    src_dict = reader.load_dict(src_dict_path)
    trg_dict = reader.load_dict(trg_dict_path)
    src_dict_size = len(src_dict)
    trg_dict_size = len(trg_dict)

    prob, weight = conv_seq2seq(src_dict_size=src_dict_size,
                                trg_dict_size=trg_dict_size,
                                pos_size=pos_size,
                                emb_dim=emb_dim,
                                enc_conv_blocks=enc_conv_blocks,
                                dec_conv_blocks=dec_conv_blocks,
                                drop_rate=drop_rate,
                                with_bn=use_bn,
                                is_infer=True)

    # load parameters; the context manager closes the archive once loaded
    with gzip.open(model_path) as model_file:
        parameters = paddle.parameters.Parameters.from_tar(model_file)

    # Total padding is the sum of (context_len - 1) over the decoder blocks.
    # sum() yields 0 for an empty block list, where reduce() without an
    # initial value would raise.
    padding_num = sum(context_len - 1 for (_, context_len) in dec_conv_blocks)
    infer_reader = reader.data_reader(data_file=infer_data_path,
                                      src_dict=src_dict,
                                      trg_dict=trg_dict,
                                      pos_size=pos_size,
                                      padding_num=padding_num)

    if is_show_attention:
        attention_inferer = paddle.inference.Inference(output_layer=weight,
                                                       parameters=parameters)
        for data in infer_reader():
            src_len = len(data[0])
            trg_len = len(data[2])
            attention_weight = attention_inferer.infer([data],
                                                       field='value',
                                                       flatten_result=False)
            # The loop variable is deliberately not named `weight`, so the
            # attention layer bound above is not shadowed.
            attention_weight = [
                layer_weight.reshape((trg_len, src_len))
                for layer_weight in attention_weight
            ]
            print(attention_weight)
            # Only the first sample is shown.
            break
        return

    # Keep only the first two fields of every sample for the searcher.
    infer_data = [[raw_data[0], raw_data[1]] for raw_data in infer_reader()]

    inferer = paddle.inference.Inference(output_layer=prob,
                                         parameters=parameters)

    searcher = BeamSearch(inferer=inferer,
                          trg_dict=trg_dict,
                          pos_size=pos_size,
                          padding_num=padding_num,
                          max_len=max_len,
                          batch_size=batch_size,
                          beam_size=beam_size)

    searcher.search(infer_data)