# Example 1
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
    """Load a trained NER model and print predicted tags for a test corpus.

    Reads the test file in batches of `batch_size`, runs inference with the
    parameters stored in the gzipped tarball at `model_path`, and prints one
    "word<TAB>label" line per token, with a blank separator between samples.

    Args:
        model_path: path to a gzipped parameters tarball.
        batch_size: number of samples per inference batch.
        test_data_file: path to the test corpus.
        vocab_file: word vocabulary file (used both forward and reversed).
        target_file: label vocabulary file (used both forward and reversed).
    """

    def _infer_a_batch(inferer, test_data, id_2_word, id_2_label):
        # `probs` holds one predicted tag id per token for the whole batch,
        # concatenated in sample order.
        probs = inferer.infer(input=test_data, field=["id"])
        assert len(probs) == sum(len(x[0]) for x in test_data)

        # BUG FIX: start_id must be initialized once per batch, not once per
        # sample. The original reset it to 0 inside the loop, so every sample
        # printed the first sample's predictions.
        start_id = 0
        for test_sample in test_data:
            sample_len = len(test_sample[0])
            for w, tag in zip(test_sample[0],
                              probs[start_id:start_id + sample_len]):
                print("%s\t%s" % (id_2_word[w], id_2_label[tag]))
            print("\n")  # blank separator between samples
            start_id += sample_len

    word_dict = load_dict(vocab_file)
    word_dict_len = len(word_dict)
    word_reverse_dict = load_reverse_dict(vocab_file)

    label_dict = load_dict(target_file)
    label_reverse_dict = load_reverse_dict(target_file)
    label_dict_len = len(label_dict)

    # initialize PaddlePaddle
    paddle.init(use_gpu=False, trainer_count=1)
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(model_path, "r"))

    predict = ner_net(word_dict_len=word_dict_len,
                      label_dict_len=label_dict_len,
                      is_train=False)

    inferer = paddle.inference.Inference(output_layer=predict,
                                         parameters=parameters)

    test_data = []
    for item in reader.data_reader(test_data_file, word_dict, label_dict)():
        test_data.append([item[0], item[1]])
        if len(test_data) == batch_size:
            _infer_a_batch(inferer, test_data, word_reverse_dict,
                           label_reverse_dict)
            test_data = []

    # Flush the final, possibly partial batch (skip if it is empty).
    if test_data:
        _infer_a_batch(inferer, test_data, word_reverse_dict,
                       label_reverse_dict)
# Example 2
def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
         model_save_dir, num_passes, use_gpu, parallel):
    """Train the fluid NER model, log per-pass metrics, and record CE KPIs.

    Runs SGD training for `num_passes` passes, evaluates on the test set
    after every pass, saves an inference model per pass, and records the
    precision/duration KPIs on the final pass.
    """
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    batch_size = 200
    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)
    embedding_values = get_embedding(emb_file)

    avg_cost, feature_out, word, mark, target = ner_net(
        len(word_dict), len(label_dict), parallel)

    optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
    optimizer.minimize(avg_cost)

    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
        input=crf_decode,
        label=target,
        chunk_scheme="IOB",
        num_chunk_types=int(math.ceil((len(label_dict) - 1) / 2.0)))

    # Clone the main program before pruning it down to the evaluation targets.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            chunk_evaluator.metrics + chunk_evaluator.states)

    def _batched_reader(data_file):
        # Shuffle within a 20k-sample buffer, then batch.
        shuffled = paddle.reader.shuffle(
            reader.data_reader(data_file, word_dict, label_dict),
            buf_size=20000)
        return paddle.batch(shuffled, batch_size=batch_size)

    train_reader = _batched_reader(train_data_file)
    test_reader = _batched_reader(test_data_file)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # Overwrite the 'emb' parameter with the pre-trained word embeddings.
    emb_tensor = fluid.global_scope().find_var('emb').get_tensor()
    emb_tensor.set(embedding_values, place)

    batch_id = 0
    total_time = 0.0
    for pass_id in xrange(num_passes):
        chunk_evaluator.reset(exe)
        pass_start = time.time()
        for data in train_reader():
            exe.run(fluid.default_main_program(),
                    feed=feeder.feed(data),
                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
            batch_id += 1
        total_time += time.time() - pass_start
        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
        if pass_id == num_passes - 1:
            # Record KPIs only once, from the final pass.
            train_acc_kpi.add_record(pass_precision)
            pass_duration_kpi.add_record(total_time / num_passes)
        if pass_id % 100 == 0:
            print("[TrainSet] pass_id:%s pass_precision:%s pass_recall:%s"
                  " pass_f1_score:%s" % (pass_id, pass_precision, pass_recall,
                                         pass_f1_score))
        pass_precision, pass_recall, pass_f1_score = test(
            exe, chunk_evaluator, inference_program, test_reader, place)
        if pass_id % 100 == 0:
            print("[TestSet] pass_id:%s pass_precision:%s pass_recall:%s"
                  " pass_f1_score:%s" % (pass_id, pass_precision, pass_recall,
                                         pass_f1_score))

        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
        fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
                                      [crf_decode], exe)
    train_acc_kpi.persist()
    pass_duration_kpi.persist()
# Example 3
def main(train_data_file,
         test_data_file,
         vocab_file,
         target_file,
         emb_file,
         model_save_dir,
         num_passes=10,
         batch_size=32):
    """Train the Paddle v2 NER model with a CRF cost and chunk evaluation.

    Builds the network via ner_net, seeds the 'emb' parameter with
    pre-trained embeddings, trains with Momentum SGD, tests after every
    batch and every pass, and saves a gzipped parameter tarball per pass.

    Args:
        train_data_file: path to the training corpus.
        test_data_file: path to the test corpus.
        vocab_file: word vocabulary file.
        target_file: label vocabulary file.
        emb_file: pre-trained word-embedding matrix file.
        model_save_dir: directory for "params_pass_N.tar.gz" snapshots.
        num_passes: number of training passes.
        batch_size: samples per mini-batch.
    """
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)

    word_vector_values = get_embedding(emb_file)

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)

    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    crf_cost, crf_dec, target = ner_net(word_dict_len, label_dict_len)
    evaluator.sum(name="error", input=crf_dec)
    # NOTE(review): this division is integer division under Python 2 (the
    # `xrange`-era API used here suggests Python 2); under Python 3 it would
    # produce a float — confirm the intended interpreter.
    evaluator.chunk(
        name="ner_chunk",
        input=crf_dec,
        label=target,
        chunk_scheme="IOB",
        num_chunk_types=(label_dict_len - 1) / 2)

    # create parameters
    parameters = paddle.parameters.create(crf_cost)
    # Seed the embedding layer with the pre-trained vectors.
    parameters.set("emb", word_vector_values)

    # create optimizer
    optimizer = paddle.optimizer.Momentum(
        momentum=0,
        learning_rate=2e-4,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        gradient_clipping_threshold=25,
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=10000), )

    trainer = paddle.trainer.SGD(
        cost=crf_cost,
        parameters=parameters,
        update_equation=optimizer,
        extra_layers=crf_dec)

    # Shuffle within a 1000-sample buffer, then batch.
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(train_data_file, word_dict, label_dict),
            buf_size=1000),
        batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(test_data_file, word_dict, label_dict),
            buf_size=1000),
        batch_size=batch_size)

    # Map data-tuple positions to the network's input layer names.
    feeding = {"word": 0, "mark": 1, "target": 2}

    def event_handler(event):
        # NOTE(review): both `% 1 == 0` conditions are always true, so this
        # logs and runs a full test pass after EVERY batch — presumably a
        # leftover debug setting; confirm before production use.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 1 == 0:
                logger.info("Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            if event.batch_id % 1 == 0:
                result = trainer.test(reader=test_reader, feeding=feeding)
                logger.info("\nTest with Pass %d, Batch %d, %s" %
                            (event.pass_id, event.batch_id, result.metrics))

        if isinstance(event, paddle.event.EndPass):
            # save parameters
            with gzip.open(
                    os.path.join(model_save_dir, "params_pass_%d.tar.gz" %
                                 event.pass_id), "w") as f:
                parameters.to_tar(f)

            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" % (event.pass_id,
                                                     result.metrics))

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        num_passes=num_passes,
        feeding=feeding)
def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
         model_save_dir, num_passes, use_gpu, parallel):
    """Train the fluid NER model either locally or in distributed mode.

    When env LOCAL == "TRUE", runs the plain single-process training loop.
    Otherwise the program is transpiled with DistributeTranspiler and this
    process acts as a parameter server or a trainer, selected by the
    TRAINING_ROLE environment variable.

    Args:
        train_data_file: path to the training corpus.
        test_data_file: path to the test corpus.
        vocab_file: word vocabulary file.
        target_file: label vocabulary file.
        emb_file: pre-trained word-embedding matrix file.
        model_save_dir: directory for saved models (per-pass saving is
            currently commented out in train_loop).
        num_passes: number of training passes.
        use_gpu: run on CUDAPlace(0) when true, else CPUPlace.
        parallel: forwarded to ner_net.
    """
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    # Batch size is tunable via the environment for benchmarking runs.
    BATCH_SIZE = int(os.getenv("BATCH_SIZE", "200"))
    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)

    word_vector_values = get_embedding(emb_file)

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)

    avg_cost, feature_out, word, mark, target = ner_net(
        word_dict_len, label_dict_len, parallel)

    # Learning rate is divided by the batch size so the effective per-sample
    # step stays constant across BATCH_SIZE settings.
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3 / BATCH_SIZE)
    # Keep the optimize ops and gradients: DistributeTranspiler needs them.
    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)

    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
        input=crf_decode,
        label=target,
        chunk_scheme="IOB",
        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))

    # Clone the main program and prune it to the evaluation targets.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        test_target = chunk_evaluator.metrics + chunk_evaluator.states
        inference_program = fluid.io.get_inference_program(test_target)

    train_reader = paddle.batch(paddle.reader.shuffle(reader.data_reader(
        train_data_file, word_dict, label_dict),
                                                      buf_size=20000),
                                batch_size=BATCH_SIZE)
    test_reader = paddle.batch(paddle.reader.shuffle(reader.data_reader(
        test_data_file, word_dict, label_dict),
                                                     buf_size=20000),
                               batch_size=BATCH_SIZE)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)

    def train_loop(exe, trainer_prog, trainer_id=0, reader=train_reader):
        """Run the full training loop for `num_passes` passes under a profiler."""
        # Overwrite the 'emb' parameter with the pre-trained embeddings.
        embedding_name = 'emb'
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(word_vector_values, place)

        batch_id = 0
        for pass_id in xrange(num_passes):
            chunk_evaluator.reset(exe)
            start_time = time.time()
            # NOTE(review): the profile_path is hard-coded to a GPU-image
            # location — confirm it is writable in the target environment.
            with profiler.profiler(
                    "CPU", 'total',
                    profile_path="/usr/local/nvidia/lib64/tmp") as prof:
                for data in reader():
                    cost, batch_precision, batch_recall, batch_f1_score = exe.run(
                        trainer_prog,
                        feed=feeder.feed(data),
                        fetch_list=[avg_cost] + chunk_evaluator.metrics)
                    if batch_id % 5 == 0:
                        print("Pass " + str(pass_id) + ", Batch " +
                              str(batch_id) + ", Cost " + str(cost[0]) +
                              ", Precision " + str(batch_precision[0]) +
                              ", Recall " + str(batch_recall[0]) +
                              ", F1_score" + str(batch_f1_score[0]))
                    batch_id = batch_id + 1

                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
                    exe)
                spent = time.time() - start_time
                # 14987.0 appears to be the dataset's sample count used to
                # report samples/sec — TODO confirm against the corpus size.
                print("pass_id: %d, precision: %f, recall: %f, f1: %f, spent: %f, speed: %f" % \
                      (pass_id, pass_precision, pass_recall, pass_f1_score,
                      spent, 14987.0 / spent))
                pass_precision, pass_recall, pass_f1_score = test(
                    exe, chunk_evaluator, inference_program, test_reader,
                    place)
                print("[TestSet] pass_id:" + str(pass_id) +
                      " pass_precision:" + str(pass_precision) +
                      " pass_recall:" + str(pass_recall) + " pass_f1_score:" +
                      str(pass_f1_score))

                # save_dirname = os.path.join(model_save_dir,
                #     "params_pass_%d_trainer%d" % (pass_id, trainer_id))
                # fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
                #                             [crf_decode], exe)

    # Dump the untranspiled program for debugging/diffing.
    with open("/tmp/origin_prog", "w") as fn:
        fn.write(fluid.default_main_program().__str__())

    if os.getenv("LOCAL") == "TRUE":
        exe.run(fluid.default_startup_program())
        train_loop(exe, fluid.default_main_program())
    else:
        pserver_ips = os.getenv(
            "PADDLE_INIT_PSERVERS")  # all pserver endpoints
        eplist = []
        port = os.getenv("PADDLE_INIT_PORT")
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        trainers = int(os.getenv("TRAINERS"))  # total trainer count
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", "0"))
        current_endpoint = os.getenv(
            "POD_IP") + ":" + port  # this pod's endpoint (pserver or trainer)
        training_role = os.getenv(
            "TRAINING_ROLE",
            "TRAINER")  # get the training role: trainer/pserver
        t = fluid.DistributeTranspiler()
        t.transpile(optimize_ops,
                    params_grads,
                    trainer_id,
                    pservers=pserver_endpoints,
                    trainers=trainers)

        print("endpoints: %s, current: %s, trainers: %d, trainer_id: %d, role: %s" %\
              (pserver_endpoints, current_endpoint, trainers, trainer_id, training_role))
        if training_role == "PSERVER":
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            print("######## pserver prog #############")
            with open("/tmp/pserver_prog", "w") as f:
                f.write(pserver_prog.__str__())
            print("######## pserver prog #############")
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            with open("/tmp/pserver_startup", "w") as f:
                f.write(pserver_startup.__str__())
            print("starting server side startup")
            exe.run(pserver_startup)
            print("starting parameter server...")
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            exe.run(fluid.default_startup_program())
            trainer_prog = t.get_trainer_program()
            # Each trainer reads its own shard of the training data.
            cluster_train_reader = paddle.batch(paddle.reader.shuffle(
                reader.cluster_data_reader(train_data_file, word_dict,
                                           label_dict, trainers, trainer_id),
                buf_size=20000),
                                                batch_size=BATCH_SIZE)
            print("######## trainer prog #############")
            with open("/tmp/trainer_prog", "w") as f:
                f.write(trainer_prog.__str__())
            print("######## trainer prog #############")
            train_loop(exe, trainer_prog, trainer_id, cluster_train_reader)
        else:
            # BUG FIX: the message previously named the wrong env var
            # ("TRAINER_ROLE") and contained a typo ("os" for "or").
            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
# Example 5
def main(train_data_file,
         test_data_file,
         vocab_file,
         target_file,
         emb_file,
         model_save_dir,
         num_passes,
         use_gpu,
         parallel,
         batch_size=200):
    """Train the fluid NER model using chunk_eval metrics and save per pass.

    In CE mode (env CE_MODE_X set) the run is made deterministic: a fixed
    random seed, no reader shuffling, no model saving, and KPI lines printed
    at the end.

    Args:
        train_data_file: path to the training corpus.
        test_data_file: path to the test corpus.
        vocab_file: word vocabulary file.
        target_file: label vocabulary file.
        emb_file: pre-trained word-embedding matrix file.
        model_save_dir: directory for "params_pass_N" inference models.
        num_passes: number of training passes.
        use_gpu: run on CUDAPlace(0) when true, else CPUPlace.
        parallel: forwarded to ner_net.
        batch_size: samples per mini-batch.
    """
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)

    word_vector_values = get_embedding(emb_file)

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)

    # Fix the seed in CE mode so runs are reproducible.
    if "CE_MODE_X" in os.environ:
        fluid.default_startup_program().random_seed = 110

    avg_cost, feature_out, word, mark, target = ner_net(
        word_dict_len, label_dict_len, parallel)

    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
     num_correct_chunks) = fluid.layers.chunk_eval(
         input=crf_decode,
         label=target,
         chunk_scheme="IOB",
         num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
    chunk_evaluator = fluid.metrics.ChunkEvaluator()

    # Clone for inference BEFORE adding the optimizer ops, so the test
    # program contains no backward/update operators.
    inference_program = fluid.default_main_program().clone(for_test=True)
    test_fetch_list = [num_infer_chunks, num_label_chunks, num_correct_chunks]
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
    sgd_optimizer.minimize(avg_cost)

    # CE mode skips shuffling to keep results deterministic.
    if "CE_MODE_X" not in os.environ:
        train_reader = paddle.batch(paddle.reader.shuffle(reader.data_reader(
            train_data_file, word_dict, label_dict),
                                                          buf_size=20000),
                                    batch_size=batch_size)
        test_reader = paddle.batch(paddle.reader.shuffle(reader.data_reader(
            test_data_file, word_dict, label_dict),
                                                         buf_size=20000),
                                   batch_size=batch_size)
    else:
        train_reader = paddle.batch(reader.data_reader(train_data_file,
                                                       word_dict, label_dict),
                                    batch_size=batch_size)
        test_reader = paddle.batch(reader.data_reader(test_data_file,
                                                      word_dict, label_dict),
                                   batch_size=batch_size)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)

    exe.run(fluid.default_startup_program())

    # Overwrite the 'emb' parameter with the pre-trained embeddings.
    embedding_name = 'emb'
    embedding_param = fluid.global_scope().find_var(
        embedding_name).get_tensor()
    embedding_param.set(word_vector_values, place)

    time_begin = time.time()
    for pass_id in six.moves.xrange(num_passes):
        chunk_evaluator.reset()
        for batch_id, data in enumerate(train_reader()):
            cost_var, nums_infer, nums_label, nums_correct = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[
                    avg_cost, num_infer_chunks, num_label_chunks,
                    num_correct_chunks
                ])
            if batch_id % 5 == 0:
                print("Pass " + str(pass_id) + ", Batch " + str(batch_id) +
                      ", Cost " + str(cost_var[0]))
            chunk_evaluator.update(nums_infer, nums_label, nums_correct)
        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval()
        print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
              str(pass_precision) + " pass_recall:" + str(pass_recall) +
              " pass_f1_score:" + str(pass_f1_score))

        test_pass_precision, test_pass_recall, test_pass_f1_score = test(
            exe, chunk_evaluator, inference_program, test_reader,
            test_fetch_list, place)
        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
              str(test_pass_precision) + " pass_recall:" +
              str(test_pass_recall) + " pass_f1_score:" +
              str(test_pass_f1_score))

        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
        if "CE_MODE_X" not in os.environ:
            # CONSISTENCY FIX: save_inference_model takes a *list* of target
            # variables (as done elsewhere in this file); the original passed
            # the bare Variable.
            fluid.io.save_inference_model(save_dirname, ['word', 'mark'],
                                          [crf_decode], exe)

    if "CE_MODE_X" in os.environ:
        # KPI lines are tab-separated: "kpis<TAB>name<TAB>value".
        print("kpis\ttrain_precision\t%f" % pass_precision)
        print("kpis\ttest_precision\t%f" % test_pass_precision)
        print("kpis\ttrain_duration\t%f" % (time.time() - time_begin))