# Example 1 (score: 0)
def train(train_file_list_path, test_file_list_path, label_dict_path, model_save_dir):
    """Train the OCR model, testing and checkpointing at the end of each pass.

    :param train_file_list_path: path of the file listing training samples
    :param test_file_list_path: path of the file listing testing samples
    :param label_dict_path: path of the label dictionary; built automatically
        from the training data when the file does not exist
    :param model_save_dir: directory where model parameters are saved
    """
    # Ensure the checkpoint directory exists up front (previously this was
    # re-checked inside the event handler at the end of every pass; hoisting
    # it here matches the sibling train() implementations in this file).
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    # Get the training and testing file lists.
    train_file_list = get_file_list(train_file_list_path)
    test_file_list = get_file_list(test_file_list_path)

    # Build the label dictionary from the training data when it is missing.
    if not os.path.exists(label_dict_path):
        print(("Label dictionary is not given, the dictionary "
               "is automatically built from the training data."))
        build_label_dict(train_file_list, label_dict_path)

    # Load the label dictionary and record its size.
    char_dict = load_dict(label_dict_path)
    dict_size = len(char_dict)
    # Define the network topology.
    model = Model(dict_size, IMAGE_SHAPE, is_infer=False)

    # Initialize PaddlePaddle.
    paddle.init(use_gpu=True, trainer_count=1)
    # Create the optimizer.
    optimizer = paddle.optimizer.Momentum(
        momentum=0.9,
        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128),
        learning_rate=0.001 / 128,
        learning_rate_decay_a=0.1,
        learning_rate_decay_b=128000 * 35,
        learning_rate_schedule="discexp", )
    # Create all the trainable parameters.
    params = paddle.parameters.create(model.cost)
    # Define the trainer.
    trainer = paddle.trainer.SGD(cost=model.cost,
                                 parameters=params,
                                 update_equation=optimizer,
                                 extra_layers=model.eval)

    # Data reader and the mapping from data layers to reader columns.
    my_reader = Reader(char_dict=char_dict, image_shape=IMAGE_SHAPE)
    feeding = {'image': 0, 'label': 1}

    def event_handler(event):
        """Log cost periodically; test and save parameters at end of pass."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print("Pass %d, batch %d, Samples %d, Cost %f" %
                      (event.pass_id, event.batch_id, event.batch_id *
                       BATCH_SIZE, event.cost))

        if isinstance(event, paddle.event.EndPass):
            # Training and testing data share the same format, so the
            # train_reader is reused to read the testing data.
            test_reader = paddle.batch(
                my_reader.train_reader(test_file_list),
                batch_size=BATCH_SIZE)
            result = trainer.test(reader=test_reader, feeding=feeding)
            print("Test %d, Cost %f" % (event.pass_id, result.cost))
            # NOTE: the same archive is overwritten every pass, so only the
            # most recent parameters are kept on disk.
            with gzip.open(
                    os.path.join(model_save_dir, "params_pass.tar.gz"), "w") as f:
                trainer.save_parameter_to_tar(f)

    # Batched reader over the shuffled training data.
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            my_reader.train_reader(train_file_list),
            buf_size=1000),
        batch_size=BATCH_SIZE)
    # Start training.
    trainer.train(reader=train_reader,
                  feeding=feeding,
                  event_handler=event_handler,
                  num_passes=1000)
# Example 2 (score: 0)
def train(train_file_list_path, test_file_list_path, label_dict_path,
          model_save_dir):
    """Train the model, testing and saving a checkpoint after every pass.

    :param train_file_list_path: path of the file listing training samples
    :param test_file_list_path: path of the file listing testing samples
    :param label_dict_path: path of the label dictionary; built automatically
        from the training data when the file does not exist
    :param model_save_dir: directory receiving per-pass parameter archives
    """
    # Make sure the checkpoint directory exists.
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    # File lists for training and testing.
    training_files = get_file_list(train_file_list_path)
    testing_files = get_file_list(test_file_list_path)

    # Build the label dictionary from the training data when it is absent.
    if not os.path.exists(label_dict_path):
        print(("Label dictionary is not given, the dictionary "
               "is automatically built from the training data."))
        build_label_dict(training_files, label_dict_path)

    label_map = load_dict(label_dict_path)
    generator = DataGenerator(char_dict=label_map,
                              image_shape=conf.image_shape)

    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
    # Optimizer, network topology and trainable parameters.
    momentum_opt = paddle.optimizer.Momentum(momentum=conf.momentum)
    net = Model(len(label_map), conf.image_shape, is_infer=False)
    net_params = paddle.parameters.create(net.cost)

    sgd_trainer = paddle.trainer.SGD(cost=net.cost,
                                     parameters=net_params,
                                     update_equation=momentum_opt,
                                     extra_layers=net.eval)
    # Mapping from data layers to reader columns.
    feeding = {'image': 0, 'label': 1}

    def event_handler(event):
        """Log cost periodically; test and checkpoint at the end of a pass."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % conf.log_period == 0:
                print("Pass %d, batch %d, Samples %d, Cost %f, Eval %s" %
                      (event.pass_id, event.batch_id, event.batch_id *
                       conf.batch_size, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            # Training and testing data share one format, so the training
            # reader is reused to read the testing data.
            test_batches = paddle.batch(
                generator.train_reader(testing_files),
                batch_size=conf.batch_size)
            result = sgd_trainer.test(reader=test_batches, feeding=feeding)
            print("Test %d, Cost %f, Eval %s" %
                  (event.pass_id, result.cost, result.metrics))
            with gzip.open(
                    os.path.join(model_save_dir,
                                 "params_pass_%05d.tar.gz" % event.pass_id),
                    "w") as f:
                sgd_trainer.save_parameter_to_tar(f)

    # Shuffled, batched reader over the training data.
    train_batches = paddle.batch(
        paddle.reader.shuffle(generator.train_reader(training_files),
                              buf_size=conf.buf_size),
        batch_size=conf.batch_size)
    sgd_trainer.train(reader=train_batches,
                      feeding=feeding,
                      event_handler=event_handler,
                      num_passes=conf.num_passes)
# Example 3 (score: 0)
def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path,
          model_save_dir):
    """Train a text classification model, falling back to the imdb dataset.

    :params train_data_dir: The directory of training data, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type train_data_dir: str|None
    :params test_data_dir: The directory of testing data; when custom data is
        used and this is None, no testing is performed after each pass
    :type test_data_dir: str|None
    :params word_dict_path: The path of word dictionary; it is built
        automatically from the training data when the file does not exist
    :type word_dict_path: str
    :params label_dict_path: The path of label dictionary; it is built
        automatically from the training data when the file does not exist
    :type label_dict_path: str
    :params model_save_dir: dir where models saved
    :type model_save_dir: str
    """
    if train_data_dir is not None:
        assert word_dict_path and label_dict_path, (
            "The parameter train_data_dir, word_dict_path, label_dict_path "
            "should be set at the same time.")

    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    # No custom training data means the bundled imdb dataset is used.
    use_default_data = (train_data_dir is None)

    if use_default_data:
        logger.info(("No training data are porivided, "
                     "use imdb to train the model."))
        logger.info("Please wait to build the word dictionary ...")

        word_dict = reader.imdb_word_dict()
        train_reader = paddle.batch(paddle.reader.shuffle(
            lambda: reader.imdb_train(word_dict), buf_size=1000),
                                    batch_size=100)
        test_reader = paddle.batch(lambda: reader.imdb_test(word_dict),
                                   batch_size=100)
        # imdb is a binary sentiment classification task.
        class_num = 2
    else:
        if word_dict_path is None or not os.path.exists(word_dict_path):
            logger.info(("Word dictionary is not given, the dictionary "
                         "is automatically built from the training data."))

            # build the word dictionary to map the original string-typed
            # words into integer-typed index
            build_word_dict(data_dir=train_data_dir,
                            save_path=word_dict_path,
                            use_col=1,
                            cutoff_fre=0)

        if not os.path.exists(label_dict_path):
            logger.info(("Label dictionary is not given, the dictionary "
                         "is automatically built from the training data."))
            # build the label dictionary to map the original string-typed
            # label into integer-typed index
            build_label_dict(data_dir=train_data_dir,
                             save_path=label_dict_path,
                             use_col=0)

        word_dict = load_dict(word_dict_path)
        label_dict = load_dict(label_dict_path)

        class_num = len(label_dict)
        logger.info("Class number is : %d." % class_num)

        train_reader = paddle.batch(paddle.reader.shuffle(
            reader.train_reader(train_data_dir, word_dict, label_dict),
            buf_size=conf.buf_size),
                                    batch_size=conf.batch_size)

        if test_data_dir is not None:
            # here, because training and testing data share a same format,
            # we still use the reader.train_reader to read the testing data.
            test_reader = paddle.batch(paddle.reader.shuffle(
                reader.train_reader(test_data_dir, word_dict, label_dict),
                buf_size=conf.buf_size),
                                       batch_size=conf.batch_size)
        else:
            test_reader = None

    dict_dim = len(word_dict)

    logger.info("Length of word dictionary is : %d." % (dict_dim))

    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)

    # create optimizer
    adam_optimizer = paddle.optimizer.Adam(
        learning_rate=conf.learning_rate,
        regularization=paddle.optimizer.L2Regularization(
            rate=conf.l2_learning_rate),
        model_average=paddle.optimizer.ModelAverage(
            average_window=conf.average_window))

    # define network topology.
    cost, prob, label = nested_net(dict_dim, class_num, is_infer=False)

    # create all the trainable parameters.
    parameters = paddle.parameters.create(cost)

    # create the trainer instance; AUC over the predicted probabilities is
    # tracked as an extra evaluation layer.
    trainer = paddle.trainer.SGD(cost=cost,
                                 extra_layers=paddle.evaluator.auc(
                                     input=prob, label=label),
                                 parameters=parameters,
                                 update_equation=adam_optimizer)

    # feeding dictionary: maps data layer names to reader columns
    feeding = {"word": 0, "label": 1}

    def _event_handler(event):
        """
        Define the end batch and the end pass event handler.
        """
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % conf.log_period == 0:
                logger.info(
                    "Pass %d, Batch %d, Cost %f, %s\n" %
                    (event.pass_id, event.batch_id, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
                result = trainer.test(reader=test_reader, feeding=feeding)
                logger.info("Test at Pass %d, %s \n" %
                            (event.pass_id, result.metrics))
            # save one compressed parameter archive per pass
            with gzip.open(
                    os.path.join(model_save_dir,
                                 "params_pass_%05d.tar.gz" % event.pass_id),
                    "w") as f:
                trainer.save_parameter_to_tar(f)

    # begin training network
    trainer.train(reader=train_reader,
                  event_handler=_event_handler,
                  feeding=feeding,
                  num_passes=conf.num_passes)

    logger.info("Training has finished.")
# Example 4 (score: 0)
def train(train_file_list_path, test_file_list_path, label_dict_path,
          model_save_dir):
    """Train the model; test after every pass and overwrite one checkpoint.

    :param train_file_list_path: path of the file listing training samples
    :param test_file_list_path: path of the file listing testing samples
    :param label_dict_path: path of the label dictionary; built automatically
        from the training data when the file does not exist
    :param model_save_dir: directory where the parameter archive is written
    """
    # Make sure the checkpoint directory exists.
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)
    # File lists for training and testing.
    training_files = get_file_list(train_file_list_path)
    testing_files = get_file_list(test_file_list_path)
    # Build the label dictionary from the training data when it is absent.
    if not os.path.exists(label_dict_path):
        print(("Label dictionary is not given, the dictionary "
               "is automatically built from the training data."))
        build_label_dict(training_files, label_dict_path)
    # Label dictionary and the reader built on top of it.
    label_map = load_dict(label_dict_path)
    generator = DataGenerator(char_dict=label_map,
                              image_shape=conf.image_shape)
    # Initialize PaddlePaddle.
    paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
    # Optimizer, network topology and trainable parameters.
    momentum_opt = paddle.optimizer.Momentum(momentum=conf.momentum)
    net = Model(len(label_map), conf.image_shape, is_infer=False)
    net_params = paddle.parameters.create(net.cost)

    sgd_trainer = paddle.trainer.SGD(cost=net.cost,
                                     parameters=net_params,
                                     update_equation=momentum_opt,
                                     extra_layers=net.eval)
    # Mapping from data layers to reader columns.
    feeding = {'image': 0, 'label': 1}

    def event_handler(event):
        """Log cost periodically; test and save parameters at end of pass."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % conf.log_period == 0:
                print("Pass %d, batch %d, Samples %d, Cost %f, Eval %s" %
                      (event.pass_id, event.batch_id, event.batch_id *
                       conf.batch_size, event.cost, event.metrics))

        if isinstance(event, paddle.event.EndPass):
            # Training and testing data share one format, so the training
            # reader is reused to read the testing data.
            test_batches = paddle.batch(
                generator.train_reader(testing_files),
                batch_size=conf.batch_size)
            result = sgd_trainer.test(reader=test_batches, feeding=feeding)
            print("Test %d, Cost %f, Eval %s" %
                  (event.pass_id, result.cost, result.metrics))
            # NOTE: the same archive is overwritten every pass, so only the
            # most recent parameters are kept on disk.
            with gzip.open(os.path.join(model_save_dir, "params_pass.tar.gz"),
                           "w") as f:
                sgd_trainer.save_parameter_to_tar(f)

    # Shuffled, batched reader over the training data.
    train_batches = paddle.batch(
        paddle.reader.shuffle(generator.train_reader(training_files),
                              buf_size=conf.buf_size),
        batch_size=conf.batch_size)
    sgd_trainer.train(reader=train_batches,
                      feeding=feeding,
                      event_handler=event_handler,
                      num_passes=conf.num_passes)