Beispiel #1
0
    def infer(self, model_path, data_path, output):
        """Run inference on *data_path* using a model loaded from *model_path*.

        Args:
            model_path: path to a tar archive of trained paddle parameters.
            data_path: path to the input data file consumed by the reader.
            output: writable file-like object; each predicted tag id is
                written on its own line, terminated by ";".
        """
        # Pre-buffer batch_size * 1000 samples ahead of batching so the
        # inference loop is not starved by the reader.
        test_reader = paddle.batch(paddle.reader.buffered(
            reader.create_reader(data_path, self.settings),
            size=self.conf.batch_size * 1000),
                                   batch_size=self.conf.batch_size)

        # load the trained models
        parameters = paddle.parameters.Parameters.from_tar(
            utils.open_file(model_path, "r"))
        inferer = paddle.inference.Inference(output_layer=self.tags_layer,
                                             parameters=parameters)

        def count_evi_ids(test_batch):
            # Total number of evidence tokens in the batch.
            # NOTE(review): the assertion below assumes the inference result
            # yields exactly one tag id per evidence token — confirm against
            # the paddle Inference.infer API.
            num = 0
            for sample in test_batch:
                num += len(sample[reader.E_IDS])
            return num

        for test_batch in test_reader():
            # field=["id"] presumably selects the argmax label id for each
            # token rather than the full probability vector — verify.
            tags = inferer.infer(input=test_batch,
                                 field=["id"],
                                 feeding=network.feeding)
            evi_ids_num = count_evi_ids(test_batch)
            assert len(tags) == evi_ids_num
            print >> output, ";\n".join(str(tag) for tag in tags) + ";"
Beispiel #2
0
    def runTest(self):
        """Check the effect of keep_first_b on positive samples.

        When keep_first_b is set, every positive sample should keep exactly
        one B label (label id 0); otherwise some positives retain several.
        """
        for keep_first_b in (True, False):
            for label_schema in ("BIO", "BIO2"):
                settings = reader.Settings(vocab=Vocab().data,
                                           is_training=True,
                                           label_schema=label_schema,
                                           negative_sample_ratio=0.2,
                                           hit_ans_negative_sample_ratio=0.25,
                                           keep_first_b=keep_first_b)

                data_path = os.path.join(topdir, "test", "trn_data.gz")
                stream = reader.create_reader(data_path, settings)
                total = 1000
                at_least_one = one = 0
                # Inspect at most `total` samples from the stream.
                for sample in itertools.islice(stream(), total):
                    num_b = sample[reader.LABELS].count(0)
                    if num_b >= 1:
                        at_least_one += 1
                        if num_b == 1:
                            one += 1

                # Negative sampling guarantees some samples carry no B at all.
                self.assertLess(at_least_one, total)
                if keep_first_b:
                    self.assertEqual(one, at_least_one)
                else:
                    self.assertLess(one, at_least_one)
Beispiel #3
0
    def runTest(self):
        """Verify the sampled stream exposes a reasonable id vocabulary."""
        settings = reader.Settings(vocab=Vocab().data,
                                   is_training=True,
                                   label_schema="BIO2",
                                   negative_sample_ratio=0.2,
                                   hit_ans_negative_sample_ratio=0.25,
                                   keep_first_b=True)

        data_path = os.path.join(topdir, "test", "trn_data.gz")
        stream = reader.create_reader(data_path, settings)
        question_ids = set()
        evidence_ids = set()
        # Collect the distinct token ids seen in the first 1000 samples.
        for sample in itertools.islice(stream(), 1000):
            question_ids |= set(sample[reader.Q_IDS])
            evidence_ids |= set(sample[reader.E_IDS])

        self.assertGreater(len(question_ids), 50)
        self.assertGreater(len(evidence_ids), 50)
Beispiel #4
0
    def check_ratio(self, negative_sample_ratio):
        """Assert the observed share of negative samples tracks the setting.

        A sample is considered negative when its label sequence contains no
        B label (label id 0). The empirical ratio over 5000 samples must be
        within 0.01 of *negative_sample_ratio*.
        """
        for keep_first_b in (True, False):
            settings = reader.Settings(
                vocab=Vocab().data,
                is_training=True,
                label_schema="BIO2",
                negative_sample_ratio=negative_sample_ratio,
                hit_ans_negative_sample_ratio=0.25,
                keep_first_b=keep_first_b)

            data_path = os.path.join(topdir, "test", "trn_data.gz")
            stream = reader.create_reader(data_path, settings)
            total = 5000
            negatives = sum(
                1 for sample in itertools.islice(stream(), total)
                if sample[reader.LABELS].count(0) == 0)

            observed = negatives / float(total)
            self.assertLessEqual(
                math.fabs(observed - negative_sample_ratio), 0.01)
Beispiel #5
0
def train(conf):
    """Train the tagging network described by *conf*.

    Builds the buffered data pipeline, network cost, RMSProp optimizer and
    SGD trainer from the configuration object, then trains for
    conf.num_passes passes, saving parameters at the end of every pass.

    Args:
        conf: configuration object providing data paths, reader sampling
            options, optimizer hyper-parameters and logging periods.
    """
    if not os.path.exists(conf.model_save_dir):
        # 0o755 instead of the Python-2-only literal 0755: identical value,
        # but valid on Python 2.6+ and Python 3 alike (PEP 3127).
        os.makedirs(conf.model_save_dir, mode=0o755)

    settings = reader.Settings(
        vocab=conf.vocab,
        is_training=True,
        label_schema=conf.label_schema,
        negative_sample_ratio=conf.negative_sample_ratio,
        hit_ans_negative_sample_ratio=conf.hit_ans_negative_sample_ratio,
        keep_first_b=conf.keep_first_b,
        seed=conf.seed)
    # Buffer exactly one pass worth of samples ahead of batching.
    samples_per_pass = conf.batch_size * conf.batches_per_pass
    train_reader = paddle.batch(
        paddle.reader.buffered(
            reader.create_reader(conf.train_data_path, settings,
                                 samples_per_pass),
            size=samples_per_pass),
        batch_size=conf.batch_size)

    # TODO(lipeng17) v2 API does not support parallel_nn yet. Therefore, we can
    # only use CPU currently
    paddle.init(
        use_gpu=conf.use_gpu,
        trainer_count=conf.trainer_count,
        seed=conf.paddle_seed)

    # network config
    cost = network.training_net(conf)

    # create parameters
    # NOTE: parameter values are not initilized here, therefore, we need to
    # print parameter initialization info in the beginning of the first batch
    parameters = paddle.parameters.create(cost)

    # create optimizer
    rmsprop_optimizer = paddle.optimizer.RMSProp(
        learning_rate=conf.learning_rate,
        rho=conf.rho,
        epsilon=conf.epsilon,
        model_average=paddle.optimizer.ModelAverage(
            average_window=conf.average_window,
            max_average_window=conf.max_average_window))

    # create trainer
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=rmsprop_optimizer)

    # begin training network
    def _event_handler(event):
        """
        Define end batch and end pass event handler
        """
        if isinstance(event, paddle.event.EndIteration):
            # One dot per batch gives cheap progress feedback on stderr.
            sys.stderr.write(".")
            batch_num = event.batch_id + 1
            total_batch = conf.batches_per_pass * event.pass_id + batch_num
            if batch_num % conf.log_period == 0:
                sys.stderr.write("\n")
                logger.info("Total batch=%d Batch=%d CurrentCost=%f Eval: %s" \
                        % (total_batch, batch_num, event.cost, event.metrics))

            if batch_num % conf.show_parameter_status_period == 0:
                show_parameter_status(parameters)
        elif isinstance(event, paddle.event.EndPass):
            save_model(trainer, conf.model_save_dir, parameters, event.pass_id)
        elif isinstance(event, paddle.event.BeginIteration):
            # Parameters are lazily initialized, so their init info is only
            # available at the very first batch (see NOTE above).
            if event.batch_id == 0 and event.pass_id == 0:
                show_parameter_init_info(parameters)

    ## for debugging purpose
    #with utils.open_file("config", "w") as config:
    #    print >> config, paddle.layer.parse_network(cost)

    trainer.train(
        reader=train_reader,
        event_handler=_event_handler,
        feeding=network.feeding,
        num_passes=conf.num_passes)

    logger.info("Training has finished.")