def infer(self, model_path, data_path, output):
    test_reader = paddle.batch(
        paddle.reader.buffered(
            reader.create_reader(data_path, self.settings),
            size=self.conf.batch_size * 1000),
        batch_size=self.conf.batch_size)
    # load the trained model
    parameters = paddle.parameters.Parameters.from_tar(
        utils.open_file(model_path, "r"))
    inferer = paddle.inference.Inference(
        output_layer=self.tags_layer, parameters=parameters)

    def count_evi_ids(test_batch):
        """Count the total number of evidence tokens in a batch."""
        num = 0
        for sample in test_batch:
            num += len(sample[reader.E_IDS])
        return num

    for test_batch in test_reader():
        tags = inferer.infer(
            input=test_batch, field=["id"], feeding=network.feeding)
        # the model emits exactly one predicted tag per evidence token
        evi_ids_num = count_evi_ids(test_batch)
        assert len(tags) == evi_ids_num
        print >> output, ";\n".join(str(tag) for tag in tags) + ";"
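
# A minimal consumer sketch (not part of the original code): infer() above
# writes one integer tag id per line, each terminated by ";". `load_tags`
# is a hypothetical helper name for reading that output back.
def load_tags(path):
    tags = []
    with utils.open_file(path, "r") as f:
        for line in f:
            line = line.strip()
            if line:
                tags.append(int(line.rstrip(";")))
    return tags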
def runTest(self):
    for keep_first_b in [True, False]:
        for label_schema in ["BIO", "BIO2"]:
            settings = reader.Settings(
                vocab=Vocab().data,
                is_training=True,
                label_schema=label_schema,
                negative_sample_ratio=0.2,
                hit_ans_negative_sample_ratio=0.25,
                keep_first_b=keep_first_b)
            filename = os.path.join(topdir, "test", "trn_data.gz")
            data_stream = reader.create_reader(filename, settings)
            total, at_least_one, one = 1000, 0, 0
            for _, d in itertools.izip(xrange(total), data_stream()):
                labels = d[reader.LABELS]
                b_num = labels.count(0)  # label id 0 is the B tag
                if b_num >= 1:
                    at_least_one += 1
                if b_num == 1:
                    one += 1
            # negative sampling must leave some samples without any B tag
            self.assertLess(at_least_one, total)
            if keep_first_b:
                self.assertEqual(one, at_least_one)
            else:
                self.assertLess(one, at_least_one)
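
# Hedged illustration of the invariant asserted above (inferred from the
# assertions, not from reader internals): with keep_first_b=True every
# positive sample keeps exactly one B label, so "at least one B" and
# "exactly one B" coincide; with keep_first_b=False some samples retain
# multiple B tags. `b_stats` is a hypothetical helper restating the counts.
def b_stats(labels_list):
    at_least_one = sum(1 for labels in labels_list if labels.count(0) >= 1)
    exactly_one = sum(1 for labels in labels_list if labels.count(0) == 1)
    return at_least_one, exactly_one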
def runTest(self):
    settings = reader.Settings(
        vocab=Vocab().data,
        is_training=True,
        label_schema="BIO2",
        negative_sample_ratio=0.2,
        hit_ans_negative_sample_ratio=0.25,
        keep_first_b=True)
    filename = os.path.join(topdir, "test", "trn_data.gz")
    data_stream = reader.create_reader(filename, settings)
    q_uniq_ids, e_uniq_ids = set(), set()
    for _, d in itertools.izip(xrange(1000), data_stream()):
        q_uniq_ids.update(d[reader.Q_IDS])
        e_uniq_ids.update(d[reader.E_IDS])
    self.assertGreater(len(q_uniq_ids), 50)
    self.assertGreater(len(e_uniq_ids), 50)
def check_ratio(self, negative_sample_ratio):
    for keep_first_b in [True, False]:
        settings = reader.Settings(
            vocab=Vocab().data,
            is_training=True,
            label_schema="BIO2",
            negative_sample_ratio=negative_sample_ratio,
            hit_ans_negative_sample_ratio=0.25,
            keep_first_b=keep_first_b)
        filename = os.path.join(topdir, "test", "trn_data.gz")
        data_stream = reader.create_reader(filename, settings)
        total, negative_num = 5000, 0
        for _, d in itertools.izip(xrange(total), data_stream()):
            labels = d[reader.LABELS]
            if labels.count(0) == 0:  # no B tag means a negative sample
                negative_num += 1
        ratio = negative_num / float(total)
        self.assertLessEqual(math.fabs(ratio - negative_sample_ratio), 0.01)
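
# A minimal sketch of how check_ratio might be driven from the test case's
# runTest; the specific ratios exercised by the original suite are an
# assumption.
def runTest(self):
    for negative_sample_ratio in [0.2, 0.5]:
        self.check_ratio(negative_sample_ratio)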
def train(conf):
    if not os.path.exists(conf.model_save_dir):
        os.makedirs(conf.model_save_dir, mode=0755)

    settings = reader.Settings(
        vocab=conf.vocab,
        is_training=True,
        label_schema=conf.label_schema,
        negative_sample_ratio=conf.negative_sample_ratio,
        hit_ans_negative_sample_ratio=conf.hit_ans_negative_sample_ratio,
        keep_first_b=conf.keep_first_b,
        seed=conf.seed)
    samples_per_pass = conf.batch_size * conf.batches_per_pass
    train_reader = paddle.batch(
        paddle.reader.buffered(
            reader.create_reader(conf.train_data_path, settings,
                                 samples_per_pass),
            size=samples_per_pass),
        batch_size=conf.batch_size)

    # TODO(lipeng17) v2 API does not support parallel_nn yet. Therefore, we
    # can only use CPU currently
    paddle.init(
        use_gpu=conf.use_gpu,
        trainer_count=conf.trainer_count,
        seed=conf.paddle_seed)

    # network config
    cost = network.training_net(conf)

    # create parameters
    # NOTE: parameter values are not initialized here; therefore, we need to
    # print parameter initialization info at the beginning of the first batch
    parameters = paddle.parameters.create(cost)

    # create optimizer
    rmsprop_optimizer = paddle.optimizer.RMSProp(
        learning_rate=conf.learning_rate,
        rho=conf.rho,
        epsilon=conf.epsilon,
        model_average=paddle.optimizer.ModelAverage(
            average_window=conf.average_window,
            max_average_window=conf.max_average_window))

    # create trainer
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=rmsprop_optimizer)

    # begin training network
    def _event_handler(event):
        """Define end batch and end pass event handler."""
        if isinstance(event, paddle.event.EndIteration):
            sys.stderr.write(".")
            batch_num = event.batch_id + 1
            total_batch = conf.batches_per_pass * event.pass_id + batch_num
            if batch_num % conf.log_period == 0:
                sys.stderr.write("\n")
                logger.info("Total batch=%d Batch=%d CurrentCost=%f Eval: %s"
                            % (total_batch, batch_num, event.cost,
                               event.metrics))
            if batch_num % conf.show_parameter_status_period == 0:
                show_parameter_status(parameters)
        elif isinstance(event, paddle.event.EndPass):
            save_model(trainer, conf.model_save_dir, parameters, event.pass_id)
        elif isinstance(event, paddle.event.BeginIteration):
            if event.batch_id == 0 and event.pass_id == 0:
                show_parameter_init_info(parameters)

    ## for debugging purposes
    #with utils.open_file("config", "w") as config:
    #    print >> config, paddle.layer.parse_network(cost)

    trainer.train(
        reader=train_reader,
        event_handler=_event_handler,
        feeding=network.feeding,
        num_passes=conf.num_passes)

    logger.info("Training has finished.")
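
# A minimal sketch (an assumption; the real helper is not shown in this
# excerpt) of the save_model function invoked from the EndPass branch above.
# The filename pattern is illustrative; `parameters` is kept only for
# signature parity with the call site.
def save_model(trainer, model_save_dir, parameters, pass_id):
    path = os.path.join(model_save_dir, "params_pass_%05d.tar.gz" % pass_id)
    logger.info("saving parameters to %s" % path)
    with utils.open_file(path, "w") as f:
        trainer.save_parameter_to_tar(f)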