def evaluate(self, session):
    """Run the evaluation dataset to exhaustion and return collected metrics.

    Repeatedly fetches gold label ids and logits from ``session``,
    converts both to label strings and folds them into a metric dict
    until the dataset iterator signals exhaustion.

    Returns:
        dict: accumulated metrics for the whole evaluation pass.
    """
    metrics = {}
    try:
        while True:
            gold_ids, batch_logits = session.run([self.labels, self.logits])
            pred_ids = self._logits_to_label_ids(batch_logits)
            pred_labels = DatasetMaker.label_ids_to_labels(pred_ids)
            gold_labels = DatasetMaker.label_ids_to_labels(gold_ids)
            metrics = metric_collect(gold_labels, pred_labels, metrics)
    except tf.errors.OutOfRangeError:
        # Iterator exhausted: the evaluation pass is complete.
        return metrics
def _init_dataset_maker(self, load=False):
    """Build or restore the vocabulary mapping, persist it, refresh FLAGS.

    Args:
        load: when True restore the mapping from ``self.map_file``;
            otherwise generate it from the training data.
    """
    if load:
        DatasetMaker.load_mapping(self.map_file)
    else:
        DatasetMaker.generate_mapping(self.train_data)
    # Both branches persist the (possibly regenerated) mapping.
    DatasetMaker.save_mapping(self.map_file, self.vocabulary_file)
    FLAGS.char_num = len(DatasetMaker.char_to_id)
    FLAGS.tag_num = len(DatasetMaker.tag_to_id)
def _init_dataset_maker(self, load=False):
    """Build or restore the vocabulary mapping and refresh FLAGS sizes.

    Only the chief task writes the mapping files so that concurrent
    workers do not race on the same paths.

    Args:
        load: when True restore the mapping from ``self.map_file``;
            otherwise generate it from the training data.
    """
    if load:
        DatasetMaker.load_mapping(self.map_file)
    else:
        DatasetMaker.generate_mapping(self.train_data)
    if self.is_chief:
        DatasetMaker.save_mapping(self.map_file, self.vocabulary_file)
    FLAGS.char_num = len(DatasetMaker.char_to_id)
    # FLAGS.gram2_num = len(DatasetMaker.gram2_to_id)
    # FLAGS.gram3_num = len(DatasetMaker.gram3_to_id)
    FLAGS.label_num = len(DatasetMaker.label_to_id)
def evaluate(self, session):
    """Run the evaluation dataset to exhaustion and return entity metrics.

    Per batch, fetches gold tag ids, logits and sequence lengths from
    ``session``; decodes predictions with argmax ("softmax") or Viterbi
    decoding over the transition matrix ("crf"), then accumulates
    entity-level metrics until the iterator is exhausted.

    Returns:
        dict: accumulated entity metrics for the whole evaluation pass.

    Raises:
        ValueError: if ``self.loss_type`` is neither "softmax" nor "crf".
    """
    metric_dict = {}
    try:
        while True:
            if self.loss_type == "softmax":
                lengths, real_tag_ids, logits = session.run(
                    [self.char_len, self.tags, self.logits])
                predict_tag_ids = self._logits_to_tag_ids(logits)
            elif self.loss_type == "crf":
                real_tag_ids, logits, lengths, trans = session.run(
                    [self.tags, self.logits, self.char_len, self.trans])
                # CRF decoding additionally needs lengths and transitions.
                predict_tag_ids = self._logits_to_tag_ids(
                    logits, lengths, trans)
            else:
                # Fail fast instead of passing predict_tag_ids=None
                # downstream (the original left it None here).
                raise ValueError(
                    "unsupported loss type: {}".format(self.loss_type))
            predict_tags = DatasetMaker.tag_ids_to_tags(predict_tag_ids)
            real_tags = DatasetMaker.tag_ids_to_tags(real_tag_ids)
            metric_dict = entity_metric_collect(real_tags, predict_tags,
                                                lengths, metric_dict)
    except tf.errors.OutOfRangeError:
        # Iterator exhausted: the evaluation pass is complete.
        return metric_dict
def infer(self, session, file_handler):
    """Run inference over the whole dataset, writing predictions out.

    Per batch, fetches example ids and logits, converts logits to label
    strings, and writes ids alongside predicted labels through
    ``file_handler``. Returns normally once the dataset is exhausted.
    """
    try:
        while True:
            data_ids, logits = session.run([self.ids, self.logits])
            predict_label_ids = self._logits_to_label_ids(logits)
            predict_labels = DatasetMaker.label_ids_to_labels(
                predict_label_ids)
            file_handler.write(
                np.concatenate([data_ids, predict_labels], axis=1))
    except tf.errors.OutOfRangeError:
        # End of dataset is the normal termination condition. The original
        # re-raised the exception here, which made the try/except a no-op
        # and forced every caller to catch OutOfRangeError; mirror
        # evaluate() and return instead.
        return
def infer(self): self._init_dataset_maker() char_mapping_tensor, label_mapping_tensor = DatasetMaker.make_mapping_table_tensor( ) infer_dataset = DatasetMaker.make_dataset(char_mapping_tensor, label_mapping_tensor, self.infer_data, 2, "infer", 1, 0) tf.logging.info("The part {}/{} Training dataset is prepared!".format( 1, 1)) train_iter = tf.data.Iterator.from_structure( infer_dataset.output_types, infer_dataset.output_shapes) self.train_init_op = train_iter.make_initializer(infer_dataset) infer_session = self._create_session(None) infer_session.run(char_mapping_tensor.init) infer_session.run(self.train_init_op) tf.saved_model.loader.load(infer_session, ["sentiment-analysis"], self.model_path) graph = tf.get_default_graph() x_origin = graph.get_tensor_by_name("input_1:0") y = graph.get_tensor_by_name("dense_3/Sigmoid:0") x = train_iter.get_next() xx = infer_session.run(x) xx = [line[::-1] for line in xx] print(xx) s = [ 1268, 7, 468, 1, 428, 85, 44, 331, 76, 2, 60, 354, 2, 8, 68, 221, 2, 4281, 270, 89, 667, 748, 249 ] print(infer_session.run(y, {x_origin: xx})) tf.logging.info("Loading model from {}".format(self.model_path)) """with tf.gfile.GFile("file_{}".format(self.task_index), "w") as f_w:
def _init_dataset_maker(self): DatasetMaker.load_mapping(self.map_file) # DatasetMaker.save_mapping(self.map_file, self.vocabulary_file) FLAGS.char_num = len(DatasetMaker.char_to_id) FLAGS.label_num = len(DatasetMaker.label_to_id)
def train(self):
    """Single-machine training loop with periodic checkpoint-based evaluation.

    Builds two separate graphs: a training graph and an evaluation graph
    (validation + test share one reinitializable iterator). Every
    ``FLAGS.eval_step`` steps the training model is checkpointed to disk
    and restored into the eval session, so evaluation never mutates the
    training variables.
    """
    self._init_dataset_maker(False)
    train_graph = tf.Graph()
    with train_graph.as_default():
        train_char_mapping_tensor, train_label_mapping_tensor = DatasetMaker.make_mapping_table_tensor(
        )
        train_dataset = DatasetMaker.make_dataset(
            train_char_mapping_tensor, train_label_mapping_tensor,
            self.train_data, FLAGS.batch_size, "train", 1, 0)
        self.global_step = tf.train.get_or_create_global_step()
        train_iter = tf.data.Iterator.from_structure(
            train_dataset.output_types, train_dataset.output_shapes)
        train_init_op = train_iter.make_initializer(train_dataset)
        train_model = TrainModel(train_iter, FLAGS, self.global_step)
        self.train_summary_op = train_model.merge_train_summary_op
    # Evaluation lives in its own graph so restoring checkpoints cannot
    # interfere with the live training variables.
    eval_graph = tf.Graph()
    with eval_graph.as_default():
        eval_char_mapping_tensor, eval_label_mapping_tensor = DatasetMaker.make_mapping_table_tensor(
        )
        valid_dataset = DatasetMaker.make_dataset(
            eval_char_mapping_tensor, eval_label_mapping_tensor,
            self.valid_data, FLAGS.batch_size, "eval", 1, 0)
        tf.logging.info("The part 1/1 Validation dataset is prepared!")
        test_dataset = DatasetMaker.make_dataset(
            eval_char_mapping_tensor, eval_label_mapping_tensor,
            self.test_data, FLAGS.batch_size, "eval", 1, 0)
        tf.logging.info("The part 1/1 Test dataset is prepared!")
        # One iterator, two initializers: switch between valid/test data.
        eval_iter = tf.data.Iterator.from_structure(
            valid_dataset.output_types, valid_dataset.output_shapes)
        valid_init_op = eval_iter.make_initializer(valid_dataset)
        test_init_op = eval_iter.make_initializer(test_dataset)
        eval_model = EvalModel(eval_iter, FLAGS)
    train_session = self._create_session(train_graph)
    tf.logging.info("Created model with fresh parameters.")
    print_flags(FLAGS)
    save_flags(FLAGS, os.path.join(self.root_path, "config.pkl"))
    with train_session.graph.as_default():
        train_session.run(tf.global_variables_initializer())
        # Lookup-table initializers must run before the dataset iterator.
        train_session.run(train_char_mapping_tensor.init)
        #train_session.run(train_gram2_mapping_tensor.init)
        #train_session.run(train_gram3_mapping_tensor.init)
        train_session.run(train_label_mapping_tensor.init)
        train_session.run(train_init_op)
    eval_session = self._create_session(eval_graph)
    eval_session.run(eval_char_mapping_tensor.init)
    #eval_session.run(eval_gram2_mapping_tensor.init)
    #eval_session.run(eval_gram3_mapping_tensor.init)
    eval_session.run(eval_label_mapping_tensor.init)
    tf.logging.info("Start training")
    loss = []
    steps_per_epoch = self.train_data_num // FLAGS.batch_size  # how many batches in an epoch
    for i in range(FLAGS.max_epoch):
        for j in range(steps_per_epoch):
            step, loss_value = train_model.train(train_session)
            loss.append(loss_value)
            if step % FLAGS.check_step == 0:
                iteration = step // steps_per_epoch + 1
                tf.logging.info(
                    "iteration:{} step:{}/{}, cross entropy loss:{:>9.6f}".
                    format(iteration, step % steps_per_epoch,
                           steps_per_epoch, np.mean(loss)))
                loss = []
            if step % FLAGS.eval_step == 0:
                tf.logging.info(
                    "Evaluate Validation Dataset and Test Dataset in step: {}"
                    .format(step))
                # Hand the current weights to the eval graph via a
                # temporary checkpoint on disk.
                train_model.saver.save(
                    train_session,
                    os.path.join(self.log_dir, "temp_model.ckpt"))
                tf.logging.info("Saving model parameters in {}".format(
                    os.path.join(self.log_dir, "temp_model.ckpt")))
                eval_model.saver.restore(
                    eval_session,
                    os.path.join(self.log_dir, "temp_model.ckpt"))
                tf.logging.info("Loading model from {}".format(
                    os.path.join(self.log_dir, "temp_model.ckpt")))
                validation_accuracy = self._eval_performance(
                    eval_session, eval_model, "validation", valid_init_op)
                test_accuracy = self._eval_performance(
                    eval_session, eval_model, "test", test_init_op)
                eval_model.save_dev_test_summary(self.summary_writer,
                                                 eval_session,
                                                 validation_accuracy,
                                                 test_accuracy, step)
def train(self):
    """Distributed (between-graph replication) training entry point.

    Role dispatch: "ps" tasks block on ``server.join``; "worker" tasks run
    the training loop; the "chief" builds an identical training graph (to
    share parameters), plus a separate evaluation graph, and periodically
    checkpoints/evaluates while workers train.
    """
    if self.job_name == "ps":
        # Parameter servers only serve variables; they never return.
        with tf.device("/cpu:0"):
            self.server.join()
        return
    self._init_dataset_maker(False)
    train_init_op = None
    valid_init_op = None
    test_init_op = None
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=self.worker_prefix, cluster=self.cluster)):
        self.global_step = tf.train.get_or_create_global_step()
        if self.job_name == "worker":
            train_dataset = DatasetMaker.make_dataset(
                self.train_data, FLAGS.batch_size, "train", self.num_worker,
                self.task_index)
            tf.logging.info(
                "The part {}/{} Training dataset is prepared!".format(
                    self.task_index + 1, self.num_worker))
            train_iter = tf.data.Iterator.from_structure(
                train_dataset.output_types, train_dataset.output_shapes)
            train_init_op = train_iter.make_initializer(train_dataset)
            train_model = TrainModel(train_iter, FLAGS, self.global_step)
        elif self.job_name == "chief":
            # Build the same train graph to synchronize model parameters.
            train_dataset = DatasetMaker.make_dataset(
                self.train_data, FLAGS.batch_size, "train", self.num_worker,
                self.task_index)
            train_iter = tf.data.Iterator.from_structure(
                train_dataset.output_types, train_dataset.output_shapes)
            train_model = TrainModel(train_iter, FLAGS, self.global_step)
            self.train_summary_op = train_model.merge_train_summary_op
            # Build a test graph of the same structure but different name
            # scope; restore the model from the train checkpoint so
            # validation never updates the live weights.
            eval_graph = tf.Graph()
            with eval_graph.as_default():
                valid_dataset = DatasetMaker.make_dataset(
                    self.valid_data, FLAGS.batch_size, "eval", 1, 0)
                tf.logging.info(
                    "The part 1/1 Validation dataset is prepared!")
                test_dataset = DatasetMaker.make_dataset(
                    self.test_data, FLAGS.batch_size, "eval", 1, 0)
                tf.logging.info("The part 1/1 Test dataset is prepared!")
                eval_iter = tf.data.Iterator.from_structure(
                    valid_dataset.output_types, valid_dataset.output_shapes)
                valid_init_op = eval_iter.make_initializer(valid_dataset)
                test_init_op = eval_iter.make_initializer(test_dataset)
                eval_model = EvalModel(eval_iter, FLAGS, "eval_graph")
    with self._create_session_wrapper(retries=10) as sess:
        try:
            if self.job_name == "worker":
                DatasetMaker.init_mapping_table_tensor(sess)
                sess.run(train_init_op)
                step = 0
                while not sess.should_stop():
                    global_step_val, loss_value = train_model.train(sess)
                    if (step + 1) % self.check_step == 0:
                        epoch = (global_step_val *
                                 FLAGS.batch_size) // self.train_data_num
                        tf.logging.info(
                            "Job-{}:Worker-{}-----Epoch:{}-Local_Step/Global_Step:{}/{}:Loss is {:.2f}"
                            .format(self.job_name, self.task_index, epoch,
                                    step, global_step_val, loss_value))
                    step += 1
            elif self.job_name == "chief":
                tf.logging.info("Created model with fresh parameters.")
                self._print_flags(FLAGS)
                sess.run(tf.global_variables_initializer())
                DatasetMaker.init_mapping_table_tensor(sess)
                # Poll the global step; record top N model's performance.
                while True:
                    time.sleep(2)
                    global_step_val = sess.run(self.global_step)
                    if (global_step_val + 1) % self.eval_step == 0:
                        tf.logging.info(
                            "Evaluate Validation Dataset and Test Dataset in step: {}"
                            .format(global_step_val))
                        train_model.saver.save(
                            sess,
                            self.log_dir,
                            latest_filename="temp",
                            global_step=self.global_step)
                        ckpt = tf.train.get_checkpoint_state(
                            self.log_dir, latest_filename="temp")
                        tf.logging.info(
                            "Saving model parameters in {}".format(
                                ckpt.model_checkpoint_path))
                        eval_model.saver.restore(sess,
                                                 ckpt.model_checkpoint_path)
                        tf.logging.info("Loading model from {}".format(
                            ckpt.model_checkpoint_path))
                        # BUGFIX: pass the eval_model *instance*, not the
                        # EvalModel class (matches the single-machine train()).
                        validation_accuracy = self._eval_performance(
                            sess, eval_model, "validation", valid_init_op)
                        test_accuracy = self._eval_performance(
                            sess, eval_model, "test", test_init_op)
                        eval_model.save_dev_test_summary(
                            self.summary_writer, sess, validation_accuracy,
                            test_accuracy, global_step_val)
        except tf.errors.OutOfRangeError as e:
            # BUGFIX: format_exc takes an optional `limit` int, not the
            # sys.exc_info() tuple — passing the tuple raised a TypeError.
            exc_info = traceback.format_exc()
            msg = 'Out of range error:{}\n{}'.format(e, exc_info)
            tf.logging.warn(msg)
            tf.logging.info('Done training -- step limit reached')
def train(self):
    """Distributed training entry point (load-balanced parameter servers).

    Role dispatch: "ps" tasks block on ``server.join``; non-chief tasks
    sleep briefly so the chief can write the mapping first; "worker" tasks
    run the training loop; the "chief" persists flags and idles, polling
    the global step.
    """
    if self.job_name == "ps":
        # Parameter servers only serve variables; they never return.
        with tf.device("/cpu:0"):
            self.server.join()
        return
    if not self.is_chief:
        # Give the chief a head start so the mapping files exist
        # before workers try to load them.
        time.sleep(20)
    self._init_dataset_maker(True)
    # Spread variables across parameter servers by byte size rather than
    # round-robin.
    ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
        self.num_ps)
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=self.worker_prefix,
                cluster=self.cluster,
                ps_strategy=ps_strategy)):
        self.global_step = tf.train.get_or_create_global_step()
        char_mapping_tensor, label_mapping_tensor = DatasetMaker.make_mapping_table_tensor(
        )
        train_dataset = DatasetMaker.make_dataset(
            char_mapping_tensor, label_mapping_tensor, self.train_data,
            FLAGS.batch_size, "train", self.num_worker, self.task_index)
        tf.logging.info(
            "The part {}/{} Training dataset is prepared!".format(
                self.task_index + 1, self.num_worker))
        train_iter = tf.data.Iterator.from_structure(
            train_dataset.output_types, train_dataset.output_shapes)
        self.train_init_op = train_iter.make_initializer(train_dataset)
        train_model = TrainModel(train_iter, FLAGS, self.global_step)
        self.optimizer = train_model.optimizer
        self.train_summary_op = train_model.merge_train_summary_op
    with self._create_session_wrapper(retries=10) as sess:
        try:
            if self.job_name == "worker":
                step = 0
                while not sess.should_stop():
                    global_step_val, loss_value = train_model.train(sess)
                    if (step + 1) % self.check_step == 0:
                        epoch = ((step + 1) *
                                 FLAGS.batch_size) // self.train_data_num
                        tf.logging.info(
                            "Job-{}:Worker-{}-----Local_Step/Global_Step:{}/{}:Loss is {:.4f}"
                            .format(self.job_name, self.task_index, step,
                                    global_step_val, loss_value))
                        tf.logging.info(
                            "Epoch:{}-Processed {}/{} data".format(
                                epoch, (step + 1) * FLAGS.batch_size %
                                self.train_data_num, self.train_data_num))
                    step += 1
            elif self.job_name == "chief":
                print_flags(FLAGS, True)
                save_flags(FLAGS,
                           os.path.join(self.root_path, "config.pkl"), True)
                tf.logging.info("Waiting for training...")
                # Record top N model's performance.
                while True:
                    time.sleep(5)
                    global_step_val = sess.run(self.global_step)
                    tf.logging.info(
                        "Global step is {}".format(global_step_val))
        except tf.errors.OutOfRangeError as e:
            # BUGFIX: format_exc takes an optional `limit` int, not the
            # sys.exc_info() tuple — passing the tuple raised a TypeError.
            exc_info = traceback.format_exc()
            msg = 'Out of range error:{}\n{}'.format(e, exc_info)
            tf.logging.warn(msg)
            tf.logging.info('Done training -- step limit reached')