Example #1
    def train(self):
        self._init_dataset_maker(False)

        train_graph = tf.Graph()
        with train_graph.as_default():
            train_char_mapping_tensor, train_label_mapping_tensor = \
                DatasetMaker.make_mapping_table_tensor()
            train_dataset = DatasetMaker.make_dataset(
                train_char_mapping_tensor, train_label_mapping_tensor,
                self.train_data, FLAGS.batch_size, "train", 1, 0)
            self.global_step = tf.train.get_or_create_global_step()
            train_iter = tf.data.Iterator.from_structure(
                train_dataset.output_types, train_dataset.output_shapes)
            train_init_op = train_iter.make_initializer(train_dataset)
            train_model = TrainModel(train_iter, FLAGS, self.global_step)
            self.train_summary_op = train_model.merge_train_summary_op

        eval_graph = tf.Graph()
        with eval_graph.as_default():
            eval_char_mapping_tensor, eval_label_mapping_tensor = \
                DatasetMaker.make_mapping_table_tensor()
            valid_dataset = DatasetMaker.make_dataset(
                eval_char_mapping_tensor, eval_label_mapping_tensor,
                self.valid_data, FLAGS.batch_size, "eval", 1, 0)
            tf.logging.info("The part 1/1 Validation dataset is prepared!")
            test_dataset = DatasetMaker.make_dataset(
                eval_char_mapping_tensor, eval_label_mapping_tensor,
                self.test_data, FLAGS.batch_size, "eval", 1, 0)
            tf.logging.info("The part 1/1 Test dataset is prepared!")

            eval_iter = tf.data.Iterator.from_structure(
                valid_dataset.output_types, valid_dataset.output_shapes)
            valid_init_op = eval_iter.make_initializer(valid_dataset)
            test_init_op = eval_iter.make_initializer(test_dataset)
            eval_model = EvalModel(eval_iter, FLAGS)

        train_session = self._create_session(train_graph)
        tf.logging.info("Created model with fresh parameters.")
        print_flags(FLAGS)
        save_flags(FLAGS, os.path.join(self.root_path, "config.pkl"))
        with train_session.graph.as_default():
            train_session.run(tf.global_variables_initializer())
        train_session.run(train_char_mapping_tensor.init)
        #train_session.run(train_gram2_mapping_tensor.init)
        #train_session.run(train_gram3_mapping_tensor.init)
        train_session.run(train_label_mapping_tensor.init)
        train_session.run(train_init_op)

        eval_session = self._create_session(eval_graph)
        eval_session.run(eval_char_mapping_tensor.init)
        #eval_session.run(eval_gram2_mapping_tensor.init)
        #eval_session.run(eval_gram3_mapping_tensor.init)
        eval_session.run(eval_label_mapping_tensor.init)

        tf.logging.info("Start training")
        loss = []
        steps_per_epoch = self.train_data_num // FLAGS.batch_size  # how many batches in an epoch
        for i in range(FLAGS.max_epoch):
            for j in range(steps_per_epoch):
                step, loss_value = train_model.train(train_session)
                loss.append(loss_value)
                if step % FLAGS.check_step == 0:
                    iteration = step // steps_per_epoch + 1
                    tf.logging.info(
                        "iteration:{} step:{}/{}, cross entropy loss:{:>9.6f}".
                        format(iteration, step % steps_per_epoch,
                               steps_per_epoch, np.mean(loss)))
                    loss = []

                if step % FLAGS.eval_step == 0:
                    tf.logging.info(
                        "Evaluate Validation Dataset and Test Dataset in step: {}"
                        .format(step))
                    train_model.saver.save(
                        train_session,
                        os.path.join(self.log_dir, "temp_model.ckpt"))
                    tf.logging.info("Saving model parameters in {}".format(
                        os.path.join(self.log_dir, "temp_model.ckpt")))

                    eval_model.saver.restore(
                        eval_session,
                        os.path.join(self.log_dir, "temp_model.ckpt"))
                    tf.logging.info("Loading model from {}".format(
                        os.path.join(self.log_dir, "temp_model.ckpt")))
                    validation_accuracy = self._eval_performance(
                        eval_session, eval_model, "validation", valid_init_op)
                    test_accuracy = self._eval_performance(
                        eval_session, eval_model, "test", test_init_op)
                    eval_model.save_dev_test_summary(self.summary_writer,
                                                     eval_session,
                                                     validation_accuracy,
                                                     test_accuracy, step)
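
The pattern to note here is tf.data.Iterator.from_structure: a single reinitializable iterator feeds the eval model, and running valid_init_op or test_init_op re-points it at the corresponding dataset without rebuilding the graph. A minimal, self-contained sketch of that pattern (the toy datasets stand in for DatasetMaker.make_dataset):

import tensorflow as tf

valid_dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
test_dataset = tf.data.Dataset.from_tensor_slices([10, 20, 30])

# One iterator, built from the structure shared by both datasets.
eval_iter = tf.data.Iterator.from_structure(valid_dataset.output_types,
                                            valid_dataset.output_shapes)
next_element = eval_iter.get_next()
valid_init_op = eval_iter.make_initializer(valid_dataset)
test_init_op = eval_iter.make_initializer(test_dataset)

with tf.Session() as sess:
    sess.run(valid_init_op)        # iterator now yields validation data
    print(sess.run(next_element))  # -> 1
    sess.run(test_init_op)         # switch to test data, same graph node
    print(sess.run(next_element))  # -> 10

Because the model only ever consumes eval_iter.get_next(), one EvalModel can score both splits.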
Example #2
                                 infer_mode)

checkpoints_path = "model2/checkpoints"

#train_sess.run(initializer)
infer_batch = get_infer_batches(source_inputs, infer_batch_size,
                                vocab_to_int['<PAD>'])
print(infer_batch)

for i in range(epochs):
    for batch_i, batch in enumerate(
            get_batches(source_inputs, target_inputs, target_outputs,
                        batch_size, vocab_to_int['<PAD>'],
                        vocab_to_int['<PAD>'])):
        if batch_i <= 30000:
            current_loss = train_model.train(train_sess, batch)
            print('Epoch %d Batch %d/%d - Training Loss: %f' %
                  (i + 1, batch_i + 1,
                   (len(source_inputs) - 1) // batch_size + 1, current_loss))
            if (batch_i + 1) % infer_step == 0:
                print("in")
                checkpoint_path = train_model.saver.save(train_sess,
                                                         checkpoints_path,
                                                         global_step=(i * 100 +
                                                                      batch_i))
                print("out")
                infer_model.saver.restore(infer_sess, checkpoint_path)
                current_predict = infer_model.infer(infer_sess, infer_batch)
                #print("current_predict: ", current_predict)
                print(''.join([
                    int_to_vocab[idxes[0]]
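
The listing is cut off above, but the visible loop alternates training with periodic inference by handing a checkpoint from train_sess to infer_sess. A hedged sketch of that save/restore hand-off, assuming both graphs define variables under the same names ("w" is illustrative):

import tensorflow as tf

train_graph = tf.Graph()
with train_graph.as_default():
    w = tf.get_variable("w", shape=[], initializer=tf.zeros_initializer())
    train_op = tf.assign_add(w, 1.0)
    init_op = tf.global_variables_initializer()
    train_saver = tf.train.Saver()

infer_graph = tf.Graph()
with infer_graph.as_default():
    w_infer = tf.get_variable("w", shape=[])  # same name -> restorable
    infer_saver = tf.train.Saver()

train_sess = tf.Session(graph=train_graph)
infer_sess = tf.Session(graph=infer_graph)
train_sess.run(init_op)

for step in range(1, 4):
    train_sess.run(train_op)
    ckpt = train_saver.save(train_sess, "/tmp/demo_ckpt", global_step=step)
    infer_saver.restore(infer_sess, ckpt)  # fresh weights for inference
    print(step, infer_sess.run(w_infer))   # -> 1.0, 2.0, 3.0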
Example #3
    def train(self):
        if self.job_name == "ps":
            with tf.device("/cpu:0"):
                self.server.join()
                return
        if not self.is_chief:
            time.sleep(20)
        self._init_dataset_maker(True)
        # GreedyLoadBalancingStrategy requires a load_fn; byte_size_load_fn
        # spreads variables across ps tasks by their size.
        ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
            self.num_ps, tf.contrib.training.byte_size_load_fn)
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device=self.worker_prefix,
                    cluster=self.cluster,
                    ps_strategy=ps_strategy)):
            self.global_step = tf.train.get_or_create_global_step()
            char_mapping_tensor, label_mapping_tensor = \
                DatasetMaker.make_mapping_table_tensor()

            train_dataset = DatasetMaker.make_dataset(
                char_mapping_tensor, label_mapping_tensor, self.train_data,
                FLAGS.batch_size, "train", self.num_worker, self.task_index)
            tf.logging.info(
                "The part {}/{} Training dataset is prepared!".format(
                    self.task_index + 1, self.num_worker))
            train_iter = tf.data.Iterator.from_structure(
                train_dataset.output_types, train_dataset.output_shapes)
            self.train_init_op = train_iter.make_initializer(train_dataset)

            train_model = TrainModel(train_iter, FLAGS, self.global_step)
            self.optimizer = train_model.optimizer
            self.train_summary_op = train_model.merge_train_summary_op

        with self._create_session_wrapper(retries=10) as sess:
            try:
                if self.job_name == "worker":
                    step = 0
                    while not sess.should_stop():
                        global_step_val, loss_value = train_model.train(sess)
                        if (step + 1) % self.check_step == 0:
                            epoch = ((step + 1) *
                                     FLAGS.batch_size) // self.train_data_num
                            tf.logging.info(
                                "Job-{}:Worker-{}-----Local_Step/Global_Step:{}/{}:Loss is {:.4f}"
                                .format(self.job_name, self.task_index, step,
                                        global_step_val, loss_value))
                            tf.logging.info(
                                "Epoch:{}-Processed {}/{} data".format(
                                    epoch, (step + 1) * FLAGS.batch_size %
                                    self.train_data_num, self.train_data_num))
                        step += 1
                elif self.job_name == "chief":
                    print_flags(FLAGS, True)
                    save_flags(FLAGS, os.path.join(self.root_path,
                                                   "config.pkl"), True)
                    tf.logging.info("Waiting for training...")
                    # record top N model's performance
                    while True:
                        time.sleep(5)
                        global_step_val = sess.run(self.global_step)
                        tf.logging.info(
                            "Global step is {}".format(global_step_val))
            except tf.errors.OutOfRangeError as e:
                exc_info = traceback.format_exc()
                msg = 'Out of range error:{}\n{}'.format(e, exc_info)
                tf.logging.warn(msg)
                tf.logging.info('Done training -- step limit reached')
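
This example runs between-graph replication: ps tasks host the variables, workers run the training loop, and tf.train.replica_device_setter decides where each variable lives. A minimal sketch of the surrounding wiring, with an illustrative two-task cluster (the addresses and names are assumptions):

import tensorflow as tf

cluster = tf.train.ClusterSpec({
    "ps": ["localhost:2222"],
    "worker": ["localhost:2223"],
})
job_name, task_index = "worker", 0  # normally parsed from flags
server = tf.train.Server(cluster, job_name=job_name, task_index=task_index)

if job_name == "ps":
    server.join()  # parameter servers only serve variables
else:
    # Variables are placed on ps devices; ops stay on this worker.
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % task_index,
            cluster=cluster)):
        global_step = tf.train.get_or_create_global_step()
        train_op = tf.assign_add(global_step, 1)  # stand-in for the model

The GreedyLoadBalancingStrategy above refines the default round-robin placement by assigning each variable to the least-loaded ps task.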
Example #4
    def train(self):
        if self.job_name == "ps":
            with tf.device("/cpu:0"):
                self.server.join()
                return

        self._init_dataset_maker(False)
        train_init_op = None
        valid_init_op = None
        test_init_op = None
        with tf.device(
                tf.train.replica_device_setter(
                    worker_device=self.worker_prefix, cluster=self.cluster)):
            self.global_step = tf.train.get_or_create_global_step()
            if self.job_name == "worker":
                train_dataset = DatasetMaker.make_dataset(
                    self.train_data, FLAGS.batch_size, "train",
                    self.num_worker, self.task_index)
                tf.logging.info(
                    "The part {}/{} Training dataset is prepared!".format(
                        self.task_index + 1, self.num_worker))
                train_iter = tf.data.Iterator.from_structure(
                    train_dataset.output_types, train_dataset.output_shapes)
                train_init_op = train_iter.make_initializer(train_dataset)

                train_model = TrainModel(train_iter, FLAGS, self.global_step)

            elif self.job_name == "chief":
                # build same train graph to synchronize model parameters
                train_dataset = DatasetMaker.make_dataset(
                    self.train_data, FLAGS.batch_size, "train",
                    self.num_worker, self.task_index)
                train_iter = tf.data.Iterator.from_structure(
                    train_dataset.output_types, train_dataset.output_shapes)
                train_model = TrainModel(train_iter, FLAGS, self.global_step)
                self.train_summary_op = train_model.merge_train_summary_op

                # build test graph of same structure but different name scope
                # restore model from train checkpoint, and avoid its updating during validation
                eval_graph = tf.Graph()
                with eval_graph.as_default():
                    valid_dataset = DatasetMaker.make_dataset(
                        self.valid_data, FLAGS.batch_size, "eval", 1, 0)
                    tf.logging.info(
                        "The part 1/1 Validation dataset is prepared!")
                    test_dataset = DatasetMaker.make_dataset(
                        self.test_data, FLAGS.batch_size, "eval", 1, 0)
                    tf.logging.info("The part 1/1 Test dataset is prepared!")

                    eval_iter = tf.data.Iterator.from_structure(
                        valid_dataset.output_types,
                        valid_dataset.output_shapes)
                    valid_init_op = eval_iter.make_initializer(valid_dataset)
                    test_init_op = eval_iter.make_initializer(test_dataset)
                    eval_model = EvalModel(eval_iter, FLAGS, "eval_graph")

        with self._create_session_wrapper(retries=10) as sess:
            try:
                if self.job_name == "worker":
                    DatasetMaker.init_mapping_table_tensor(sess)
                    sess.run(train_init_op)

                    step = 0
                    while not sess.should_stop():
                        global_step_val, loss_value = train_model.train(sess)
                        if (step + 1) % self.check_step == 0:
                            epoch = (global_step_val *
                                     FLAGS.batch_size) // self.train_data_num
                            tf.logging.info(
                                "Job-{}:Worker-{}-----Epoch:{}-Local_Step/Global_Step:{}/{}:Loss is {:.2f}"
                                .format(self.job_name, self.task_index, epoch,
                                        step, global_step_val, loss_value))
                        step += 1
                elif self.job_name == "chief":
                    tf.logging.info("Created model with fresh parameters.")
                    self._print_flags(FLAGS)
                    sess.run(tf.global_variables_initializer())
                    DatasetMaker.init_mapping_table_tensor(sess)
                    # record top N model's performance
                    while True:
                        time.sleep(2)
                        global_step_val = sess.run(self.global_step)
                        if (global_step_val + 1) % self.eval_step == 0:
                            tf.logging.info(
                                "Evaluate Validation Dataset and Test Dataset in step: {}"
                                .format(global_step_val))
                            train_model.saver.save(
                                sess,
                                self.log_dir,
                                latest_filename="temp",
                                global_step=self.global_step)
                            ckpt = tf.train.get_checkpoint_state(
                                self.log_dir, latest_filename="temp")
                            tf.logging.info(
                                "Saving model parameters in {}".format(
                                    ckpt.model_checkpoint_path))

                            eval_model.saver.restore(
                                sess, ckpt.model_checkpoint_path)
                            tf.logging.info("Loading model from {}".format(
                                ckpt.model_checkpoint_path))
                            validation_accuracy = self._eval_performance(
                                sess, eval_model, "validation", valid_init_op)
                            test_accuracy = self._eval_performance(
                                sess, eval_model, "test", test_init_op)
                            eval_model.save_dev_test_summary(
                                self.summary_writer, sess, validation_accuracy,
                                test_accuracy, global_step_val)
            except tf.errors.OutOfRangeError as e:
                exc_info = traceback.format_exc()
                msg = 'Out of range error:{}\n{}'.format(e, exc_info)
                tf.logging.warn(msg)
                tf.logging.info('Done training -- step limit reached')
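
The while not sess.should_stop() loop suggests _create_session_wrapper wraps a tf.train.MonitoredTrainingSession (an assumption; the wrapper's body is not shown). A single-process sketch of that loop, with a hook providing the stop condition:

import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
train_op = tf.assign_add(global_step, 1)  # stand-in for the real update

hooks = [tf.train.StopAtStepHook(last_step=100)]
with tf.train.MonitoredTrainingSession(checkpoint_dir="/tmp/demo_logs",
                                       hooks=hooks) as sess:
    while not sess.should_stop():
        step_val = sess.run(train_op)

should_stop() turns true once a hook requests a stop or an end-of-input error surfaces, which is why the examples also catch tf.errors.OutOfRangeError around the loop.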
Example #5
class ModelUsage(object):
    def __init__(self, FLAGS):
        self.FLAGS = FLAGS
        self.FLAGS.ckpt_path = os.path.join(FLAGS.root_path, FLAGS.ckpt_path)
        self.FLAGS.summary_path = os.path.join(FLAGS.root_path,
                                               FLAGS.summary_path)
        self.FLAGS.log_path = os.path.join(FLAGS.root_path, FLAGS.log_path)
        self.FLAGS.logfile_path = os.path.join(self.FLAGS.log_path,
                                               "train.log")
        self.FLAGS.map_path = os.path.join(FLAGS.root_path, FLAGS.map_path)
        self.FLAGS.mapfile_path = os.path.join(self.FLAGS.map_path, "maps.pkl")
        self.FLAGS.vocab_path = os.path.join(FLAGS.root_path, FLAGS.vocab_path)
        self.FLAGS.vocabfile_path = os.path.join(FLAGS.vocab_path,
                                                 "vocabulary.csv")
        self.FLAGS.config_path = os.path.join(FLAGS.root_path,
                                              FLAGS.config_path)
        self.FLAGS.configfile_path = os.path.join(self.FLAGS.config_path,
                                                  "config_file")
        self.FLAGS.result_path = os.path.join(FLAGS.root_path,
                                              FLAGS.result_path)
        self.FLAGS.train_file = os.path.join(FLAGS.data_root_path,
                                             FLAGS.train_file)
        self.FLAGS.dev_file = os.path.join(FLAGS.data_root_path,
                                           FLAGS.dev_file)
        self.FLAGS.test_file = os.path.join(FLAGS.data_root_path,
                                            FLAGS.test_file)

    def config(self):
        config = OrderedDict()
        config["char_num"] = len(MappingInfo.char_to_id)
        config["char_dim"] = self.FLAGS.char_dim
        config["hidden_dim"] = self.FLAGS.hidden_dim
        config["rnn_layer_num"] = self.FLAGS.rnn_layer_num
        config["infer_num"] = self.FLAGS.infer_num
        config["batch_size"] = self.FLAGS.batch_size

        config["start_symbol_id"] = MappingInfo.char_to_id['<begin>']
        config["end_symbol_id"] = MappingInfo.char_to_id['</begin>']

        config["clip"] = self.FLAGS.clip
        config["use_train_sampling"] = self.FLAGS.use_train_sampling
        config["train_sample_prob"] = self.FLAGS.train_sample_prob
        config["dropout"] = self.FLAGS.dropout
        config["lr"] = self.FLAGS.lr
        config["zeros"] = self.FLAGS.zeros
        config["lower"] = self.FLAGS.lower

        config["summary_path"] = self.FLAGS.summary_path
        return config

    def infer(self, session, model, logger):
        sentence_list = model.infer(session)
        sentence = u"\n".join([u"".join(s) for s in sentence_list])
        logger.info(sentence)

    def evaluate(self, session, model, name, iter_init_op, logger):
        logger.info("evaluate:{}".format(name))
        session.run(iter_init_op)  # initialize dev or test iterator
        logger.info("iterator is switched to {}".format(name))

        perplexity = model.evaluate(session)
        logger.info("current {} perplexity score:{:>.3f}".format(
            name, perplexity))
        if name == "dev":
            self.train_session.run(
                tf.assign(self.train_model.dev_perplexity, perplexity))
            best_dev_perplexity = self.train_session.run(
                self.train_model.best_dev_perplexity)
            if perplexity < best_dev_perplexity:
                self.train_session.run(
                    tf.assign(self.train_model.best_dev_perplexity,
                              perplexity))
                logger.info(
                    "new best dev perplexity score:{:>.3f}".format(perplexity))
            return (perplexity < best_dev_perplexity, perplexity)
        elif name == "test":
            self.train_session.run(
                tf.assign(self.train_model.test_perplexity, perplexity))
            best_test_perplexity = self.train_session.run(
                self.train_model.best_test_perplexity)
            if perplexity < best_test_perplexity:
                self.train_session.run(
                    tf.assign(self.train_model.best_test_perplexity,
                              perplexity))
                logger.info("new best test perplexity score:{:>.3f}".format(
                    perplexity))
            return (perplexity < best_test_perplexity, perplexity)

    def train(self):
        make_path(self.FLAGS)

        logger = get_logger(self.FLAGS.logfile_path)

        # build char-id mapping
        MappingInfo.char_mapping(self.FLAGS.train_file, self.FLAGS.zeros,
                                 self.FLAGS.lower)
        MappingInfo.save_map(self.FLAGS.mapfile_path,
                             self.FLAGS.vocabfile_path, logger)

        # load config and print it
        if tf.gfile.Exists(self.FLAGS.configfile_path):
            config = load_config(self.FLAGS.configfile_path)
        else:
            config = self.config()
            save_config(config, self.FLAGS.configfile_path)
        print_config(config, logger)

        # calculate sentence num
        logger.info("Calculating sentence num in dataset")
        train_sentence_num = line_num_count(self.FLAGS.train_file)
        dev_sentence_num = line_num_count(self.FLAGS.dev_file)
        test_sentence_num = line_num_count(self.FLAGS.test_file)
        logger.info("{} / {} / {} sentences in train / dev / test.".format(
            train_sentence_num, dev_sentence_num, test_sentence_num))

        self.train_graph = tf.Graph()
        with self.train_graph.as_default():
            table_train = MappingInfo.make_table_tensor()
            # load data sets
            # use generator to avoid memory oversize
            train_dataset = dataset_from_file(self.FLAGS.train_file,
                                              self.FLAGS.zeros,
                                              self.FLAGS.lower,
                                              self.FLAGS.batch_size, None,
                                              table_train)
            logger.info("Train sentence dataset is initialized")
            # build iterator from dataset
            iter_train = tf.data.Iterator.from_structure(
                train_dataset.output_types, train_dataset.output_shapes)
            train_init_op = iter_train.make_initializer(train_dataset)

            self.train_model = TrainModel(config, iter_train)

        self.eval_graph = tf.Graph()
        with self.eval_graph.as_default():
            table_eval = MappingInfo.make_table_tensor()
            dev_dataset = dataset_from_file(self.FLAGS.dev_file,
                                            self.FLAGS.zeros, self.FLAGS.lower,
                                            self.FLAGS.batch_size, 1,
                                            table_eval)
            logger.info("Dev sentence dataset is initialized")
            test_dataset = dataset_from_file(self.FLAGS.test_file,
                                             self.FLAGS.zeros,
                                             self.FLAGS.lower,
                                             self.FLAGS.batch_size, 1,
                                             table_eval)
            logger.info("Test sentence dataset is initialized")

            iter_eval = tf.data.Iterator.from_structure(
                dev_dataset.output_types, dev_dataset.output_shapes)
            dev_init_op = iter_eval.make_initializer(dev_dataset)
            test_init_op = iter_eval.make_initializer(test_dataset)
            eval_model = EvalModel(config, iter_eval)

        self.infer_graph = tf.Graph()
        with self.infer_graph.as_default():
            infer_model = InferModel(config)

        # limit GPU memory
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        #tf_config.log_device_placement = True
        # how many batches in an epoch
        steps_per_epoch = train_sentence_num // config["batch_size"]

        self.train_session = tf.Session(config=tf_config,
                                        graph=self.train_graph)
        eval_session = tf.Session(config=tf_config, graph=self.eval_graph)
        infer_session = tf.Session(config=tf_config, graph=self.infer_graph)

        logger.info("start training")
        create_model(self.train_session, self.train_model,
                     self.FLAGS.ckpt_path, logger)
        self.train_session.run(table_train.init)
        self.train_session.run(train_init_op)

        eval_session.run(table_eval.init)
        loss = []
        lr = config["lr"]
        for i in range(self.FLAGS.max_epoch):
            for j in range(steps_per_epoch):
                step, batch_loss = self.train_model.train(self.train_session)
                loss.append(batch_loss)
                sample_prob = max(
                    0.3, config["train_sample_prob"] -
                    0.2 * step / steps_per_epoch)  # linear decay of sample prob
                self.train_session.run(
                    tf.assign(self.train_model.train_sample_prob, sample_prob))

                if step % self.FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info(
                        "iteration:{} step:{}/{}, NER loss:{:>9.6f}, Training Sample prob is now {:>4.2f}"
                        .format(iteration, step % steps_per_epoch,
                                steps_per_epoch, np.mean(loss), sample_prob))
                    loss = []

                if step % self.FLAGS.steps_eval == 0:
                    save_model(self.train_session, self.train_model,
                               self.FLAGS.ckpt_path, logger)
                    load_model(eval_session, eval_model, self.FLAGS.ckpt_path,
                               logger)
                    best, current_perplexity = self.evaluate(
                        eval_session, eval_model, "dev", dev_init_op, logger)
                    if best:
                        save_model(self.train_session, self.train_model,
                                   self.FLAGS.best_ckpt_path, logger)
                    self.evaluate(eval_session, eval_model, "test",
                                  test_init_op, logger)
                    self.train_model.save_dev_test_summary(self.train_session)

                    load_model(infer_session, infer_model,
                               self.FLAGS.ckpt_path, logger)
                    self.infer(infer_session, infer_model, logger)

            lr = max(0.0001, lr / 1.5)
            self.train_session.run(tf.assign(self.train_model.lr, lr))
            logger.info(
                "Epoch {} is finished, rescale learing rate to {}".format(
                    i, lr))

    def evaluate_line(self):
        config = load_config(self.FLAGS.configfile_path)
        logger = get_logger(self.FLAGS.logfile_path)

        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        with tf.gfile.GFile(self.FLAGS.mapfile_path, "rb") as f:
            char_to_id, id_to_char = pickle.load(f)

        with tf.Session(config=tf_config) as sess:

            x = tf.placeholder(tf.string, shape=[1, N])
            dataset = dataset_from_string
            model = load_model(sess, InferModel, self.FLAGS.ckpt_path, config,
                               logger)

    def run(self):
        if self.FLAGS.train:
            if self.FLAGS.clean:
                clean(self.FLAGS)
            if self.FLAGS.clean_map:
                clean_map(self.FLAGS)
            self.train()
        else:
            self.evaluate_line()
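
A note on evaluate() above: the best dev/test perplexities live in non-trainable variables so they persist in checkpoints and summaries. A sketch of that bookkeeping; unlike the example, it builds the assign op once with a placeholder, since calling tf.assign inside a loop adds a new op to the graph on every evaluation:

import tensorflow as tf

best_dev_perplexity = tf.get_variable(
    "best_dev_perplexity", shape=[], trainable=False,
    initializer=tf.constant_initializer(1e9))
new_ppl = tf.placeholder(tf.float32, shape=[])
update_best = tf.assign(best_dev_perplexity, new_ppl)  # built once, reused

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for ppl in [120.5, 98.2, 101.7]:  # illustrative dev scores
        if ppl < sess.run(best_dev_perplexity):
            sess.run(update_best, feed_dict={new_ppl: ppl})
    print(sess.run(best_dev_perplexity))  # -> 98.2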