Example #1
0
def check_all_trainers_ready(ready_path, epoch):
    """Block until every trainer has published its ready marker on HDFS.

    The calling trainer writes an empty local file named
    ``ready.<epoch>.<trainer_id>.done``, uploads it into ``ready_path`` and
    then polls the directory listing every 10 seconds until the number of
    entries is a multiple of the trainer count.

    Args:
        ready_path: HDFS directory that collects the marker files. It is
            created when it does not exist yet.
        epoch: Epoch identifier embedded in the marker file name.

    Note:
        HDFS connection settings are read from the ``HADOOP_HOME``,
        ``FS_NAME`` and ``FS_UGI`` environment variables. The local marker
        file is left in the current working directory.
    """
    trainer_num = fleet.worker_num()
    trainer_id = fleet.worker_index()

    hadoop_home = os.getenv("HADOOP_HOME")
    configs = {
        "fs.default.name": os.getenv("FS_NAME"),
        "hadoop.job.ugi": os.getenv("FS_UGI")
    }

    node_ready = "ready.{}.{}.done".format(epoch, trainer_id)

    # The marker carries no payload; only its presence on HDFS matters.
    with open(node_ready, "w") as node:
        node.write("")

    client = HDFSClient(hadoop_home, configs)
    if not client.is_dir(ready_path):
        client.makedirs(ready_path)
    client.upload(
        hdfs_path=ready_path,
        local_path=node_ready,
        overwrite=True,
        retry_times=0)

    print("PUT {} ON HDFS {} OK".format(node_ready, ready_path))

    while True:
        # One listing per poll; the count is compared modulo trainer_num,
        # presumably so markers of earlier epochs may stay in the directory.
        ready_num = len(client.ls(ready_path))
        # Outer modulo makes the report 0 (not trainer_num) once all arrived.
        pending = (trainer_num - ready_num % trainer_num) % trainer_num
        print("have {} trainers need to be ready".format(pending))
        if pending == 0:
            break
        time.sleep(10)

    print("All trainers are ready, continue training")
Example #2
0
from paddle.fluid.incubate.fleet.base import role_maker

# Declare the two feed variables, named "x" and "y", that are passed to mlp.
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')

# mlp is defined outside this snippet; its result is the variable minimized
# below and fetched by name in the training loop (presumably the loss).
cost = mlp(input_x, input_y)
optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)

# Initialise fleet with a PaddleCloudRoleMaker before wrapping the optimizer.
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

# Rebind `optimizer` to the fleet-distributed wrapper and minimise the cost.
optimizer = fleet.distributed_optimizer(optimizer)
optimizer.minimize(cost)

# The same script runs on every node; the role decides which branch executes.
if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    fleet.init_worker()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    # Run the startup program once before the training steps.
    exe.run(fluid.default_startup_program())
    step = 1001
    for i in range(step):
        # gen_data is defined outside this snippet and supplies the feed.
        # Only the cost is fetched, so cost_val has a single entry.
        cost_val = exe.run(program=fleet.main_program,
                           feed=gen_data(),
                           fetch_list=[cost.name])
        print("worker_index: %d, step%d cost = %f" %
              (fleet.worker_index(), i, cost_val[0]))
    fleet.stop_worker()
Example #3
0
    def train_and_eval(self):
        """Run the training loop with periodic logging, evaluation and saving.

        On a fleet server node the method returns immediately. Otherwise the
        train reader is started and training steps run until the reader raises
        ``fluid.core.EOFException`` or ``steps`` reaches the optional
        ``steps_for_test`` parameter. Inside the loop:

        * every ``train_log_step`` steps the metrics are fetched, handed to
          ``self.model_class.get_metrics`` and optionally sent to VisualDL;
        * every ``eval_step`` steps the dev/test readers are evaluated when
          ``is_eval_dev`` / ``is_eval_test`` are set;
        * every ``save_model_step`` steps trainer 0 saves the models.

        After the loop a final evaluation runs and the models are saved.

        :return: None
        :raises Exception: any error raised while training is logged, the
            models are saved, and the error is re-raised unchanged.
        """
        if self.is_fleet and fleet.is_server():
            logging.debug("is fleet.server, over")
            return
        if self.is_fleet:
            logging.debug("worker_index%d start train...." % fleet.worker_index())

        num_train_examples = self.params.get("num_train_examples", 0)
        if num_train_examples == 0:
            num_train_examples = self.data_set_reader.train_reader.get_num_examples()

        self.data_set_reader.train_reader.run()
        steps = 1
        time_begin = time.time()

        # A missing or empty "output_path" falls back to ./output/.
        output_path = self.params.get("output_path")
        if output_path:
            save_checkpoints_path = os.path.join(output_path, "save_checkpoints")
            save_inference_model_path = os.path.join(output_path, "save_inference_model")
        else:
            save_checkpoints_path = "./output/save_checkpoints/"
            save_inference_model_path = "./output/save_inference_model/"

        try:
            while True:
                try:
                    if steps % self.params["train_log_step"] != 0:
                        # Ordinary step: train without fetching metrics.
                        self.run(InstanceName.TRAINING, need_fetch=False)
                    else:
                        # Logging step: fetch metric tensors and report progress.
                        metrics_tensor_value = self.run(InstanceName.TRAINING, need_fetch=True)
                        current_example, current_epoch = self.data_set_reader.train_reader.get_train_progress()
                        logging.info("epoch {0} progress {1}/{2} pyreader queue size {3}".
                                     format(current_epoch, current_example, num_train_examples,
                                            self.data_set_reader.train_reader.paddle_py_reader.queue.size()))

                        fetch_output_dict = collections.OrderedDict()
                        for key, value in zip(self.fetch_list_train_key, metrics_tensor_value):
                            fetch_output_dict[key] = value
                        time_end = time.time()
                        used_time = time_end - time_begin
                        meta_info = collections.OrderedDict()
                        meta_info[InstanceName.STEP] = steps
                        meta_info[InstanceName.GPU_ID] = self.gpu_id
                        meta_info[InstanceName.TIME_COST] = used_time

                        metrics_output = self.model_class.get_metrics(fetch_output_dict, meta_info,
                                                                      InstanceName.TRAINING)
                        if self.params.get("visualdl_log", False):
                            # Qualified name, consistent with the OrderedDicts built above.
                            assert isinstance(metrics_output, collections.OrderedDict), \
                                "metrics_output is must be OrderedDict"
                            self.visualdl_log(metrics_output, np.mean(fetch_output_dict[InstanceName.LOSS]), steps,
                                              phase=InstanceName.TRAINING)
                        # Restart the timer so TIME_COST covers one logging interval.
                        time_begin = time.time()

                    if steps % self.params["eval_step"] == 0:
                        if self.params["is_eval_dev"]:
                            self.evaluate(self.data_set_reader.dev_reader, InstanceName.EVALUATE, steps)
                        if self.params["is_eval_test"]:
                            self.evaluate(self.data_set_reader.test_reader, InstanceName.TEST, steps)
                    # Periodic saving inside the loop is done by trainer 0 only.
                    if self.trainer_id == 0:
                        if steps % self.params["save_model_step"] == 0:
                            self.save_models(save_checkpoints_path, save_inference_model_path, steps)
                    steps += 1
                    if "steps_for_test" in self.params and steps >= self.params["steps_for_test"]:
                        self.data_set_reader.train_reader.stop()
                        logging.debug("steps_for_test stop!")
                        break
                except fluid.core.EOFException:
                    # The reader is exhausted: stop it and leave the loop.
                    self.data_set_reader.train_reader.stop()
                    break
            if self.params["is_eval_dev"]:
                logging.info("Final evaluate result: ")
                self.evaluate(self.data_set_reader.dev_reader, InstanceName.EVALUATE, steps)
            if self.params["is_eval_test"]:
                logging.info("Final test result: ")
                self.evaluate(self.data_set_reader.test_reader, InstanceName.TEST, steps)
        except Exception:
            logging.error('traceback.format_exc():%s' % traceback.format_exc())
            # Save whatever has been trained so far before propagating.
            self.save_models(save_checkpoints_path, save_inference_model_path, steps)
            # Bare raise keeps the original traceback intact.
            raise

        self.save_models(save_checkpoints_path, save_inference_model_path, steps)
Example #4
0
#http_ip_port="127.0.0.1:26001")

#role = role_maker.PaddleCloudRoleMaker(http_ip_port="127.0.0.1:26001")

#role = role_maker.GeneralRoleMaker(path="./tmp4")
# `logger` and `role_maker` are defined outside this snippet.
logger.info("Begin")
res = [0, 0]

logger.info(res)

# Build the role maker with a local path and initialise fleet with it.
role = role_maker.PaddleCloudRoleMaker(path="./tmp4")

fleet.init(role)
print("init wancheng")  # "wancheng" is pinyin for "finished"
#if fleet.is_worker():
#    import time
#    time.sleep(3)

# Per-worker inputs for the all-reduce calls below; `res` is rebound here but
# is not read again in the visible code.
a = [5]
b = [2]
res = [0]
# Worker 0 contributes `a` and worker 1 contributes `b`.
# NOTE(review): the return value of _all_reduce is discarded and workers with
# any other index make no call -- confirm this is intended.
if fleet.worker_index() == 0:
    role._all_reduce(role._node_type_comm, a)
elif fleet.worker_index() == 1:
    role._all_reduce(role._node_type_comm, b)

#logger.info(res)
#print("res ", res)

#role._barrier_all()