Example #1
def train(args):

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path,
                                            args.train_data_dir, filelist, 0,
                                            1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(word2vec_reader.dict_size,
                                         args.embedding_size,
                                         is_sparse=args.is_sparse,
                                         neg_num=args.nce_num)

    learning_rate = paddle.optimizer.lr.ExponentialDecay(args.base_lr,
                                                         gamma=0.999)
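    # Paddle 2.x LR schedulers decay the rate each time scheduler.step() is
    # called; the scheduler object is passed to train_loop below, presumably so
    # the loop can step it as training progresses.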

    optimizer = paddle.optimizer.SGD(learning_rate=learning_rate)

    optimizer.minimize(loss)

    # do local training
    logger.info("run local training")
    main_program = paddle.static.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
               id_frequencys_pow, learning_rate)
Example #2
def train(args):

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path,
                                            args.train_data_dir, filelist, 0,
                                            1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(word2vec_reader.dict_size,
                                         args.embedding_size,
                                         is_sparse=args.is_sparse,
                                         neg_num=args.nce_num)

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))
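    # With staircase=True the learning rate is multiplied by decay_rate once
    # every decay_steps steps, i.e. base_lr * 0.999 ** (step // 100000).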

    optimizer.minimize(loss)

    # do local training
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
               id_frequencys_pow)
Example #3
def train(args):
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))

    if args.with_shuffle_batch:
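        # The shuffle-batch variant draws its negative samples inside the
        # network (presumably by shuffling labels within a batch), so no
        # explicit noise distribution is prepared on this path.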
        loss, data_loader = skip_gram_word2vec_shuffle_batch(
            word2vec_reader.dict_size,
            args.embedding_size,
            is_sparse=args.is_sparse,
            neg_num=args.nce_num)
        data_loader.set_sample_generator(
            word2vec_reader.train(), batch_size=args.batch_size, drop_last=True)
    else:
        np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
        id_frequencys_pow = np_power / np_power.sum()

        loss, data_loader = skip_gram_word2vec(
            word2vec_reader.dict_size,
            args.embedding_size,
            is_sparse=args.is_sparse,
            neg_num=args.nce_num)

        data_loader.set_batch_generator(
            convert_python_to_tensor(id_frequencys_pow, args.batch_size,
                                     word2vec_reader.train()))

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    # do local training 
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, data_loader, loss, 0)
Example #4
def train(args):

    if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        args.embedding_size,
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    logger.info("run dist training")

    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
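    # The transpiler splits the single-node program into separate parameter
    # server and trainer programs; which one runs is decided by args.role below.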
    if args.role == "pserver":
        print("run psever")
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif args.role == "trainer":
        print("run trainer")
        train_loop(args,
                   t.get_trainer_program(), word2vec_reader, py_reader, loss,
                   args.trainer_id, id_frequencys_pow)
Example #5
def async_train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)
    filelist = GetFileList(args.train_data_path)
    word2vec_reader = reader.Word2VecReader(
        args.dict_path, args.train_data_path, filelist, 0, 1)
    loss, words = skip_gram_word2vec(
        word2vec_reader.dict_size,
        word2vec_reader.word_frequencys,
        args.embedding_size,
        args.max_code_length,
        args.with_hs,
        args.with_nce,
        is_sparse=args.is_sparse)
    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(args.batch_size)
    dataset.set_use_slots([w.name for w in words])
    dataset.set_pipe_command("/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python word2vec_data_gen.py")
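    # Each raw training file is piped through the command above, which is
    # expected to emit batches in the slot layout declared in data_feed.proto.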
    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    optimizer.minimize(loss)
    async_train_loop(args, fluid.default_main_program(), loss, dataset, filelist)
Example #6
def main(_):
    ps_hosts = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST").split(",")
    worker_hosts = os.getenv("PADDLE_WORKERS_IP_PORT_LIST").split(",")
    role = os.getenv("TRAINING_ROLE")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    if role == "PSERVER":
        pserver_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        server = tf.train.Server(cluster, job_name="ps", task_index=pserver_id)
        server.join()
    elif role == "TRAINER":
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        server = tf.train.Server(cluster,
                                 job_name="worker",
                                 task_index=trainer_id)
        is_chief = (trainer_id == 0)
        num_workers = len(worker_hosts)
        device_setter = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % trainer_id, cluster=cluster)
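        # replica_device_setter places variables on the ps jobs (round-robin)
        # while keeping the compute ops on this worker's device.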
        with tf.device(device_setter):
            global_step = tf.Variable(0, name="global_step")

            filelist = GetFileList(FLAGS.train_data_dir, num_workers,
                                   trainer_id)
            all_examples = get_example_num(filelist)
            logger.info("train_file_list: %s" % str(filelist))
            logger.info("there are a total of %d files, %d words" %
                        (len(filelist), all_examples))
            word2vec_reader = reader.Word2VecReader(FLAGS.dict_path,
                                                    FLAGS.train_data_dir,
                                                    filelist, 0, 1)
            logger.info("dict_size: {}".format(word2vec_reader.dict_size))

            examples, labels, loss = skip_gram_word2vec(
                word2vec_reader.dict_size,
                FLAGS.embedding_size, FLAGS.batch_size,
                np.array(word2vec_reader.id_frequencys), FLAGS.num_neg_samples)
            lr = tf.train.exponential_decay(learning_rate=FLAGS.learning_rate,
                                            global_step=global_step,
                                            decay_steps=100000,
                                            decay_rate=0.999,
                                            staircase=True)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
            hooks = []
            if FLAGS.dist_mode == "sync":
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=num_workers,
                    total_num_replicas=num_workers)
                hooks.append(optimizer.make_session_run_hook(is_chief))
            saver = tf.train.Saver(max_to_keep=None)
            saver_hook = tf.train.CheckpointSaverHook(
                checkpoint_dir=FLAGS.model_output_dir,
                save_steps=FLAGS.save_steps,
                saver=saver)
            hooks.append(saver_hook)
            train_op = optimizer.minimize(loss, global_step=global_step)
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False,
                inter_op_parallelism_threads=FLAGS.num_threads,
                intra_op_parallelism_threads=FLAGS.num_threads)
            with tf.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=is_chief,
                    hooks=hooks,
                    config=sess_config) as session:
                train_result = {}
                for epoch in range(FLAGS.epochs_to_train):
                    start_time = time.time()
                    batch_id = 0
                    for examples_, labels_ in get_batch(
                            word2vec_reader.train(), FLAGS.batch_size):
                        feed_dict = {}
                        feed_dict[examples] = examples_
                        feed_dict[labels] = labels_
                        _, loss_, step_ = session.run(
                            [train_op, loss, global_step], feed_dict=feed_dict)
                        if batch_id % 1000 == 0:
                            logger.info(
                                "Epoch %4d Step %8d local step %8d loss = %6.2f"
                                % (epoch, step_, batch_id, loss_))
                        batch_id += 1
                    now = time.time()
                    speed = float(all_examples) / float(now - start_time)
                    train_result[epoch] = {}
                    train_result[epoch]['speed'] = speed
                    logger.info(
                        "Epoch: {} total time: {} ips: {} word/s".format(
                            epoch, now - start_time, speed))
                if not FLAGS.is_local and trainer_id == 0:
                    upload(FLAGS.model_output_dir, 'model')
                log_path = FLAGS.log_dir + '/' + str(trainer_id) + '.log'
                with open(log_path, 'w') as fout:
                    fout.write(str(train_result))
                if not FLAGS.is_local:
                    upload(log_path, 'log')
Example #7
def train(args):

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_path)
    word2vec_reader = None
    if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
        word2vec_reader = reader.Word2VecReader(
            args.dict_path, args.train_data_path, filelist, 0, 1)
    else:
        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
        trainers = int(os.environ["PADDLE_TRAINERS"])
        word2vec_reader = reader.Word2VecReader(args.dict_path,
                                                args.train_data_path, filelist,
                                                trainer_id, trainers)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        word2vec_reader.word_frequencys,
        args.embedding_size,
        args.max_code_length,
        args.with_hs,
        args.with_nce,
        is_sparse=args.is_sparse)

    optimizer = None
    if args.with_Adam:
        optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    else:
        optimizer = fluid.optimizer.SGD(learning_rate=1e-4)

    optimizer.minimize(loss)

    # do local training 
    if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
        logger.info("run local training")
        main_program = fluid.default_main_program()

        with open("local.main.proto", "w") as f:
            f.write(str(main_program))

        train_loop(args, main_program, word2vec_reader, py_reader, loss, 0)
    # do distribute training
    else:
        logger.info("run dist training")

        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
        trainers = int(os.environ["PADDLE_TRAINERS"])
        training_role = os.environ["PADDLE_TRAINING_ROLE"]

        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port

        config = fluid.DistributeTranspilerConfig()
        config.slice_var_up = False
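        # slice_var_up=False keeps each parameter whole on a single pserver
        # instead of slicing large variables across all pservers.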
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=True)

        if training_role == "PSERVER":
            logger.info("run pserver")
            prog = t.get_pserver_program(current_endpoint)
            startup = t.get_startup_program(
                current_endpoint, pserver_program=prog)

            with open("pserver.main.proto.{}".format(os.getenv("CUR_PORT")),
                      "w") as f:
                f.write(str(prog))

            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif training_role == "TRAINER":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()

            with open("trainer.main.proto.{}".format(trainer_id), "w") as f:
                f.write(str(train_prog))

            train_loop(args, train_prog, word2vec_reader, py_reader, loss,
                       trainer_id)