Example #1
def train(args):

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path,
                                            args.train_data_dir, filelist, 0,
                                            1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(word2vec_reader.dict_size,
                                         args.embedding_size,
                                         is_sparse=args.is_sparse,
                                         neg_num=args.nce_num)

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    # do local training
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
               id_frequencys_pow)
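train_loop itself is not part of the snippet. A minimal sketch of a loop compatible with the call above, reusing the module-level logger, os and np, the convert_python_to_tensor helper seen in Example #3, and an assumed args.num_passes flag (the py_reader decoration method and the flag name are assumptions, not the original implementation):

def train_loop(args, main_program, reader, py_reader, loss, trainer_id,
               id_frequencys_pow):
    # feed the py_reader with batched int64 tensors built from the reader
    # (older fluid releases call this decorate_tensor_provider; newer ones
    # renamed it to decorate_batch_generator)
    py_reader.decorate_tensor_provider(
        convert_python_to_tensor(id_frequencys_pow, args.batch_size,
                                 reader.train()))

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    for epoch in range(args.num_passes):
        py_reader.start()
        batch_id = 0
        try:
            while True:
                loss_val = exe.run(main_program, fetch_list=[loss.name])
                if batch_id % 1000 == 0:
                    logger.info("epoch %d batch %d loss %.4f" %
                                (epoch, batch_id, float(np.mean(loss_val))))
                batch_id += 1
        except fluid.core.EOFException:
            # reader exhausted: end of this epoch
            py_reader.reset()
        fluid.io.save_persistables(
            exe, os.path.join(args.model_output_dir, "pass-%d" % epoch),
            main_program)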
Example #2
def train(args):

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path,
                                            args.train_data_dir, filelist, 0,
                                            1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(word2vec_reader.dict_size,
                                         args.embedding_size,
                                         is_sparse=args.is_sparse,
                                         neg_num=args.nce_num)

    learning_rate = paddle.optimizer.lr.ExponentialDecay(args.base_lr,
                                                         gamma=0.999)

    optimizer = paddle.optimizer.SGD(learning_rate=learning_rate)

    optimizer.minimize(loss)

    # do local training
    logger.info("run local training")
    main_program = paddle.static.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
               id_frequencys_pow, learning_rate)
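The only functional difference from Example #1 is the optimizer API: the 1.x fluid.layers.exponential_decay schedule built into the optimizer becomes a standalone paddle.optimizer.lr.ExponentialDecay object. Because 2.x schedulers only decay when step() is called, the scheduler is also handed to train_loop and advanced there. A minimal sketch of that interaction (the per-epoch step granularity is an assumption):

import paddle

scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.01, gamma=0.999)
for epoch in range(3):
    # ... run one epoch of exe.run(main_program, ...) over the py_reader ...
    scheduler.step()                      # lr becomes 0.01 * 0.999 ** (epoch + 1)
    print(epoch, scheduler.get_lr())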
Example #3
def train(args):
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))

    if args.with_shuffle_batch:
        loss, data_loader = skip_gram_word2vec_shuffle_batch(
            word2vec_reader.dict_size,
            args.embedding_size,
            is_sparse=args.is_sparse,
            neg_num=args.nce_num)
        data_loader.set_sample_generator(word2vec_reader.train(), batch_size=args.batch_size, drop_last=True)
    else:
        np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
        id_frequencys_pow = np_power / np_power.sum()

        loss, data_loader = skip_gram_word2vec(
            word2vec_reader.dict_size,
            args.embedding_size,
            is_sparse=args.is_sparse,
            neg_num=args.nce_num)

        data_loader.set_batch_generator(
            convert_python_to_tensor(id_frequencys_pow, args.batch_size, word2vec_reader.train()))

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    # do local training 
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, data_loader, loss, 0)
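convert_python_to_tensor is not shown in the snippet. A sketch of what it plausibly does given how it is called above (the sample layout and the negative-sampling step are assumptions): batch the (centre word, target word) pairs produced by word2vec_reader.train() and draw neg_num negatives per row from the unigram^0.75 distribution id_frequencys_pow.

import numpy as np

def convert_python_to_tensor(id_frequencys_pow, batch_size, sample_reader,
                             neg_num=5):
    def __reader__():
        cdf = np.cumsum(id_frequencys_pow)   # sampling CDF over the vocabulary
        inputs, labels = [], []
        for input_word, target_word in sample_reader():
            inputs.append(input_word)
            labels.append(target_word)
            if len(inputs) == batch_size:
                # negatives drawn by inverse-CDF sampling
                neg = np.searchsorted(cdf, np.random.rand(batch_size, neg_num))
                yield [np.array(inputs, dtype='int64').reshape(-1, 1),
                       np.array(labels, dtype='int64').reshape(-1, 1),
                       neg.astype('int64')]
                inputs, labels = [], []
    return __reader__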
Example #4
def train(args):

    if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        args.embedding_size,
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    logger.info("run dist training")

    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
    if args.role == "pserver":
        print("run psever")
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif args.role == "trainer":
        print("run trainer")
        train_loop(args,
                   t.get_trainer_program(), word2vec_reader, py_reader, loss,
                   args.trainer_id, id_frequencys_pow)
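For illustration only (the real argument parser is not part of the snippet): the distributed flags consumed above map onto the transpiler's ip:port bookkeeping and could be declared roughly like this.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--role', choices=['pserver', 'trainer'],
                    help='role of this process in the transpiled cluster')
parser.add_argument('--endpoints', type=str,
                    help='comma-separated ip:port list of all pservers')
parser.add_argument('--current_endpoint', type=str,
                    help='ip:port of this pserver (pserver role only)')
parser.add_argument('--trainers', type=int,
                    help='total number of trainer processes')
parser.add_argument('--trainer_id', type=int,
                    help='0-based id of this trainer')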
Example #5
def main(_):
    ps_hosts = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST").split(",")
    worker_hosts = os.getenv("PADDLE_WORKERS_IP_PORT_LIST").split(",")
    role = os.getenv("TRAINING_ROLE")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    if role == "PSERVER":
        pserver_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        server = tf.train.Server(cluster, job_name="ps", task_index=pserver_id)
        server.join()
    elif role == "TRAINER":
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        server = tf.train.Server(cluster,
                                 job_name="worker",
                                 task_index=trainer_id)
        is_chief = (trainer_id == 0)
        num_workers = len(worker_hosts)
        device_setter = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % trainer_id, cluster=cluster)
        with tf.device(device_setter):
            global_step = tf.Variable(0, name="global_step", trainable=False)

            filelist = GetFileList(FLAGS.train_data_dir, num_workers,
                                   trainer_id)
            all_examples = get_example_num(filelist)
            logger.info("train_file_list: %s" % str(filelist))
            logger.info("there are a total of %d files, %d words" %
                        (len(filelist), all_examples))
            word2vec_reader = reader.Word2VecReader(FLAGS.dict_path,
                                                    FLAGS.train_data_dir,
                                                    filelist, 0, 1)
            logger.info("dict_size: {}".format(word2vec_reader.dict_size))

            examples, labels, loss = skip_gram_word2vec(
                word2vec_reader.dict_size,
                FLAGS.embedding_size, FLAGS.batch_size,
                np.array(word2vec_reader.id_frequencys), FLAGS.num_neg_samples)
            lr = tf.train.exponential_decay(learning_rate=FLAGS.learning_rate,
                                            global_step=global_step,
                                            decay_steps=100000,
                                            decay_rate=0.999,
                                            staircase=True)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
            hooks = []
            if FLAGS.dist_mode == "sync":
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=num_workers,
                    total_num_replicas=num_workers)
                hooks.append(optimizer.make_session_run_hook(is_chief))
            saver = tf.train.Saver(max_to_keep=None)
            saver_hook = tf.train.CheckpointSaverHook(
                checkpoint_dir=FLAGS.model_output_dir,
                save_steps=FLAGS.save_steps,
                saver=saver)
            hooks.append(saver_hook)
            train_op = optimizer.minimize(loss, global_step=global_step)
            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=False,
                inter_op_parallelism_threads=FLAGS.num_threads,
                intra_op_parallelism_threads=FLAGS.num_threads)
            with tf.train.MonitoredTrainingSession(
                    master=server.target,
                    is_chief=is_chief,
                    hooks=hooks,
                    config=sess_config) as session:
                train_result = {}
                for epoch in range(FLAGS.epochs_to_train):
                    start_time = time.time()
                    batch_id = 0
                    for examples_, labels_ in get_batch(
                            word2vec_reader.train(), FLAGS.batch_size):
                        feed_dict = {}
                        feed_dict[examples] = examples_
                        feed_dict[labels] = labels_
                        _, loss_, step_ = session.run(
                            [train_op, loss, global_step], feed_dict=feed_dict)
                        if batch_id % 1000 == 0:
                            logger.info(
                                "Epoch %4d Step %8d local step %8d loss = %6.2f"
                                % (epoch, step_, batch_id, loss_))
                        batch_id += 1
                    now = time.time()
                    speed = float(all_examples) / float(now - start_time)
                    train_result[epoch] = {}
                    train_result[epoch]['speed'] = speed
                    logger.info(
                        "Epoch: {} total time: {} ips: {} word/s".format(
                            epoch, now - start_time, speed))
                if not FLAGS.is_local and trainer_id == 0:
                    upload(FLAGS.model_output_dir, 'model')
                log_path = FLAGS.log_dir + '/' + str(trainer_id) + '.log'
                with open(log_path, 'w') as fout:
                    fout.write(str(train_result))
                if not FLAGS.is_local:
                    upload(log_path, 'log')
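get_batch is not defined in the snippet. A minimal sketch that matches the feed_dict above, assuming word2vec_reader.train() follows Paddle's reader convention and returns a generator function yielding (centre word, target word) id pairs: collect batch_size pairs and emit int64 numpy arrays, with labels shaped [batch_size, 1].

import numpy as np

def get_batch(sample_reader, batch_size):
    examples, labels = [], []
    for centre_word, target_word in sample_reader():
        examples.append(centre_word)
        labels.append(target_word)
        if len(examples) == batch_size:
            yield (np.array(examples, dtype='int64'),
                   np.array(labels, dtype='int64').reshape(-1, 1))
            examples, labels = [], []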