def train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        args.embedding_size,
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)

    learning_rate = paddle.optimizer.lr.ExponentialDecay(
        args.base_lr, gamma=0.999)
    optimizer = paddle.optimizer.SGD(learning_rate=learning_rate)
    optimizer.minimize(loss)

    # do local training
    logger.info("run local training")
    main_program = paddle.static.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
               id_frequencys_pow, learning_rate)
def train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        args.embedding_size,
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    # do local training
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, word2vec_reader, py_reader, loss, 0,
               id_frequencys_pow)
def train(args):
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))

    if args.with_shuffle_batch:
        loss, data_loader = skip_gram_word2vec_shuffle_batch(
            word2vec_reader.dict_size,
            args.embedding_size,
            is_sparse=args.is_sparse,
            neg_num=args.nce_num)
        data_loader.set_sample_generator(
            word2vec_reader.train(), batch_size=args.batch_size, drop_last=True)
    else:
        np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
        id_frequencys_pow = np_power / np_power.sum()
        loss, data_loader = skip_gram_word2vec(
            word2vec_reader.dict_size,
            args.embedding_size,
            is_sparse=args.is_sparse,
            neg_num=args.nce_num)
        data_loader.set_batch_generator(
            convert_python_to_tensor(id_frequencys_pow, args.batch_size,
                                     word2vec_reader.train()))

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    # do local training
    logger.info("run local training")
    main_program = fluid.default_main_program()
    train_loop(args, main_program, data_loader, loss, 0)
def train(args):
    if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_dir)
    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
                                            filelist, 0, 1)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
    id_frequencys_pow = np_power / np_power.sum()

    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        args.embedding_size,
        is_sparse=args.is_sparse,
        neg_num=args.nce_num)

    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))

    optimizer.minimize(loss)

    logger.info("run dist training")

    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)

    if args.role == "pserver":
        print("run pserver")
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif args.role == "trainer":
        print("run trainer")
        train_loop(args,
                   t.get_trainer_program(), word2vec_reader, py_reader, loss,
                   args.trainer_id, id_frequencys_pow)
def async_train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_path)
    word2vec_reader = reader.Word2VecReader(
        args.dict_path, args.train_data_path, filelist, 0, 1)

    loss, words = skip_gram_word2vec(
        word2vec_reader.dict_size,
        word2vec_reader.word_frequencys,
        args.embedding_size,
        args.max_code_length,
        args.with_hs,
        args.with_nce,
        is_sparse=args.is_sparse)

    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(args.batch_size)
    dataset.set_use_slots([w.name for w in words])
    dataset.set_pipe_command(
        "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python word2vec_data_gen.py")

    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    optimizer.minimize(loss)

    async_train_loop(args, fluid.default_main_program(), loss, dataset,
                     filelist)
def main(_):
    ps_hosts = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST").split(",")
    worker_hosts = os.getenv("PADDLE_WORKERS_IP_PORT_LIST").split(",")
    role = os.getenv("TRAINING_ROLE")

    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    if role == "PSERVER":
        pserver_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        server = tf.train.Server(cluster, job_name="ps", task_index=pserver_id)
        server.join()
    elif role == "TRAINER":
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        server = tf.train.Server(
            cluster, job_name="worker", task_index=trainer_id)
        is_chief = (trainer_id == 0)
        num_workers = len(worker_hosts)

        device_setter = tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % trainer_id, cluster=cluster)

        with tf.device(device_setter):
            global_step = tf.Variable(0, name="global_step")

            filelist = GetFileList(FLAGS.train_data_dir, num_workers,
                                   trainer_id)
            all_examples = get_example_num(filelist)
            logger.info("train_file_list: %s" % str(filelist))
            logger.info("there are a total of %d files, %d words" %
                        (len(filelist), all_examples))

            word2vec_reader = reader.Word2VecReader(
                FLAGS.dict_path, FLAGS.train_data_dir, filelist, 0, 1)
            logger.info("dict_size: {}".format(word2vec_reader.dict_size))

            examples, labels, loss = skip_gram_word2vec(
                word2vec_reader.dict_size, FLAGS.embedding_size,
                FLAGS.batch_size, np.array(word2vec_reader.id_frequencys),
                FLAGS.num_neg_samples)

            lr = tf.train.exponential_decay(
                learning_rate=FLAGS.learning_rate,
                global_step=global_step,
                decay_steps=100000,
                decay_rate=0.999,
                staircase=True)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)

            hooks = []
            if FLAGS.dist_mode == "sync":
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=num_workers,
                    total_num_replicas=num_workers)
                hooks.append(optimizer.make_session_run_hook(is_chief))

            saver = tf.train.Saver(max_to_keep=None)
            saver_hook = tf.train.CheckpointSaverHook(
                checkpoint_dir=FLAGS.model_output_dir,
                save_steps=FLAGS.save_steps,
                saver=saver)
            hooks.append(saver_hook)

            train_op = optimizer.minimize(loss, global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            inter_op_parallelism_threads=FLAGS.num_threads,
            intra_op_parallelism_threads=FLAGS.num_threads)

        with tf.train.MonitoredTrainingSession(
                master=server.target,
                is_chief=is_chief,
                hooks=hooks,
                config=sess_config) as session:
            train_result = {}
            for epoch in xrange(FLAGS.epochs_to_train):
                start_time = time.time()
                batch_id = 0
                for examples_, labels_ in get_batch(word2vec_reader.train(),
                                                    FLAGS.batch_size):
                    feed_dict = {}
                    feed_dict[examples] = examples_
                    feed_dict[labels] = labels_
                    _, loss_, step_ = session.run(
                        [train_op, loss, global_step], feed_dict=feed_dict)
                    if batch_id % 1000 == 0:
                        logger.info(
                            "Epoch %4d Step %8d local step %8d loss = %6.2f" %
                            (epoch, step_, batch_id, loss_))
                    batch_id += 1
                now = time.time()
                speed = float(all_examples) / float(now - start_time)
                train_result[epoch] = {}
                train_result[epoch]['speed'] = speed
                logger.info("Epoch: {} total time: {} ips: {} word/s".format(
                    epoch, now - start_time, speed))

        if not FLAGS.is_local and trainer_id == 0:
            upload(FLAGS.model_output_dir, 'model')

        log_path = FLAGS.log_dir + '/' + str(trainer_id) + '.log'
        with open(log_path, 'w') as fout:
            fout.write(str(train_result))
        if not FLAGS.is_local:
            upload(log_path, 'log')
def train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_path)

    word2vec_reader = None
    if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
        word2vec_reader = reader.Word2VecReader(
            args.dict_path, args.train_data_path, filelist, 0, 1)
    else:
        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
        trainers = int(os.environ["PADDLE_TRAINERS"])
        word2vec_reader = reader.Word2VecReader(
            args.dict_path, args.train_data_path, filelist, trainer_id,
            trainers)

    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
    loss, py_reader = skip_gram_word2vec(
        word2vec_reader.dict_size,
        word2vec_reader.word_frequencys,
        args.embedding_size,
        args.max_code_length,
        args.with_hs,
        args.with_nce,
        is_sparse=args.is_sparse)

    optimizer = None
    if args.with_Adam:
        optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    else:
        optimizer = fluid.optimizer.SGD(learning_rate=1e-4)

    optimizer.minimize(loss)

    # do local training
    if args.is_local or os.getenv("PADDLE_IS_LOCAL", "1") == "1":
        logger.info("run local training")
        main_program = fluid.default_main_program()

        with open("local.main.proto", "w") as f:
            f.write(str(main_program))

        train_loop(args, main_program, word2vec_reader, py_reader, loss, 0)
    # do distribute training
    else:
        logger.info("run dist training")

        trainer_id = int(os.environ["PADDLE_TRAINER_ID"])
        trainers = int(os.environ["PADDLE_TRAINERS"])
        training_role = os.environ["PADDLE_TRAINING_ROLE"]

        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port

        config = fluid.DistributeTranspilerConfig()
        config.slice_var_up = False

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=True)

        if training_role == "PSERVER":
            logger.info("run pserver")
            prog = t.get_pserver_program(current_endpoint)
            startup = t.get_startup_program(
                current_endpoint, pserver_program=prog)

            with open("pserver.main.proto.{}".format(os.getenv("CUR_PORT")),
                      "w") as f:
                f.write(str(prog))

            exe = fluid.Executor(fluid.CPUPlace())
            exe.run(startup)
            exe.run(prog)
        elif training_role == "TRAINER":
            logger.info("run trainer")
            train_prog = t.get_trainer_program()

            with open("trainer.main.proto.{}".format(trainer_id), "w") as f:
                f.write(str(train_prog))

            train_loop(args, train_prog, word2vec_reader, py_reader, loss,
                       trainer_id)