def test_run(self):
    # Initialize dataset description
    data_feed = fluid.DataFeedDesc('train_data/data.prototxt')
    data_feed.set_batch_size(128)  # See API doc for how to change other fields

    # define network
    # input text data
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    # label data
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = bow_net(data, label)
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)

    # Run startup program
    startup_program = fluid.default_startup_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    main_program = fluid.default_main_program()
    async_executor = fluid.AsyncExecutor(place)

    self.assertRaises(TypeError, async_executor.run)
    self.assertRaises(TypeError, async_executor.run, main_program)
    self.assertRaises(TypeError, async_executor.run, main_program, data_feed)

    filelist = ['train_data/part-%d' % i for i in range(10)]
    self.assertRaises(TypeError, async_executor.run, main_program, data_feed,
                      filelist)

    thread_num = 4
    self.assertRaises(TypeError, async_executor.run, main_program, data_feed,
                      filelist, thread_num)

    async_executor.run(main_program, data_feed, filelist, thread_num, [acc])
    fluid.io.save_inference_model("imdb.model", [data.name, label.name],
                                  [acc], executor)
    statinfo = os.stat('imdb.model/__model__')
    self.assertGreater(statinfo.st_size, 0)

    os.remove('./data.prototxt')
    shutil.rmtree('./train_data')
    shutil.rmtree('./imdb.model')
def train():
    # Download data; the with-block closes the tarfile, so no explicit close()
    with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf:
        tarf.extractall(path='./')

    # Initialize dataset description
    dataset = fluid.DataFeedDesc('train_data/data.prototxt')
    dataset.set_batch_size(128)  # See API doc for how to change other fields
    print(dataset.desc())  # Debug purpose: see what we get

    # define network
    # input text data
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    # label data
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = bow_net(data, label)
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)

    # Run startup program
    startup_program = fluid.default_startup_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    main_program = fluid.default_main_program()
    epochs = 10
    filelist = ["train_data/part-%d" % i for i in range(12)]
    for i in range(epochs):
        thread_num = 4
        async_executor.run(
            main_program,  # This can be changed during iteration
            dataset,  # This can be changed during iteration
            filelist,  # This can be changed during iteration
            thread_num,  # This can be changed during iteration
            [data, acc],  # Multiple fetch targets can be specified
            debug=False)
        fluid.io.save_inference_model('imdb/epoch%d.model' % i,
                                      [data.name, label.name], [acc],
                                      executor)
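# A minimal sketch of the text-format protobuf that the DataFeedDesc files
# loaded in these snippets ('train_data/data.prototxt', 'data_feed.proto')
# are expected to contain, following the layout documented for
# fluid.DataFeedDesc. The slot names match the 'words'/'label' layers used
# above; the exact field values are illustrative assumptions, not taken from
# the original files.
proto_text = '''
name: "MultiSlotDataFeed"
batch_size: 2
multi_slot_desc {
    slots {
        name: "words"
        type: "uint64"
        is_dense: false
        is_used: true
    }
    slots {
        name: "label"
        type: "uint64"
        is_dense: false
        is_used: true
    }
}
'''
with open('data_feed.proto', 'w') as f:
    f.write(proto_text)
# Fields such as the batch size can then be overridden from Python, e.g.
# fluid.DataFeedDesc('data_feed.proto').set_batch_size(128).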
def async_train_loop(args, train_program, loss, dataset, filelist):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    async_executor = fluid.AsyncExecutor(place)
    thread_num = 40
    for i in range(args.epochs):
        async_executor.run(
            train_program,  # main program
            dataset,  # dataset
            filelist,  # filelist
            thread_num,  # thread
            [],  # fetch
            debug=True)  # debug
        epoch_model = "word2vec_model/epoch" + str(i + 1)
        # `data`, `label` and `acc` are assumed to be defined by the
        # surrounding model-building code; `exe` replaces the undefined
        # `executor` from the original snippet.
        fluid.io.save_inference_model(epoch_model, [data.name, label.name],
                                      [acc], exe)
def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    file_names = os.listdir(training_data_dirname)
    filelist = []
    for i in range(0, len(file_names)):
        if file_names[i] == 'data_feed.proto':
            continue
        filelist.append(os.path.join(training_data_dirname, file_names[i]))

    dataset = fluid.DataFeedDesc(
        os.path.join(training_data_dirname, 'data_feed.proto'))
    dataset.set_batch_size(batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)
    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program, dataset, filelist, thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' %
              (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)
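# A hypothetical invocation of the train() function above, assuming the
# bow_net builder from the earlier snippets; the dictionary size and paths
# are illustrative placeholders, not values from the original code.
if __name__ == '__main__':
    train(
        network=bow_net,
        dict_dim=10000,  # placeholder vocabulary size
        lr=0.002,
        save_dirname='models',
        training_data_dirname='train_data',
        pass_num=5,
        thread_num=10,
        batch_size=128)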
def train_async_local(batch_size):
    fea_sz, fea_sections, model_dict = model_conf.model_conf(
        'thirdparty/model.conf')
    id_dict = None
    #data_list, predict, auc_var, cur_auc_var, auc_states, avg_cost, label = \
    #    fluid_net.async_net(fea_sections)
    data_list, predict, avg_cost, label = fluid_net.async_net(fea_sections)
    optimizer = fluid.optimizer.Adam(learning_rate=0.0005, lazy_mode=True)
    #optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    async_exe = fluid.AsyncExecutor(place)

    def train_loop(main_program, trainer_id=None):
        dataset = fluid.DataFeedDesc('data_feed.proto')
        dataset.set_batch_size(32)
        dataset.set_use_slots([d.name for d in data_list])
        # The pipe command defines how raw input files are parsed into slots;
        # see the reader sketch after this function.
        dataset.set_pipe_command(
            '/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python ctr_reader.py'
        )
        thread_num = 10
        for pass_id in range(PASS_NUM):
            for hour in range(24):
                hour_filelist = [
                    "./test_data_dir/%s" % x
                    for x in os.listdir("./test_data_dir/") if "part" in x
                ]
                print(hour_filelist)
                async_exe.run(main_program, dataset, hour_filelist, thread_num,
                              [avg_cost], debug=True)

    train_loop(fluid.default_main_program())
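# A minimal sketch of a pipe-command reader in the spirit of the
# ctr_reader.py / new_data_reader.py / pairwise_reader.py scripts referenced
# in these snippets (their real contents are not shown, so this is an
# assumption). A pipe command receives the raw sample files on stdin and must
# emit one line per sample in the MultiSlot text format: for each slot, a
# count followed by that many values. The raw input layout assumed here
# ("label<TAB>space-separated word ids") is hypothetical.
import sys

for line in sys.stdin:
    line = line.rstrip('\n')
    if not line:
        continue
    label, ids = line.split('\t')
    ids = ids.split()
    out = [str(len(ids))] + ids  # 'words' slot: count, then the ids
    out += ['1', label]          # 'label' slot: a single value
    sys.stdout.write(' '.join(out) + '\n')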
def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    filelist = [
        'data_generator/train_data/%s' % x
        for x in os.listdir("data_generator/train_data")
    ]

    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])
    # Sets the pipe command directly on the underlying protobuf description,
    # equivalent to the set_pipe_command() helper used in the other snippets.
    dataset.proto_desc.pipe_command = "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python new_data_reader.py"

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)
    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program, dataset, filelist, thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' %
              (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)
# (fragment) The layer definitions (q, pt, nt, cos_q_pt, cos_q_nt) and the
# first pairwise-loss ops are truncated in the original; the max-with-zero
# clamp below is reconstructed from the visible call arguments.
loss_op3 = fluid.layers.elementwise_max(
    fluid.layers.fill_constant_batch_size_like(
        input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'),
    loss_op2)
avg_cost = fluid.layers.mean(loss_op3)
'''
acc = fluid.layers.accuracy(input=cos_q_pt, label=label, k=1)
'''
#real_acc = get_acc(cos_q_nt, cos_q_pt)

# SGD optimizer
sgd_optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
sgd_optimizer.minimize(avg_cost)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
async_exe = fluid.AsyncExecutor(place)
thread_num = 40

dataset = fluid.DataFeedDesc('data_feed.proto')
dataset.set_batch_size(128)
dataset.set_use_slots([q.name, pt.name, nt.name])
dataset.set_pipe_command(
    "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python pairwise_reader.py"
)
#dataset.set_pipe_command("cat")

filelist = ["ids/%s" % x for x in os.listdir("ids")]
#filelist = ["prepared.txt"]
print(filelist)
# The original snippet breaks off mid-call; debug=True mirrors the other
# examples and is an assumed completion.
async_exe.run(fluid.default_main_program(), dataset, filelist, thread_num, [],
              debug=True)
# (fragment) The Model class definition that precedes these lines is
# truncated in the original; they appear to close a Model.__init__ that
# builds the network and records the programs.
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
        self.startup_program = fluid.default_startup_program()
        self.program_desc = fluid.default_main_program()


if __name__ == "__main__":
    model = Model()
    dp = downpour.DownpourSGD(learning_rate=0.1, window=1)
    server_desc, skipped_ops = dp.minimize(model.avg_cost)
    server_desc_str = text_format.MessageToString(server_desc)

    async_exe = fluid.AsyncExecutor()
    instance = async_exe.config_distributed_nodes()
    if instance.is_server():
        async_exe.init_server(server_desc_str)
    elif instance.is_worker():
        async_exe.init_worker(server_desc_str, model.startup_program)
        local_data_dir = "./data/"
        # you can use this to download data from hadoop
        # async_exe.download_data("your_HADOOP_data_dir", local_data_dir,
        #                         "fs_default_name", "ugi", 10)
        data_set = data_feed.DataFeedDesc(local_data_dir + "data_feed.proto")
        data_set.set_use_slots(["click"] + [str(i) for i in range(100)])
        file_list = list(
            filter(lambda x: x.find("part") != -1,
                   [local_data_dir + i for i in os.listdir(local_data_dir)]))
        # The original snippet breaks off after the thread count; the fetch
        # list below is an assumed completion.
        async_exe.run(model.program_desc, data_set, file_list, 10,
                      [model.avg_cost])