def test_run(self):
    # Initialize dataset description
    data_feed = fluid.DataFeedDesc('train_data/data.prototxt')
    data_feed.set_batch_size(128)  # See API doc for how to change other fields

    # define network
    # input text data
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    # label data
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = bow_net(data, label)
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)

    # Run startup program
    startup_program = fluid.default_startup_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    main_program = fluid.default_main_program()
    async_executor = fluid.AsyncExecutor(place)
    self.assertRaises(TypeError, async_executor.run)
    self.assertRaises(TypeError, async_executor.run, main_program)
    self.assertRaises(TypeError, async_executor.run, main_program, data_feed)
    filelist = ['train_data/part-%d' % i for i in range(10)]
    self.assertRaises(TypeError, async_executor.run, main_program, data_feed,
                      filelist)
    thread_num = 4
    self.assertRaises(TypeError, async_executor.run, main_program, data_feed,
                      filelist, thread_num)
    async_executor.run(main_program, data_feed, filelist, thread_num, [acc])

    fluid.io.save_inference_model("imdb.model", [data.name, label.name],
                                  [acc], executor)
    statinfo = os.stat('imdb.model/__model__')
    self.assertGreater(statinfo.st_size, 0)

    os.remove('./data.prototxt')
    shutil.rmtree('./train_data')
    shutil.rmtree('./imdb.model')

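# NOTE: the 'train_data/data.prototxt' file used above is a text-format
# DataFeedDesc protobuf. The actual file is not shown in these snippets; a
# minimal sketch of what such a description could look like for the
# "words"/"label" slots used here (field names follow the DataFeedDesc text
# format, values are illustrative assumptions):
#
#   name: "MultiSlotDataFeed"
#   batch_size: 128
#   multi_slot_desc {
#       slots {
#           name: "words"
#           type: "uint64"
#           is_dense: false
#           is_used: false
#       }
#       slots {
#           name: "label"
#           type: "uint64"
#           is_dense: false
#           is_used: false
#       }
#   }
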
def train():
    # Download data
    with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf:
        tarf.extractall(path='./')
        tarf.close()

    # Initialize dataset description
    dataset = fluid.DataFeedDesc('train_data/data.prototxt')
    dataset.set_batch_size(128)  # See API doc for how to change other fields
    print(dataset.desc())  # Debug purpose: see what we get

    # define network
    # input text data
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    # label data
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = bow_net(data, label)
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)

    # Run startup program
    startup_program = fluid.default_startup_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    main_program = fluid.default_main_program()
    epochs = 10
    filelist = ["train_data/part-%d" % i for i in range(12)]
    for i in range(epochs):
        thread_num = 4
        async_executor.run(
            main_program,  # This can be changed during iteration
            dataset,       # This can be changed during iteration
            filelist,      # This can be changed during iteration
            thread_num,    # This can be changed during iteration
            [data, acc],   # Multiple fetch targets can be specified
            debug=False)
        fluid.io.save_inference_model('imdb/epoch%d.model' % i,
                                      [data.name, label.name], [acc], executor)

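# The two snippets above call bow_net(data, label) without defining it. A
# minimal sketch of a bag-of-words classifier that returns the expected
# (avg_cost, acc, prediction) tuple; dict_dim and the layer sizes are
# illustrative assumptions, not taken from the original code:
def bow_net(data, label, dict_dim=89528, emb_dim=128, hid_dim=128,
            hid_dim2=96, class_dim=2):
    # embed the word ids, sum-pool over the sequence, then classify
    emb = fluid.layers.embedding(
        input=data, size=[dict_dim, emb_dim], is_sparse=True)
    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
    bow_tanh = fluid.layers.tanh(bow)
    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
    prediction = fluid.layers.fc(input=fc_2, size=class_dim, act="softmax")
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc, prediction
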
def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    file_names = os.listdir(training_data_dirname)
    filelist = []
    for i in range(0, len(file_names)):
        if file_names[i] == 'data_feed.proto':
            continue
        filelist.append(os.path.join(training_data_dirname, file_names[i]))

    dataset = fluid.DataFeedDesc(
        os.path.join(training_data_dirname, 'data_feed.proto'))
    dataset.set_batch_size(batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program, dataset, filelist, thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' % (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)

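# A sketch of how the train() function above might be invoked; the network,
# dictionary size, paths and hyperparameters below are assumptions for
# illustration only:
if __name__ == "__main__":
    train(
        network=bow_net,          # any network returning (avg_cost, acc, prediction)
        dict_dim=89528,           # vocabulary size (illustrative)
        lr=0.002,
        save_dirname='models',
        training_data_dirname='train_data',  # holds data_feed.proto plus data parts
        pass_num=10,
        thread_num=4,
        batch_size=128)
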
def train_loop(main_program, trainer_id=None):
    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(32)
    dataset.set_use_slots([d.name for d in data_list])
    dataset.set_pipe_command(
        '/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python ctr_reader.py'
    )  # how to define the protocol

    thread_num = 10
    for pass_id in range(PASS_NUM):
        for hour in range(24):
            hour_filelist = [
                "./test_data_dir/%s" % x for x in os.listdir("./test_data_dir/")
                if "part" in x
            ]
            print(hour_filelist)
            async_exe.run(main_program, dataset, hour_filelist, thread_num,
                          [avg_cost], debug=True)

def async_train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    filelist = GetFileList(args.train_data_path)
    word2vec_reader = reader.Word2VecReader(args.dict_path,
                                            args.train_data_path, filelist, 0, 1)

    loss, words = skip_gram_word2vec(
        word2vec_reader.dict_size,
        word2vec_reader.word_frequencys,
        args.embedding_size,
        args.max_code_length,
        args.with_hs,
        args.with_nce,
        is_sparse=args.is_sparse)

    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(args.batch_size)
    dataset.set_use_slots([w.name for w in words])
    dataset.set_pipe_command(
        "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python word2vec_data_gen.py"
    )

    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    optimizer.minimize(loss)

    async_train_loop(args, fluid.default_main_program(), loss, dataset, filelist)

def train(network, dict_dim, lr, save_dirname, training_data_dirname, pass_num,
          thread_num, batch_size):
    file_names = os.listdir(training_data_dirname)
    filelist = [
        'data_generator/train_data/%s' % x
        for x in os.listdir("data_generator/train_data")
    ]

    dataset = fluid.DataFeedDesc('data_feed.proto')
    dataset.set_batch_size(batch_size)  # datafeed should be assigned a batch size
    dataset.set_use_slots(['words', 'label'])
    #dataset.set_pipe_command('/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python new_data_reader.py')
    dataset.proto_desc.pipe_command = "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python new_data_reader.py"

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    avg_cost, acc, prediction = network(data, label, dict_dim)
    optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
    opt_ops, weight_and_grad = optimizer.minimize(avg_cost)

    startup_program = fluid.default_startup_program()
    main_program = fluid.default_main_program()
    place = fluid.CPUPlace()
    executor = fluid.Executor(place)
    executor.run(startup_program)

    async_executor = fluid.AsyncExecutor(place)
    for i in range(pass_num):
        pass_start = time.time()
        async_executor.run(main_program, dataset, filelist, thread_num, [acc],
                           debug=False)
        print('pass_id: %u pass_time_cost %f' % (i, time.time() - pass_start))
        fluid.io.save_inference_model('%s/epoch%d.model' % (save_dirname, i),
                                      [data.name, label.name], [acc], executor)

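# The pipe_command used above points at a reader script (new_data_reader.py)
# that is not included in these snippets. As an illustration only, and assuming
# the MultiSlotDataGenerator helper from paddle.fluid.incubate.data_generator
# is available, such a script could turn each raw text line into the
# 'words'/'label' slots roughly like this; the input layout
# ("label<TAB>space-separated word ids") is an assumption:
import paddle.fluid.incubate.data_generator as dg

class IMDBDataGenerator(dg.MultiSlotDataGenerator):
    def generate_sample(self, line):
        def reader():
            # parse one raw line into the two slots declared in data_feed.proto
            label, words = line.strip().split('\t')
            word_ids = [int(w) for w in words.split()]
            yield ("words", word_ids), ("label", [int(label)])
        return reader

if __name__ == "__main__":
    # the framework pipes raw data through this script's stdin/stdout
    IMDBDataGenerator().run_from_stdin()
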
avg_cost = fluid.layers.mean(loss_op3)
'''
acc = fluid.layers.accuracy(input=cos_q_pt, \
                            label=label, k=1)
'''
#real_acc = get_acc(cos_q_nt, cos_q_pt)

# SGD optimizer
sgd_optimizer = fluid.optimizer.SGD(learning_rate=base_lr)
sgd_optimizer.minimize(avg_cost)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

async_exe = fluid.AsyncExecutor(place)
thread_num = 40
dataset = fluid.DataFeedDesc('data_feed.proto')
dataset.set_batch_size(128)
dataset.set_use_slots([q.name, pt.name, nt.name])
dataset.set_pipe_command(
    "/home/users/dongdaxiang/paddle_whls/new_io/paddle_release_home/python/bin/python pairwise_reader.py"
)
#dataset.set_pipe_command("cat")
filelist = ["ids/%s" % x for x in os.listdir("ids")]
#filelist = ["prepared.txt"]
print(filelist)
async_exe.run(fluid.default_main_program(), dataset, filelist, thread_num,
              [], debug=True)

def test_data_feed_desc(self):
    data_feed = fluid.DataFeedDesc('./data.prototxt')
    # assertEqual(data_feed.proto_desc.batch, 2)
    # assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2)
    self.assertEqual(" ".join(data_feed.desc().split()),
                     " ".join(proto_str.split()))