def _get_dataset(self): namespace = "train.reader" inputs = self.model.get_inputs() threads = envs.get_global_env("train.threads", None) batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN", self._config_yaml) train_data_path = envs.get_global_env("train_data_path", None, namespace) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) dataset.set_pipe_command(pipe_cmd) dataset.set_batch_size(batch_size) dataset.set_thread(threads) file_list = [ os.path.join(train_data_path, x) for x in os.listdir(train_data_path) ] dataset.set_filelist(file_list) return dataset
def wait_and_prepare_infer_dataset(self, day, pass_index):
    test_data_path = self.config.get("runner.infer_data_dir", [])
    dataset = fluid.DatasetFactory().create_dataset(self.reader_type)
    dataset.set_use_var(self.input_data)
    dataset.set_batch_size(self.config.get('runner.infer_batch_size', 1))
    dataset.set_thread(self.config.get('runner.infer_thread_num', 1))
    dataset.set_hdfs_config(self.hadoop_fs_name, self.hadoop_fs_ugi)
    dataset.set_parse_ins_id(self.config.get("runner.parse_ins_id", False))
    dataset.set_parse_content(
        self.config.get("runner.parse_content", False))

    cur_path = []
    for i in self.online_intervals[pass_index - 1]:
        # use the infer data dir defined above, not the train path
        p = os.path.join(test_data_path, day, str(i))
        if self.data_donefile:
            cur_donefile = os.path.join(p, self.data_donefile)
            data_ready(cur_donefile, self.data_sleep_second,
                       self.hadoop_client)
        cur_path.append(p)

    global_file_list = file_ls(cur_path, self.hadoop_client)
    my_file_list = fleet.util.get_file_shard(global_file_list)
    logger.info("my_file_list = {}".format(my_file_list))
    dataset.set_filelist(my_file_list)

    self.pipe_command = "{} {} {}".format(
        self.config.get("runner.pipe_command"),
        self.config.get("yaml_path"), get_utils_file_path())
    dataset.set_pipe_command(self.pipe_command)
    dataset.load_into_memory()
    return dataset
def do_training(self, fleet):
    dnn_input_dim, lr_input_dim, train_file_path = \
        ctr_dataset_reader.prepare_data()
    exe = fluid.Executor(fluid.CPUPlace())
    fleet.init_worker()
    exe.run(fleet.startup_program)

    thread_num = 2
    filelist = []
    for _ in range(thread_num):
        filelist.append(train_file_path)

    # config dataset
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_batch_size(128)
    dataset.set_use_var(self.feeds)
    pipe_command = 'python ctr_dataset_reader.py'
    dataset.set_pipe_command(pipe_command)
    dataset.set_filelist(filelist)
    dataset.set_thread(thread_num)

    for epoch_id in range(2):
        pass_start = time.time()
        dataset.set_filelist(filelist)
        exe.train_from_dataset(
            program=fleet.main_program,
            dataset=dataset,
            fetch_list=[self.avg_cost],
            fetch_info=["cost"],
            print_period=2,
            debug=False)
        pass_time = time.time() - pass_start

    class FH(fluid.executor.FetchHandler):
        def handler(self, fetch_target_vars):
            # print each fetched variable, not only the first one
            for i in range(len(fetch_target_vars)):
                print("{}: \n {}\n".format(self.fetch_target_names[i],
                                           fetch_target_vars[i]))

    for epoch_id in range(2):
        pass_start = time.time()
        dataset.set_filelist(filelist)
        exe.train_from_dataset(
            program=fleet.main_program,
            dataset=dataset,
            fetch_handler=FH([self.avg_cost.name],
                             period_secs=2,
                             return_np=True),
            debug=False)
        pass_time = time.time() - pass_start

    model_dir = tempfile.mkdtemp()
    fleet.save_inference_model(exe, model_dir,
                               [feed.name for feed in self.feeds],
                               self.avg_cost)
    self.check_model_right(model_dir)
    shutil.rmtree(model_dir)
    fleet.stop_worker()
def create_dataset(self, FLAGS, factory):
    """
    DatasetFactory is a factory that creates a dataset by name.
    It can create a "QueueDataset", an "InMemoryDataset", or a
    "FileInstantDataset"; the default is "QueueDataset".
    """
    if FLAGS.data_reader != "dataset":
        return None

    dataset = fluid.DatasetFactory().create_dataset(FLAGS.dataset_mode)
    dataset.set_batch_size(FLAGS.batch_size)
    dataset.set_use_var(self.input_layers)

    dir_name = os.path.dirname(__file__)
    pipe_command = (FLAGS.fluid_bin + " " + dir_name + "/dataset_reader.py " +
                    ObjectTransform.pickle_dumps_to_str(factory['dataset']) +
                    " " +
                    ObjectTransform.pickle_dumps_to_str(self.input_names))
    # Set the pipe command of the current dataset.
    # A pipe command is a UNIX pipeline command.
    dataset.set_pipe_command(pipe_command)

    # TODO: shuffle
    # Set the thread num; it is the number of readers.
    dataset.set_thread(self.get_thread_num(FLAGS))
    return dataset
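# A minimal sketch of the dataset types named in the docstring above; the
# factory falls back to "QueueDataset" when no name is passed.
queue_ds = fluid.DatasetFactory().create_dataset("QueueDataset")
in_memory_ds = fluid.DatasetFactory().create_dataset("InMemoryDataset")
file_instant_ds = fluid.DatasetFactory().create_dataset("FileInstantDataset")
default_ds = fluid.DatasetFactory().create_dataset()  # a QueueDataset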
def save_program():
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(sparse_input_ids + [label])
    pipe_command = "python criteo_dataset.py {}".format(sys.argv[1])
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(32)
    dataset.set_thread(10)

    optimizer = fluid.optimizer.SGD(0.0001)
    optimizer.minimize(avg_cost)

    exe = fluid.Executor(fluid.CPUPlace())
    input_folder = "hdfs:"
    output = sp.check_output(
        "hadoop fs -D hadoop.job.ugi=" + hdfs_ugi +
        " -D fs.defaultFS=" + hdfs_address + " -ls " +
        os.path.join(dataset_prefix, current_date_hr) +
        "/ | awk '{if(NR>1) print $8}'",
        shell=True)
    train_filelist = [
        "{}{}".format(input_folder, f)
        for f in output.decode('ascii').strip().split('\n')
    ]
    train_filelist.remove('hdfs:' + os.path.join(dataset_prefix,
                                                 current_date_hr, 'donefile'))
    train_filelist = [train_filelist[0]]
    print(train_filelist)

    exe.run(fluid.default_startup_program())
    print("startup save program done.")
    dataset.set_filelist(train_filelist)
    exe.train_from_dataset(
        program=fluid.default_main_program(),
        dataset=dataset,
        fetch_list=[auc_var],
        fetch_info=["auc"],
        debug=False)  # print_period=10000
    # save model here
    fetch_list = fluid.io.save_inference_model(
        inference_path, [x.name for x in inference_feed_vars], [predict], exe)
def create_dataset():
    dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
    dataset.set_use_var([x, y])
    dataset.set_batch_size(2)
    dataset.set_thread(1)
    dataset.set_filelist(filelist)
    return dataset
def get_dataset(inputs):
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(inputs)
    dataset.set_batch_size(1)
    dataset.set_filelist([])
    dataset.set_thread(1)
    return dataset
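# Hedged usage sketch: get_dataset above binds an empty filelist, so the
# caller supplies real files before training; the file names below are
# placeholders, and `inputs` and `exe` are assumed to exist.
dataset = get_dataset(inputs)
dataset.set_filelist(["train_part_0.txt", "train_part_1.txt"])
exe.train_from_dataset(fluid.default_main_program(), dataset)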
def _get_dataset(self): namespace = "train.reader" inputs = self.model.get_inputs() threads = int(envs.get_runtime_environ("train.trainer.threads")) batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') pipe_cmd = "python {} {} {} {}".format(reader, reader_class, "TRAIN", self._config_yaml) train_data_path = envs.get_global_env("train_data_path", None, namespace) if train_data_path.startswith("fleetrec::"): package_base = envs.get_runtime_environ("PACKAGE_BASE") assert package_base is not None train_data_path = os.path.join(package_base, train_data_path.split("::")[1]) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) dataset.set_pipe_command(pipe_cmd) dataset.set_batch_size(batch_size) dataset.set_thread(threads) file_list = [ os.path.join(train_data_path, x) for x in os.listdir(train_data_path) ] dataset.set_filelist(file_list) return dataset
def test_dataset_create(self):
    """ Testcase for dataset create. """
    try:
        dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    except:
        self.assertTrue(False)

    try:
        dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
    except:
        self.assertTrue(False)

    try:
        dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset")
        self.assertTrue(False)
    except:
        self.assertTrue(True)
def get_dataset(self, inputs, files):
    dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
    dataset.set_batch_size(32)
    dataset.set_thread(3)
    dataset.set_filelist(files)
    dataset.set_pipe_command("cat")
    dataset.set_use_var(inputs)
    return dataset
def test_in_memory_dataset_run(self):
    """
    Testcase for InMemoryDataset from create to run.
    """
    with open("test_in_memory_dataset_run_a.txt", "w") as f:
        data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
        data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
        data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
        f.write(data)
    with open("test_in_memory_dataset_run_b.txt", "w") as f:
        data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
        data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
        data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
        data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
        f.write(data)

    slots = ["slot1", "slot2", "slot3", "slot4"]
    slots_vars = []
    for slot in slots:
        var = fluid.layers.data(
            name=slot, shape=[1], dtype="int64", lod_level=1)
        slots_vars.append(var)

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_batch_size(32)
    dataset.set_thread(3)
    dataset.set_filelist([
        "test_in_memory_dataset_run_a.txt",
        "test_in_memory_dataset_run_b.txt"
    ])
    dataset.set_pipe_command("cat")
    dataset.set_use_var(slots_vars)
    dataset.load_into_memory()
    dataset.set_fea_eval(10000, True)
    dataset.slots_shuffle(["slot1"])
    dataset.local_shuffle()
    dataset.set_generate_unique_feasigns(True, 15)
    dataset.generate_local_tables_unlock(0, 11, 1, 25, 15)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    if self.use_data_loader:
        data_loader = fluid.io.DataLoader.from_dataset(
            dataset, fluid.cpu_places(), self.drop_last)
        for i in range(self.epoch_num):
            for data in data_loader():
                exe.run(fluid.default_main_program(), feed=data)
    else:
        for i in range(self.epoch_num):
            try:
                exe.train_from_dataset(fluid.default_main_program(),
                                       dataset)
            except Exception as e:
                self.assertTrue(False)

    os.remove("./test_in_memory_dataset_run_a.txt")
    os.remove("./test_in_memory_dataset_run_b.txt")
def _get_dataset(self, state="TRAIN"): if state == "TRAIN": inputs = self.model.get_inputs() namespace = "train.reader" train_data_path = envs.get_global_env("train_data_path", None, namespace) else: inputs = self.model.get_infer_inputs() namespace = "evaluate.reader" train_data_path = envs.get_global_env("test_data_path", None, namespace) sparse_slots = envs.get_global_env("sparse_slots", None, namespace) dense_slots = envs.get_global_env("dense_slots", None, namespace) threads = int(envs.get_runtime_environ("train.trainer.threads")) batch_size = envs.get_global_env("batch_size", None, namespace) reader_class = envs.get_global_env("class", None, namespace) abs_dir = os.path.dirname(os.path.abspath(__file__)) reader = os.path.join(abs_dir, '../utils', 'dataset_instance.py') if sparse_slots is None and dense_slots is None: pipe_cmd = "python {} {} {} {}".format(reader, reader_class, state, self._config_yaml) else: padding = envs.get_global_env("padding", 0, namespace) pipe_cmd = "python {} {} {} {} {} {} {} {}".format( reader, "slot", "slot", self._config_yaml, namespace, \ sparse_slots.replace(" ", "#"), dense_slots.replace(" ", "#"), str(padding)) if train_data_path.startswith("paddlerec::"): package_base = envs.get_runtime_environ("PACKAGE_BASE") assert package_base is not None train_data_path = os.path.join(package_base, train_data_path.split("::")[1]) dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) dataset.set_pipe_command(pipe_cmd) dataset.set_batch_size(batch_size) dataset.set_thread(threads) file_list = [ os.path.join(train_data_path, x) for x in os.listdir(train_data_path) ] self.files = file_list dataset.set_filelist(self.files) debug_mode = envs.get_global_env("reader_debug_mode", False, namespace) if debug_mode: print("--- Dataset Debug Mode Begin , show pre 10 data of {}---". format(file_list[0])) os.system("cat {} | {} | head -10".format(file_list[0], pipe_cmd)) print("--- Dataset Debug Mode End , show pre 10 data of {}---". format(file_list[0])) exit(0) return dataset
def test_queue_dataset_run_3(self):
    """
    Testcase for dataset from create to run.
    Uses CUDAPlace when CUDA is available.
    Uses int64 ids with set_input_type(1) on an InMemoryDataset.
    """
    with open("test_queue_dataset_run_a.txt", "w") as f:
        data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
        data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
        data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
        data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
        f.write(data)
    with open("test_queue_dataset_run_b.txt", "w") as f:
        data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
        data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
        data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
        data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
        f.write(data)

    slots = ["slot1", "slot2", "slot3", "slot4"]
    slots_vars = []
    for slot in slots:
        var = fluid.data(
            name=slot, shape=[None, 1], dtype="int64", lod_level=1)
        slots_vars.append(var)

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_input_type(1)
    dataset.set_batch_size(1)
    dataset.set_thread(2)
    dataset.set_filelist(
        ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
    dataset.set_pipe_command("cat")
    dataset.set_use_var(slots_vars)
    dataset.load_into_memory()

    exe = fluid.Executor(fluid.CPUPlace()
                         if not core.is_compiled_with_cuda()
                         else fluid.CUDAPlace(0))
    exe.run(fluid.default_startup_program())
    if self.use_data_loader:
        data_loader = fluid.io.DataLoader.from_dataset(
            dataset, fluid.cpu_places(), self.drop_last)
        for i in range(self.epoch_num):
            for data in data_loader():
                exe.run(fluid.default_main_program(), feed=data)
    else:
        for i in range(self.epoch_num):
            try:
                exe.train_from_dataset(fluid.default_main_program(),
                                       dataset)
            except Exception as e:
                self.assertTrue(False)

    os.remove("./test_queue_dataset_run_a.txt")
    os.remove("./test_queue_dataset_run_b.txt")
def dataset_reader(self):
    """dataset reader"""
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(self.inputs)
    pipe_command = "python dataset_generator.py"
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(params.batch_size)
    thread_num = int(params.cpu_num)
    dataset.set_thread(thread_num)
    return dataset
def train():
    args = parse_args()
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    print('---------- Configuration Arguments ----------')
    for key, value in args.__dict__.items():
        print(key + ':' + str(value))

    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc, data_list, auc_states = ctr_deepfm_model(
        args.embedding_size, args.num_field, args.num_feat,
        args.layer_sizes, args.act, args.reg)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(args.num_thread)
    train_filelist = [
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ]

    print('---------------------------------------------')
    for epoch_id in range(args.num_epoch):
        start = time.time()
        dataset.set_filelist(train_filelist)
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[loss, auc],
            fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
            print_period=1000,
            debug=False)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1))
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        fluid.io.save_persistables(
            executor=exe,
            dirname=model_dir,
            main_program=fluid.default_main_program())
def create_dataset(use_var_list, my_filelist):
    dataset = fluid.DatasetFactory().create_dataset(config.dataset_type)
    dataset.set_batch_size(config.batch_size)
    dataset.set_thread(config.thread_num)
    dataset.set_hdfs_config(config.fs_name, config.fs_ugi)
    dataset.set_pipe_command(
        "./read_feasign | python ins_weight.py | "
        "awk -f format_newcate_hotnews.awk | ./parse_feasign all_slot.dict")
    # use the file list passed in by the caller rather than a hard-coded file
    dataset.set_filelist(my_filelist)
    dataset.set_use_var(use_var_list)
    return dataset
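# Hedged sketch: a pipe command like the one above can be previewed outside
# the trainer with the same `cat file | <pipe_cmd> | head` pattern the
# reader debug mode in this section uses; the sample file name is a
# placeholder.
os.system("cat part-00000_1 | ./read_feasign | python ins_weight.py | "
          "awk -f format_newcate_hotnews.awk | ./parse_feasign all_slot.dict"
          " | head -10")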
def set_data_config(self):
    self.dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
    self.dataset.set_feed_type("PaddleBoxDataFeed")
    self.dataset.set_parse_logkey(True)
    self.dataset.set_thread(1)
    self.dataset.set_enable_pv_merge(self.enable_pv_merge)
    self.dataset.set_batch_size(self.batch_size)
    if self.enable_pv_merge:
        self.dataset.set_merge_by_sid(self.merge_by_sid)
        self.dataset.set_rank_offset("rank_offset")
        self.dataset.set_pv_batch_size(self.pv_batch_size)
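# Hedged sketch (an assumption, not the original test flow) of how the
# PV-merge dataset configured above is typically consumed; method
# availability varies across Paddle releases, and the file name and `exe`
# are placeholders.
self.dataset.set_filelist(["test_run_with_pv_a.txt"])
self.dataset.load_into_memory()
if self.enable_pv_merge:
    self.dataset.preprocess_instance()  # merge instances into PVs by logkey
exe.train_from_dataset(fluid.default_main_program(), self.dataset)
if self.enable_pv_merge:
    self.dataset.postprocess_instance()  # restore per-instance layout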
def test_queue_dataset_run_2(self):
    """
    Testcase for QueueDataset from create to run.
    Use CUDAPlace.
    Use float type id.
    """
    with open("test_queue_dataset_run_a.txt", "w") as f:
        data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
        data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
        data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
        f.write(data)
    with open("test_queue_dataset_run_b.txt", "w") as f:
        data = "1 4 2 3 3 4 5 5 5 5 1 4\n"
        data += "1 5 2 3 4 4 6 6 6 6 1 5\n"
        data += "1 6 2 3 5 4 7 7 7 7 1 6\n"
        data += "1 7 2 3 6 4 8 8 8 8 1 7\n"
        f.write(data)

    slots = ["slot1_f", "slot2_f", "slot3_f", "slot4_f"]
    slots_vars = []
    for slot in slots:
        var = fluid.layers.data(
            name=slot, shape=[1], dtype="float32", lod_level=1)
        slots_vars.append(var)

    dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
    dataset.set_batch_size(32)
    dataset.set_thread(3)
    dataset.set_filelist(
        ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
    dataset.set_pipe_command("cat")
    dataset.set_use_var(slots_vars)

    exe = fluid.Executor(fluid.CPUPlace()
                         if not core.is_compiled_with_cuda()
                         else fluid.CUDAPlace(0))
    exe.run(fluid.default_startup_program())
    if self.use_data_loader:
        data_loader = fluid.io.DataLoader.from_dataset(
            dataset, fluid.cpu_places(), self.drop_last)
        for i in range(self.epoch_num):
            for data in data_loader():
                exe.run(fluid.default_main_program(), feed=data)
    else:
        for i in range(self.epoch_num):
            try:
                exe.train_from_dataset(fluid.default_main_program(),
                                       dataset)
            except Exception as e:
                self.assertTrue(False)

    os.remove("./test_queue_dataset_run_a.txt")
    os.remove("./test_queue_dataset_run_b.txt")
def dataset_reader(self):
    """get dataset_reader."""
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var([self.dense_input] + self.sparse_input_ids +
                        [self.label])
    pipe_command = "python ./thirdparty/ctr/dataset_generator.py"
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(4)
    thread_num = 2
    dataset.set_thread(thread_num)
    return dataset
def do_dataset_training(self, fleet):
    train_file_list = ctr_dataset_reader.prepare_fake_data()

    exe = self.get_executor()
    exe.run(fluid.default_startup_program())
    fleet.init_worker()

    thread_num = 2
    batch_size = 128
    filelist = train_file_list

    # config dataset
    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_use_var(self.feeds)
    dataset.set_batch_size(batch_size)
    dataset.set_thread(thread_num)
    dataset.set_filelist(filelist)
    dataset.set_pipe_command('python ctr_dataset_reader.py')
    dataset.load_into_memory()

    dataset.global_shuffle(fleet, 12)  # TODO: make thread num configurable
    shuffle_data_size = dataset.get_shuffle_data_size(fleet)
    local_data_size = dataset.get_shuffle_data_size()
    data_size_list = fleet.util.all_gather(local_data_size)
    print('after global_shuffle data_size_list: ', data_size_list)
    print('after global_shuffle data_size: ', shuffle_data_size)

    for epoch_id in range(1):
        pass_start = time.time()
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[self.avg_cost],
            fetch_info=["cost"],
            print_period=2,
            debug=int(os.getenv("Debug", "0")))
        pass_time = time.time() - pass_start
    dataset.release_memory()

    if os.getenv("SAVE_MODEL") == "1":
        model_dir = tempfile.mkdtemp()
        fleet.save_inference_model(exe, model_dir,
                                   [feed.name for feed in self.feeds],
                                   self.avg_cost)
        self.check_model_right(model_dir)
        shutil.rmtree(model_dir)

    dirname = os.getenv("SAVE_DIRNAME", None)
    if dirname:
        fleet.save_persistables(exe, dirname=dirname)

    cache_dirname = os.getenv("SAVE_CACHE_DIRNAME", None)
    if cache_dirname:
        fleet.save_cache_model(cache_dirname)
def get_dataset(inputs):
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(inputs)
    dataset.set_pipe_command("python ../datasets/criteo_dataset_generator.py")
    dataset.set_batch_size(cfg.batch_size)
    thread_num = int(cfg.cpu_num)
    dataset.set_thread(thread_num)
    file_list = [
        os.path.join(cfg.train_files_path, x)
        for x in os.listdir(cfg.train_files_path)
    ]
    # logger.info("file list: {}".format(file_list))
    return dataset, file_list
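# Hedged usage sketch: the helper above returns the file list instead of
# binding it, so the caller attaches it before each pass; `inputs` and
# `exe` are assumed to exist.
dataset, file_list = get_dataset(inputs)
dataset.set_filelist(file_list)
exe.train_from_dataset(
    program=fluid.default_main_program(), dataset=dataset)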
def train(args):
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    input_word = fluid.layers.data(
        name="context_id", shape=[1], dtype="int64", lod_level=0)
    true_word = fluid.layers.data(
        name="target", shape=[1], dtype="int64", lod_level=0)
    neg_num = 5
    neg_word = fluid.layers.data(
        name="neg_label", shape=[neg_num], dtype='int64', lod_level=0)

    loss = skip_gram_word2vec_dataset(
        input_word, true_word, neg_word, 354052, None,
        args.embedding_size, is_sparse=args.is_sparse)
    optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.base_lr,
            decay_steps=100000,
            decay_rate=0.999,
            staircase=True))
    optimizer.minimize(loss)

    main_program = fluid.default_main_program()
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var([input_word, true_word, neg_word])
    dataset.set_pipe_command(
        "sudo /home/users/dongdaxiang/paddle_whls/pipe_reader/"
        "paddle_release_home/python/bin/python reader.py")
    dataset.set_batch_size(args.batch_size)
    filelist = GetFileList(args.train_data_dir)
    dataset.set_filelist(filelist)
    dataset.set_thread(args.thread_num)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for i in range(args.epochs):
        logger.info("Going to train epoch {}".format(i))
        exe.train_from_dataset(
            program=fluid.default_main_program(), dataset=dataset)
def train():
    args = parse_args()
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    dense_input = fluid.layers.data(
        name="dense_input", shape=[dense_feature_dim], dtype='float32')
    sparse_input_ids = [
        fluid.layers.data(
            name="C" + str(i), shape=[1], lod_level=1, dtype="int64")
        for i in range(1, 27)
    ]
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    loss, auc_var, batch_auc_var = ctr_dnn_model_dataset(
        dense_input, sparse_input_ids, label, args.embedding_size,
        args.sparse_feature_dim)

    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    optimizer.minimize(loss)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var([dense_input] + sparse_input_ids + [label])
    pipe_command = "python criteo_reader.py %d" % args.sparse_feature_dim
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(100)
    thread_num = 10
    dataset.set_thread(thread_num)

    whole_filelist = [
        "raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
    ]

    epochs = 20
    for i in range(epochs):
        # train on the first 80% of the files
        dataset.set_filelist(whole_filelist[:int(0.8 * len(whole_filelist))])
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[auc_var],
            fetch_info=["auc"],
            debug=False)
        model_dir = args.model_output_dir + '/epoch' + str(i + 1) + ".model"
        sys.stderr.write("epoch%d finished\n" % (i + 1))
        fluid.io.save_inference_model(
            model_dir,
            [dense_input.name] + [x.name for x in sparse_input_ids] +
            [label.name], [loss, auc_var], exe)
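# Presumably the remaining 20% of files are held out for evaluation; a
# hedged sketch of selecting them the same way:
test_filelist = whole_filelist[int(0.8 * len(whole_filelist)):]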
def train():
    args = parse_args()
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    print(args)
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc, data_list, auc_states = eval(
        'network_conf.' + args.model_name)(
            args.embedding_size, args.num_field, args.num_feat,
            args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    dataset.set_pipe_command('python criteo_reader.py')
    dataset.set_batch_size(args.batch_size)
    dataset.set_filelist([
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ])

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[loss, auc],
            fetch_info=['loss', 'auc'],
            debug=False,
            print_period=args.print_steps)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1), "checkpoint")
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        # save_persistables takes the executor first, then the dirname
        fluid.io.save_persistables(exe, model_dir,
                                   fluid.default_main_program())
def _get_dataset(self, dataset_name, context):
    name = "dataset." + dataset_name + "."
    reader_class = envs.get_global_env(name + "data_converter")
    reader_class_name = envs.get_global_env(name + "reader_class_name",
                                            "Reader")
    abs_dir = os.path.dirname(os.path.abspath(__file__))
    reader = os.path.join(abs_dir, '../../utils', 'dataset_instance.py')
    sparse_slots = envs.get_global_env(name + "sparse_slots", "").strip()
    dense_slots = envs.get_global_env(name + "dense_slots", "").strip()
    if sparse_slots == "" and dense_slots == "":
        pipe_cmd = "python {} {} {} {}".format(reader, reader_class,
                                               reader_class_name,
                                               context["config_yaml"])
    else:
        if sparse_slots == "":
            sparse_slots = "?"
        if dense_slots == "":
            dense_slots = "?"
        padding = envs.get_global_env(name + "padding", 0)
        pipe_cmd = "python {} {} {} {} {} {} {} {}".format(
            reader, "slot", "slot", context["config_yaml"], "fake",
            sparse_slots.replace(" ", "?"), dense_slots.replace(" ", "?"),
            str(padding))

    batch_size = envs.get_global_env(name + "batch_size")
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_batch_size(batch_size)
    dataset.set_pipe_command(pipe_cmd)
    train_data_path = envs.get_global_env(name + "data_path")
    file_list = [
        os.path.join(train_data_path, x)
        for x in os.listdir(train_data_path)
    ]
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        file_list = split_files(file_list,
                                context["fleet"].worker_index(),
                                context["fleet"].worker_num())
        print("File_list: {}".format(file_list))

    dataset.set_filelist(file_list)
    for model_dict in context["phases"]:
        if model_dict["dataset_name"] == dataset_name:
            model = context["model"][model_dict["name"]]["model"]
            thread_num = int(model_dict["thread_num"])
            dataset.set_thread(thread_num)
            if context["is_infer"]:
                inputs = model._infer_data_var
            else:
                inputs = model._data_var
            dataset.set_use_var(inputs)
            break
    return dataset
def _alloc_dataset(self, file_list):
    """Create and configure a dataset from self._config."""
    dataset = fluid.DatasetFactory().create_dataset(
        self._config['dataset_type'])
    dataset.set_batch_size(self._config['batch_size'])
    dataset.set_thread(self._config['load_thread'])
    dataset.set_hdfs_config(self._config['fs_name'],
                            self._config['fs_ugi'])
    dataset.set_pipe_command(self._config['data_converter'])
    dataset.set_filelist(file_list)
    dataset.set_use_var(self._config['data_vars'])
    # dataset.set_fleet_send_sleep_seconds(2)
    # dataset.set_fleet_send_batch_size(80000)
    return dataset
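# Hedged usage sketch: when dataset_type is "InMemoryDataset", the
# commented-out fleet-send knobs above pair with a load/shuffle/train/
# release cycle like the one used elsewhere in this section; `fleet`,
# `exe`, and `file_list` are assumed to exist.
dataset = self._alloc_dataset(file_list)
dataset.load_into_memory()
dataset.global_shuffle(fleet)  # shuffle across all trainers
exe.train_from_dataset(program=fluid.default_main_program(), dataset=dataset)
dataset.release_memory()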
def test_run_with_dump(self):
    """
    Testcase for InMemoryDataset from create to run.
    """
    with open("test_run_with_dump_a.txt", "w") as f:
        data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n"
        data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n"
        data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n"
        f.write(data)
    with open("test_run_with_dump_b.txt", "w") as f:
        data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n"
        data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n"
        data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n"
        data += "1 g 1 g 1 7 2 3 6 4 8 8 8 8 1 7\n"
        f.write(data)

    slots = ["slot1", "slot2", "slot3", "slot4"]
    slots_vars = []
    for slot in slots:
        var = fluid.layers.data(
            name=slot, shape=[1], dtype="int64", lod_level=1)
        slots_vars.append(var)

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_batch_size(32)
    dataset.set_thread(3)
    dataset.set_filelist(
        ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"])
    dataset.set_parse_ins_id(True)
    dataset.set_parse_content(True)
    dataset.set_pipe_command("cat")
    dataset.set_use_var(slots_vars)
    dataset.load_into_memory()
    dataset.set_fea_eval(10000, True)
    dataset.local_shuffle()

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    for i in range(2):
        try:
            exe.train_from_dataset(fluid.default_main_program(), dataset)
        except ImportError as e:
            pass
        except Exception as e:
            self.assertTrue(False)

    os.remove("./test_run_with_dump_a.txt")
    os.remove("./test_run_with_dump_b.txt")
def get_dataset(self, inputs, files):
    """
    Build a QueueDataset for the FetchHandler test cases.

    Args:
        inputs(list): input variables of the dataset
        files(list): files the dataset reads
    """
    dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
    dataset.set_batch_size(32)
    dataset.set_thread(3)
    dataset.set_filelist(files)
    dataset.set_pipe_command("cat")
    dataset.set_use_var(inputs)
    return dataset
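# Hedged sketch pairing the helper above with a fetch handler, mirroring
# the FH(fluid.executor.FetchHandler) subclass defined earlier in this
# section; `inputs`, `files`, `exe`, and `avg_cost` are assumed to exist.
dataset = self.get_dataset(inputs, files)
exe.train_from_dataset(
    program=fluid.default_main_program(),
    dataset=dataset,
    fetch_handler=FH([avg_cost.name], period_secs=2, return_np=True))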
def get_dataset(inputs, args):
    """get dataset"""
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(inputs)
    dataset.set_pipe_command("python ./dataset_generator.py")
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(int(args.cpu_num))
    file_list = [
        str(args.train_files_path) + "/%s" % x
        for x in os.listdir(args.train_files_path)
    ]
    dataset.set_filelist(file_list)
    logger.info("file list: {}".format(file_list))
    return dataset
def train_loop(main_program):
    """train network"""
    start_time = time.time()
    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(dcn_model.data_list)
    pipe_command = 'python reader.py {}'.format(args.vocab_dir)
    dataset.set_pipe_command(pipe_command)
    dataset.set_batch_size(args.batch_size)
    dataset.set_thread(args.num_thread)
    train_filelist = [
        os.path.join(args.train_data_dir, fname)
        for fname in next(os.walk(args.train_data_dir))[2]
    ]
    dataset.set_filelist(train_filelist)

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
        exe.train_from_dataset(
            program=main_program,
            dataset=dataset,
            fetch_list=[
                dcn_model.loss, dcn_model.avg_logloss, dcn_model.auc_var
            ],
            fetch_info=['total_loss', 'avg_logloss', 'auc'],
            debug=False,
            print_period=args.print_steps)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1), "checkpoint")
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        if args.trainer_id == 0:  # only trainer 0 saves the model
            print("save model in {}".format(model_dir))
            fluid.save(main_program, model_dir)

    print("train time cost {:.4f}".format(time.time() - start_time))
    print("finish training")