def get_data(name, data_dir, meta_dir, gpu_nums):
    isTrain = name == 'train'
    ds = PascalVOC12(data_dir, meta_dir, name, shuffle=True)

    if isTrain:  # special augmentation
        shape_aug = [RandomResize(xrange=(0.7, 1.5), yrange=(0.7, 1.5),
                                  aspect_ratio_thres=0.15),
                     RandomCropWithPadding(args.crop_size, IGNORE_LABEL),
                     Flip(horiz=True),
                     ]
    else:
        shape_aug = []

    ds = AugmentImageComponents(ds, shape_aug, (0, 1), copy=False)

    def f(ds):
        image, label = ds
        m = np.array([104, 116, 122])
        const_arr = np.resize(m, (1, 1, 3))  # per-channel mean, broadcast over HWC
        image = image - const_arr
        return image, label

    ds = MapData(ds, f)
    if isTrain:
        ds = BatchData(ds, args.batch_size * gpu_nums)
        ds = PrefetchDataZMQ(ds, 1)
    else:
        ds = BatchData(ds, 1)
    return ds

def get_mnist_data(is_train, image_size, batchsize):
    ds = MNISTCh('train' if is_train else 'test', shuffle=True)

    if is_train:
        augs = [
            imgaug.RandomApplyAug(imgaug.RandomResize((0.8, 1.2), (0.8, 1.2)), 0.3),
            imgaug.RandomApplyAug(imgaug.RotationAndCropValid(15), 0.5),
            imgaug.RandomApplyAug(imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01), 0.25),
            imgaug.Resize((224, 224), cv2.INTER_AREA)
        ]
        ds = AugmentImageComponent(ds, augs)
        ds = PrefetchData(ds, 128 * 10, multiprocessing.cpu_count())
        ds = BatchData(ds, batchsize)
        ds = PrefetchData(ds, 256, 4)
    else:
        # no augmentation, only resizing
        augs = [
            imgaug.Resize((image_size, image_size), cv2.INTER_CUBIC),
        ]
        ds = AugmentImageComponent(ds, augs)
        ds = BatchData(ds, batchsize)
        ds = PrefetchData(ds, 20, 2)
    return ds

def get_data(name, data_dir, meta_dir, gpu_nums):
    isTrain = 'train' in name
    ds = Camvid(data_dir, meta_dir, name, shuffle=True)

    if isTrain:
        ds = MapData(ds, RandomResize)

    if isTrain:
        shape_aug = [
            RandomCropWithPadding(args.crop_size, IGNORE_LABEL),
            Flip(horiz=True),
        ]
    else:
        shape_aug = []

    ds = AugmentImageComponents(ds, shape_aug, (0, 1), copy=False)

    def f(ds):
        image, label = ds
        m = np.array([104, 116, 122])
        const_arr = np.resize(m, (1, 1, 3))  # per-channel mean, broadcast over HWC
        image = image - const_arr
        return image, label

    ds = MapData(ds, f)
    if isTrain:
        ds = BatchData(ds, args.batch_size * gpu_nums)
        ds = PrefetchDataZMQ(ds, 1)
    else:
        ds = BatchData(ds, 1)
    return ds

def get_input_flow(self):
    ds_train = CellImageDataManagerTrain()
    # ds_train = MapDataComponent(ds_train, random_affine)   # TODO : no improvement?
    ds_train = MapDataComponent(ds_train, random_color)
    # ds_train = MapDataComponent(ds_train, random_scaling)
    ds_train = MapDataComponent(ds_train, mask_size_normalize)  # Resize by instance size - normalization
    ds_train = MapDataComponent(ds_train, lambda x: resize_shortedge_if_small(x, self.img_size))
    ds_train = MapDataComponent(ds_train, lambda x: random_crop(x, self.img_size, self.img_size))
    ds_train = MapDataComponent(ds_train, random_flip_lr)
    ds_train = MapDataComponent(ds_train, random_flip_ud)
    # ds_train = MapDataComponent(ds_train, data_to_elastic_transform_wrapper)
    ds_train = MapDataComponent(ds_train, erosion_mask)
    ds_train = MapData(ds_train, lambda x: data_to_segment_input(x, is_gray=False, unet_weight=True))
    ds_train = PrefetchData(ds_train, 256, 24)
    ds_train = BatchData(ds_train, self.batchsize)
    ds_train = MapDataComponent(ds_train, data_to_normalize1)

    ds_valid = CellImageDataManagerValid()
    ds_valid = MapDataComponent(ds_valid, lambda x: resize_shortedge_if_small(x, self.img_size))
    ds_valid = MapDataComponent(ds_valid, lambda x: random_crop(x, self.img_size, self.img_size))
    ds_valid = MapDataComponent(ds_valid, erosion_mask)
    ds_valid = MapData(ds_valid, lambda x: data_to_segment_input(x, is_gray=False, unet_weight=True))
    ds_valid = PrefetchData(ds_valid, 20, 12)
    ds_valid = BatchData(ds_valid, self.batchsize, remainder=True)
    ds_valid = MapDataComponent(ds_valid, data_to_normalize1)

    ds_valid2 = CellImageDataManagerValid()
    ds_valid2 = MapDataComponent(ds_valid2, lambda x: resize_shortedge_if_small(x, self.img_size))
    ds_valid2 = MapDataComponent(ds_valid2, lambda x: center_crop_if_tcga(x, self.img_size, self.img_size))
    # ds_valid2 = MapDataComponent(ds_valid2, lambda x: resize_shortedge(x, self.img_size))
    ds_valid2 = MapData(ds_valid2, lambda x: data_to_segment_input(x, is_gray=False))
    ds_valid2 = MapDataComponent(ds_valid2, data_to_normalize1)

    ds_test = CellImageDataManagerTest()
    ds_test = MapDataComponent(ds_test, lambda x: resize_shortedge_if_small(x, self.img_size))
    # ds_test = MapDataComponent(ds_test, lambda x: resize_shortedge(x, self.img_size))
    ds_test = MapData(ds_test, lambda x: data_to_image(x, is_gray=False))
    ds_test = MapDataComponent(ds_test, data_to_normalize1)

    return ds_train, ds_valid, ds_valid2, ds_test

def get_data(name, meta_dir, gpu_nums):
    isTrain = 'train' in name

    m = np.array([104, 116, 122])
    const_arr = np.resize(m, (1, 1, 3))  # per-channel mean, HWC
    const_arr = np.zeros((args.crop_size[0], args.crop_size[1], 3)) + const_arr  # broadcast

    if isTrain:
        # ds = FakeData([[1024, 2048, 3], [1024, 2048]], 5000, random=False, dtype='uint8')
        # ds = FakeData([[CROP_HEIGHT, CROP_HEIGHT, 3], [CROP_HEIGHT, CROP_HEIGHT]], 5000, random=False, dtype='uint8')
        ds = CityscapesFiles(base_dir, meta_dir, name, shuffle=True)
        parallel = min(3, multiprocessing.cpu_count())

        augmentors = [
            RandomCropWithPadding(args.crop_size),
            Flip(horiz=True),
        ]
        aug = imgaug.AugmentorList(augmentors)

        def mapf(ds):
            img, label = ds
            img = cv2.imread(img, cv2.IMREAD_COLOR)
            label = cv2.imread(label, cv2.IMREAD_GRAYSCALE)
            img, params = aug.augment_return_params(img)
            label = aug._augment(label, params)
            img = img - const_arr  # very time-consuming
            return img, label

        # ds = MapData(ds, mapf)
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=500, strict=True)
        # ds = MapData(ds, reduce_mean_rgb)
        ds = BatchData(ds, args.batch_size * gpu_nums)
        # ds = PrefetchDataZMQ(ds, 1)
    else:
        def imgread(ds):
            img, label = ds
            img = cv2.imread(img, cv2.IMREAD_COLOR)
            label = cv2.imread(label, cv2.IMREAD_GRAYSCALE)
            return [img, label]

        ds = CityscapesFiles(base_dir, meta_dir, name, shuffle=False)
        ds = MapData(ds, imgread)
        ds = BatchData(ds, 1)
    return ds

def get_input_flow(self):
    ds_train = CellImageDataManagerTrain()

    # Augmentation:
    ds_train = MapDataComponent(ds_train, random_affine)
    ds_train = MapDataComponent(ds_train, random_color)
    # ds_train = MapDataComponent(ds_train, random_color2)   # not good
    ds_train = MapDataComponent(ds_train, random_scaling)
    ds_train = MapDataComponent(ds_train, lambda x: resize_shortedge_if_small(x, 224))
    ds_train = MapDataComponent(ds_train, lambda x: random_crop(x, 224, 224))
    ds_train = MapDataComponent(ds_train, random_flip_lr)
    # ds_train = MapDataComponent(ds_train, data_to_elastic_transform_wrapper)
    ds_train = MapDataComponent(ds_train, random_flip_ud)
    if self.unet_weight:
        ds_train = MapDataComponent(ds_train, erosion_mask)
    ds_train = PrefetchData(ds_train, 1000, 24)
    ds_train = MapData(ds_train, lambda x: data_to_segment_input(x, not self.is_color, self.unet_weight))
    ds_train = BatchData(ds_train, self.batchsize)
    ds_train = MapDataComponent(ds_train, data_to_normalize1)
    ds_train = PrefetchData(ds_train, 10, 2)

    ds_valid = CellImageDataManagerValid()
    ds_valid = MapDataComponent(ds_valid, lambda x: center_crop(x, 224, 224))
    if self.unet_weight:
        ds_valid = MapDataComponent(ds_valid, erosion_mask)
    ds_valid = MapData(ds_valid, lambda x: data_to_segment_input(x, not self.is_color, self.unet_weight))
    ds_valid = BatchData(ds_valid, self.batchsize, remainder=True)
    ds_valid = MapDataComponent(ds_valid, data_to_normalize1)
    ds_valid = PrefetchData(ds_valid, 20, 24)

    ds_valid2 = CellImageDataManagerValid()
    ds_valid2 = MapDataComponent(ds_valid2, lambda x: resize_shortedge_if_small(x, 224))
    ds_valid2 = MapData(ds_valid2, lambda x: data_to_segment_input(x, not self.is_color))
    ds_valid2 = MapDataComponent(ds_valid2, data_to_normalize1)

    ds_test = CellImageDataManagerTest()
    ds_test = MapDataComponent(ds_test, lambda x: resize_shortedge_if_small(x, 224))
    ds_test = MapData(ds_test, lambda x: data_to_image(x, not self.is_color))
    ds_test = MapDataComponent(ds_test, data_to_normalize1)

    return ds_train, ds_valid, ds_valid2, ds_test

def get_dataflow_batch(is_train, batchsize):
    ds = get_dataflow(is_train)
    ds = PrefetchData(ds, 1000, multiprocessing.cpu_count())
    ds = BatchData(ds, batchsize)
    ds = PrefetchData(ds, 10, 4)
    return ds

def get_remote_dataflow(port, nr_prefetch=1000, nr_thread=1):
    ipc = 'ipc:///tmp/ipc-socket'
    tcp = 'tcp://0.0.0.0:%d' % port
    data_loader = RemoteDataZMQ(ipc, tcp, hwm=10000)
    data_loader = BatchData(data_loader, batch_size=hp.train.batch_size)
    data_loader = PrefetchData(data_loader, nr_prefetch, nr_thread)
    return data_loader

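# --- Hedged sketch (not from the source): the producer side that feeds the
# RemoteDataZMQ receiver above. send_dataflow_zmq is tensorpack's remote-sending
# helper; FakeData, the shape, and run_sender are placeholders for the real
# feature dataflow and entry point.
from tensorpack.dataflow import FakeData
from tensorpack.dataflow.remote import send_dataflow_zmq

def run_sender(port):
    df = FakeData([[80, 64]], size=1000)                 # stand-in for the real feature dataflow
    send_dataflow_zmq(df, 'tcp://127.0.0.1:%d' % port)   # blocks, pushing datapoints to the receiver
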
def main(cfg):
    print(cfg)
    tf.reset_default_graph()
    logger.set_logger_dir('tflogs', action='d')
    copyfile(hydra.utils.to_absolute_path('model.py'), 'model.py')
    copyfile(hydra.utils.to_absolute_path('dataflow.py'), 'dataflow.py')

    if cfg.cat_name == 'smpl':
        train_df = SMPLDataFlow(cfg, True, 1000)
        val_df = VisSMPLDataFlow(cfg, True, 1000, port=1080)
    else:
        train_df = ShapeNetDataFlow(cfg, cfg.data.train_txt, True)
        val_df = VisDataFlow(cfg, cfg.data.val_txt, False, port=1080)

    config = TrainConfig(
        model=Model(cfg),
        dataflow=BatchData(PrefetchData(train_df, cpu_count() // 2, cpu_count() // 2), cfg.batch_size),
        callbacks=[
            ModelSaver(),
            SimpleMovingAverage(['recon_loss', 'GAN/loss_d', 'GAN/loss_g', 'GAN/gp_loss', 'symmetry_loss'], 100),
            PeriodicTrigger(val_df, every_k_steps=30)
        ],
        monitors=tensorpack.train.DEFAULT_MONITORS() + [ScalarPrinter(enable_step=True, enable_epoch=False)],
        max_epoch=10
    )
    launch_train_with_config(config, SimpleTrainer())

def get_dataflow(annot_path, img_dir, batch_size):
    """
    This function initializes the tensorpack dataflow and serves generator
    for training operation.

    :param annot_path: path to the annotation file
    :param img_dir: path to the images
    :param batch_size: batch size
    :return: dataflow object
    """
    df = CocoDataFlow((368, 368), annot_path, img_dir)
    df.prepare()
    df = MapData(df, read_img)
    df = MapData(df, gen_mask)
    df = MapData(df, augment)
    df = MapData(df, apply_mask)
    df = MapData(df, build_sample)
    df = PrefetchDataZMQ(df, nr_proc=4)
    # df = PrefetchData(df, 2, 1)
    df = BatchData(df, batch_size, use_list=False)
    df = MapData(df, lambda x: (
        [x[0], x[1], x[2]],
        [x[3], x[4], x[3], x[4], x[3], x[4],
         x[3], x[4], x[3], x[4], x[3], x[4]]))
    df.reset_state()
    return df

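# --- Hedged sketch (not from the source): wrapping the dataflow above as the plain
# Python generator its docstring mentions. get_data() is the pre-1.0 tensorpack
# DataFlow iteration API used throughout these examples; train_gen is a placeholder name.
def train_gen(df):
    while True:                                   # loop forever so a training loop can draw batches indefinitely
        for inputs, outputs in df.get_data():     # each item is the ([...], [...]) pair built by the final MapData
            yield inputs, outputs
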
def get_default_dataflow_batch(batchsize=32):
    ds = get_default_dataflow()
    ds = MapData(ds, data_to_segment_input)
    ds = BatchData(ds, batchsize)
    ds = MapDataComponent(ds, data_to_normalize01)
    ds = PrefetchData(ds, 10, 2)
    return ds

def get_dataflow_batch(path, is_train, batchsize, img_path=None):
    logger.info('dataflow img_path=%s' % img_path)
    ds = get_dataflow(path, is_train, img_path=img_path)
    ds = BatchData(ds, batchsize)
    if is_train:
        ds = PrefetchData(ds, batchsize * 2, 1)
    else:
        ds = PrefetchData(ds, batchsize * 2, 1)
    return ds

def get_dataflow_batch(path, clothe_class, is_train, batchsize, img_path=None):
    logger.info('dataflow img_path=%s' % img_path)
    ds = get_dataflow(path, is_train, clothe_class, img_path=img_path)
    ds = BatchData(ds, batchsize)
    if is_train:
        ds = PrefetchData(ds, 10, 2)
    else:
        ds = PrefetchData(ds, 50, 2)
    return ds

def get_dataflow_batch(path, is_train, batchsize, img_path=None):
    logger.info('dataflow img_path=%s' % img_path)
    ds = get_dataflow(path, is_train, img_path=img_path)
    ds = BatchData(ds, batchsize)
    # if is_train:
    #     ds = PrefetchData(ds, 10, 2)
    # else:
    #     ds = PrefetchData(ds, 50, 2)
    return ds

def get_data(name, data_dir, meta_dir, gpu_nums):
    isTrain = 'train' in name

    def imgread(ds):
        img, label = ds
        img = cv2.imread(img, cv2.IMREAD_COLOR)
        label = cv2.imread(label, cv2.IMREAD_GRAYSCALE)
        return img, label

    if isTrain:
        # ds = LMDBData('/data2/dataset/cityscapes/cityscapes_train.lmdb', shuffle=True)
        # ds = FakeData([[batch_size, CROP_HEIGHT, CROP_HEIGHT, 3], [batch_size, CROP_HEIGHT, CROP_HEIGHT, 1]], 5000, random=False, dtype='uint8')
        ds = PascalVOC12Files(data_dir, meta_dir, name, shuffle=True)
        ds = MultiThreadMapData(ds, 4, imgread, buffer_size=2)
        # ds = PrefetchDataZMQ(MapData(ds, ImageDecode), 1)  # image decode is heavy
        ds = MapData(ds, RandomResize)
    else:
        ds = PascalVOC12Files(data_dir, meta_dir, name, shuffle=False)
        ds = MultiThreadMapData(ds, 4, imgread, buffer_size=2)

    if isTrain:
        shape_aug = [
            RandomCropWithPadding(args.crop_size, IGNORE_LABEL),
            Flip(horiz=True),
        ]
        ds = AugmentImageComponents(ds, shape_aug, (0, 1), copy=False)

    def reduce_mean_rgb(ds):
        image, label = ds
        m = np.array([104, 116, 122])
        const_arr = np.resize(m, (1, 1, 3))  # per-channel mean, HWC
        image = image - const_arr
        return image, label

    def MxnetPrepare(ds):
        data, label = ds
        data = np.transpose(data, (0, 3, 1, 2))  # NHWC -> NCHW
        label = label[:, :, :, None]
        label = np.transpose(label, (0, 3, 1, 2))  # NHWC -> NCHW
        # multi-gpu distribute data, time-consuming!!!
        dl = [[mx.nd.array(data[args.batch_size * i:args.batch_size * (i + 1)])]
              for i in range(gpu_nums)]
        ll = [[mx.nd.array(label[args.batch_size * i:args.batch_size * (i + 1)])]
              for i in range(gpu_nums)]
        return dl, ll

    # ds = MapData(ds, reduce_mean_rgb)
    ds = MultiThreadMapData(ds, 4, reduce_mean_rgb, buffer_size=2)

    if isTrain:
        ds = FastBatchData(ds, args.batch_size * gpu_nums)
        ds = MapData(ds, MxnetPrepare)
        # ds = PrefetchDataZMQ(ds, 1)
    else:
        ds = BatchData(ds, 1)
    return ds

def get_hand_dataflow_batch(is_train, batchsize, img_path=None):
    logger.info('dataflow img_path=%s' % img_path)
    ds = get_hand_dataflow(is_train, img_path=img_path)
    ds = BatchData(ds, batchsize)
    if is_train:
        # ds = PrefetchDataZMQ(ds, 10, 2)
        ds = PrefetchDataZMQ(ds, 8)
    else:
        ds = PrefetchData(ds, 50, 2)
    return ds

def get_ilsvrc_data_alexnet(is_train, image_size, batchsize, directory):
    if is_train:
        if not directory.startswith('/'):
            ds = ILSVRCTTenthTrain(directory)
        else:
            ds = ILSVRC12(directory, 'train')
        augs = [
            imgaug.RandomApplyAug(imgaug.RandomResize((0.9, 1.2), (0.9, 1.2)), 0.7),
            imgaug.RandomApplyAug(imgaug.RotationAndCropValid(15), 0.7),
            imgaug.RandomApplyAug(imgaug.RandomChooseAug([
                imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01),
                imgaug.RandomOrderAug([
                    imgaug.BrightnessScale((0.8, 1.2), clip=False),
                    imgaug.Contrast((0.8, 1.2), clip=False),
                    # imgaug.Saturation(0.4, rgb=True),
                ]),
            ]), 0.7),
            imgaug.Flip(horiz=True),
            imgaug.ResizeShortestEdge(256, cv2.INTER_CUBIC),
            imgaug.RandomCrop((224, 224)),
        ]
        ds = AugmentImageComponent(ds, augs)
        ds = PrefetchData(ds, 1000, multiprocessing.cpu_count())
        ds = BatchData(ds, batchsize)
        ds = PrefetchData(ds, 10, 4)
    else:
        if not directory.startswith('/'):
            ds = ILSVRCTenthValid(directory)
        else:
            ds = ILSVRC12(directory, 'val')
        ds = AugmentImageComponent(ds, [
            imgaug.ResizeShortestEdge(224, cv2.INTER_CUBIC),
            imgaug.CenterCrop((224, 224)),
        ])
        ds = PrefetchData(ds, 100, multiprocessing.cpu_count())
        ds = BatchData(ds, batchsize)
    return ds

def batch_dataflow(df, batch_size):
    """
    The function builds batch dataflow from the input dataflow of samples

    :param df: dataflow of samples
    :param batch_size: batch size
    :return: dataflow of batches
    """
    df = BatchData(df, batch_size, use_list=False)
    df = MapData(df, lambda x: ([x[0]], [x[2]]))
    df.reset_state()
    return df

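# --- Hedged usage sketch (not from the source): iterating the batched dataflow above.
# FakeData stands in for a real sample dataflow with three components, so that x[0]
# and x[2] exist as the mapping expects; the shapes are arbitrary placeholders, and
# get_data() is the pre-1.0 tensorpack iteration API used throughout these examples.
from tensorpack.dataflow import FakeData

df = FakeData([[368, 368, 3], [46, 46, 38], [46, 46, 19]], size=32)
batch_df = batch_dataflow(df, batch_size=8)
for inputs, targets in batch_df.get_data():
    # inputs == [batched x[0]], targets == [batched x[2]]
    pass
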
def get_dataflow_batch(path, is_train, batchsize, img_path=None):
    logger.info('dataflow img_path=%s' % img_path)
    ds = get_dataflow(path, is_train, img_path=img_path)
    print("ds from get_dataflow", ds)
    ds = BatchData(ds, batchsize)
    print("ds from batchdata", ds)
    if is_train:
        ds = PrefetchData(ds, 10, 3)
        print("ds from prefetchdata", ds)
    else:
        ds = PrefetchData(ds, 50, 2)
    return ds

def batch_dataflow(df, batch_size, time_steps=4, num_stages=6, format=['heatpaf', 'last']):
    informat, outformat = format
    df = BatchData(df, batch_size, use_list=False)

    def in_heat(x):
        return [
            np.stack([x[0]] * time_steps, axis=1),
            np.stack([x[2]] * time_steps, axis=1)
        ]

    def in_heatpaf(x):
        return [
            np.stack([x[0]] * time_steps, axis=1),
            np.stack([x[1]] * time_steps, axis=1),
            np.stack([x[2]] * time_steps, axis=1)
        ]

    def out_heat_last(x):
        return [np.stack([x[4]] * time_steps, axis=1)] * num_stages

    def out_heatpaf_last(x):
        return [
            np.stack([x[3]] * time_steps, axis=1),
            np.stack([x[4]] * time_steps, axis=1),
            np.stack([x[3]] * time_steps, axis=1),
            np.stack([x[4]] * time_steps, axis=1),
            # TD layers end here
            x[3],  # TD layers are joined here by LSTM
            x[4],
            x[3],  # these last outputs collapse to one timestep output
            x[4],
            x[3],
            x[4],
            x[3],
            x[4],
        ]

    if informat == 'heat' and outformat == 'last':
        df = MapData(df, lambda x: (in_heat(x), out_heat_last(x)))
    elif informat == 'heatpaf' and outformat == 'last':
        df = MapData(df, lambda x: (in_heatpaf(x), out_heatpaf_last(x)))
    else:
        raise Exception('Unknown format requested: %s' % format)

    df.reset_state()
    return df

def get_dataflow_batch(path, is_train=True, batch_size=10, img_path=None, sigma=8.0,
                       output_shape=(1440, 2560), numparts=5, translation=False,
                       scale=False, rotation=True, mins=0.25, maxs=1.2,
                       mina=-np.pi, maxa=np.pi, ilumination=0.0, image_type='RGB'):
    logger.info('dataflow img_path=%s' % img_path)
    ds = get_dataflow(path, is_train, img_path=img_path, sigma=sigma, output_shape=output_shape,
                      translation=translation, scale=scale, rotation=rotation,
                      mins=mins, maxs=maxs, mina=mina, maxa=maxa,
                      ilumination=ilumination, image_type=image_type)
    ds = BatchData(ds, batch_size)
    # if is_train:
    ds = PrefetchData(ds, 10, 2)
    # else:
    #     ds = PrefetchData(ds, 50, 2)
    return ds

def get_infer_iterator(hparams, dataset, num_gpu, batch_size):
    df = DataFromList(dataset, shuffle=False)
    num_samples = len(df)
    if num_samples % batch_size != 0 and num_samples % batch_size < num_gpu:
        raise ValueError("num_samples %% batch_size < num_gpu")
    df = MapData(df, lambda data: map_func(hparams, data))
    batched_df = BatchData(df, batch_size=batch_size, remainder=True)
    splitted_df = MapData(
        batched_df,
        lambda x: [np.array_split(x[idx], num_gpu) for idx in range(len(x))])
    prefetched_df = PrefetchDataZMQ(splitted_df, nr_proc=1, hwm=batch_size * 10)
    return prefetched_df

def get_config():
    logger.set_logger_dir(LOG_DIR)
    M = Model()

    name_base = str(uuid.uuid1())[:6]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '.').rstrip('/')
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, M)
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)

    lr = tf.Variable(0.001, trainable=False, name='learning_rate')
    tf.scalar_summary('learning_rate', lr)

    return TrainConfig(
        dataset=dataflow,
        optimizer=tf.train.AdamOptimizer(lr, epsilon=1e-3),
        callbacks=Callbacks([
            StatPrinter(),
            PeriodicCallback(ModelSaver(), 5),
            ScheduledHyperParamSetter('learning_rate', [(80, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            ScheduledHyperParamSetter('explore_factor', [(80, 2), (100, 3), (120, 4), (140, 5)]),
            HumanHyperParamSetter('learning_rate'),
            HumanHyperParamSetter('entropy_beta'),
            HumanHyperParamSetter('explore_factor'),
            master,
            PeriodicCallback(Evaluator(EVAL_EPISODE, ['state'], ['logits'], policy_dist=POLICY_DIST), 5),
        ]),
        extra_threads_procs=[master],
        session_config=get_default_sess_config(0.5),
        model=M,
        step_per_epoch=STEP_PER_EPOCH,
        max_epoch=1000,
    )

def get_infer_iterator(dataset, hparams, lmdb_path):
    serialize_to_lmdb(dataset, hparams, lmdb_path)
    batch_size = hparams.infer_batch_size
    num_gpu = hparams.num_gpu

    df = LMDBSerializer.load(lmdb_path, shuffle=False)
    batched_df = BatchData(df, batch_size=batch_size, remainder=False)
    splitted_df = MapData(
        batched_df,
        lambda x: [np.array_split(x[idx], num_gpu) for idx in range(len(x))])
    prefetched_df = PrefetchDataZMQ(splitted_df, nr_proc=1, hwm=batch_size * 10)
    return prefetched_df

def get_iterator(hparams, dataset, lmdb_path, shuffle=True, drop_remainder=True, nr_proc=4):
    serialize_to_lmdb(hparams, dataset, lmdb_path)
    batch_size = hparams.batch_size
    num_gpu = hparams.num_gpu

    df = LMDBSerializer.load(lmdb_path, shuffle=shuffle)
    batched_df = BatchData(df, batch_size=batch_size, remainder=not drop_remainder)
    splitted_df = MapData(
        batched_df,
        lambda x: [np.array_split(x[idx], num_gpu) for idx in range(len(x))])
    prefetched_df = PrefetchDataZMQ(splitted_df, nr_proc=nr_proc, hwm=batch_size * 10)
    return prefetched_df

def __call__(self, n_prefetch=1000, n_thread=1):
    df = self
    df = BatchData(df, self.batch_size)
    df = PrefetchData(df, n_prefetch, n_thread)
    return df

def get_config(args=None, is_chief=True, task_index=0, chief_worker_hostname="", n_workers=1):
    logger.set_logger_dir(args.train_log_path + datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
                          + '_' + str(task_index))

    # function to split model parameters between multiple parameter servers
    ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
        len(cluster['ps']), tf.contrib.training.byte_size_load_fn)
    device_function = tf.train.replica_device_setter(
        worker_device='/job:worker/task:{}/cpu:0'.format(task_index),
        cluster=cluster_spec,
        ps_strategy=ps_strategy)

    M = Model(device_function)

    name_base = str(uuid.uuid1()).replace('-', '')[:16]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '.').rstrip('/')
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(args.simulator_procs)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    neptune_client = neptune_mp_server.Client(
        server_host=chief_worker_hostname, server_port=args.port)

    master = MySimulatorMaster(task_index, neptune_client, namec2s, names2c, M,
                               dummy=args.dummy,
                               predictor_threads=args.nr_predict_towers,
                               predict_batch_size=args.predict_batch_size,
                               do_train=args.do_train)

    # here's the data passed to the repeated data source
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)

    with tf.device(device_function):
        with tf.variable_scope(tf.get_variable_scope(), reuse=None):
            lr = tf.Variable(args.learning_rate, trainable=False, name='learning_rate')
            tf.summary.scalar('learning_rate', lr)

    intra_op_par = args.intra_op_par
    inter_op_par = args.inter_op_par

    session_config = get_default_sess_config(0.5)

    print("{} {}".format(intra_op_par, type(intra_op_par)))
    if intra_op_par is not None:
        session_config.intra_op_parallelism_threads = intra_op_par
    if inter_op_par is not None:
        session_config.inter_op_parallelism_threads = inter_op_par

    session_config.log_device_placement = False
    extra_arg = {
        'dummy_predictor': args.dummy_predictor,
        'intra_op_par': intra_op_par,
        'inter_op_par': inter_op_par,
        'max_steps': args.max_steps,
        'device_count': {'CPU': args.cpu_device_count},
        'threads_to_trace': args.threads_to_trace,
        'dummy': args.dummy,
        'cpu': args.cpu,
        'queue_size': args.queue_size,
        # 'worker_host': "grpc://localhost:{}".format(cluster['worker'][my_task_index].split(':')[1]),
        'worker_host': server.target,
        'is_chief': is_chief,
        'device_function': device_function,
        'n_workers': n_workers,
        'use_sync_opt': args.use_sync_opt,
        'port': args.port,
        'batch_size': BATCH_SIZE,
        'debug_charts': args.debug_charts,
        'adam_debug': args.adam_debug,
        'task_index': task_index,
        'lr': lr,
        'schedule_hyper': args.schedule_hyper,
        'experiment_dir': args.experiment_dir
    }

    print("\n\n worker host: {} \n\n".format(extra_arg['worker_host']))

    with tf.device(device_function):
        if args.optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(lr, epsilon=args.epsilon,
                                               beta1=args.beta1, beta2=args.beta2)
            if args.adam_debug:
                optimizer = MyAdamOptimizer(lr, epsilon=args.epsilon,
                                            beta1=args.beta1, beta2=args.beta2)
        elif args.optimizer == 'gd':
            optimizer = tf.train.GradientDescentOptimizer(lr)
        elif args.optimizer == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(lr)
        elif args.optimizer == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(lr, epsilon=1e-3)
        elif args.optimizer == 'momentum':
            optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)
        elif args.optimizer == 'rms':
            optimizer = tf.train.RMSPropOptimizer(lr)

        # wrap in SyncReplicasOptimizer
        if args.use_sync_opt == 1:
            if not args.adam_debug:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=args.num_grad,
                    total_num_replicas=n_workers)
            else:
                optimizer = MySyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=args.num_grad,
                    total_num_replicas=n_workers)
            extra_arg['hooks'] = optimizer.make_session_run_hook(is_chief)

    callbacks = [
        StatPrinter(),
        master,
        DebugLogCallback(neptune_client,
                         worker_id=task_index,
                         nr_send=args.send_debug_every,
                         debug_charts=args.debug_charts,
                         adam_debug=args.adam_debug,
                         schedule_hyper=args.schedule_hyper)
    ]

    if args.debug_charts:
        callbacks.append(
            HeartPulseCallback('heart_pulse_{}.log'.format(os.environ['SLURMD_NODENAME'])))

    if args.early_stopping is not None:
        args.early_stopping = float(args.early_stopping)
        if my_task_index == 1 and not args.eval_node:
            # only one worker does evaluation
            callbacks.append(
                PeriodicCallback(
                    Evaluator(EVAL_EPISODE, ['state'], ['logits'], neptune_client,
                              worker_id=task_index,
                              solved_score=args.early_stopping), 2))
    elif my_task_index == 1 and not args.eval_node:
        # only 1 worker does evaluation
        callbacks.append(
            PeriodicCallback(
                Evaluator(EVAL_EPISODE, ['state'], ['logits'], neptune_client,
                          worker_id=task_index), 2))

    if args.save_every != 0:
        callbacks.append(
            PeriodicPerStepCallback(
                ModelSaver(var_collections=M.vars_for_save, models_dir=args.models_dir),
                args.save_every))

    if args.schedule_hyper and my_task_index == 2:
        callbacks.append(HyperParameterScheduler('learning_rate', [(20, 0.0005), (60, 0.0001)]))
        callbacks.append(HyperParameterScheduler('entropy_beta', [(40, 0.005), (80, 0.001)]))

    return TrainConfig(dataset=dataflow,
                       optimizer=optimizer,
                       callbacks=Callbacks(callbacks),
                       extra_threads_procs=[master],
                       session_config=session_config,
                       model=M,
                       step_per_epoch=STEP_PER_EPOCH,
                       max_epoch=args.max_epoch,
                       extra_arg=extra_arg)

def dataflow(self, nr_prefetch=1000, nr_thread=1):
    ds = self
    ds = BatchData(ds, self.batch_size)
    ds = PrefetchData(ds, nr_prefetch, nr_thread)
    return ds

def get_dataflow_batch(path, is_train, batchsize):
    ds = get_dataflow(path, is_train)
    ds = BatchData(ds, batchsize)
    ds = PrefetchData(ds, 10, 2)
    return ds

def __call__(self, n_prefetch=1, n_thread=1):
    df = self
    df = BatchData(df, 1)
    df = PrefetchData(df, n_prefetch, n_thread)
    return df