def __new__(cls, root, transform=None, filter=None):
    """Build a prefetched ``tf.data`` pipeline over the TFRecord shards in ``root``.

    Args:
        root: Directory searched for files matching ``*-*-of-*``.
        transform: Optional callable mapped over every record.  When given
            it must expose a ``transform_list`` attribute; a
            ``ParseDecodeImagenet`` step is placed in front of that list.
            When ``None``, ``ParseDecodeImagenet`` alone is applied.
        filter: Accepted for interface compatibility; not used here.

    Returns:
        The constructed dataset object (not an instance of ``cls``).

    Raises:
        ValueError: If no file under ``root`` matches the shard pattern.
    """
    import copy

    from tensorflow.python.platform import gfile  # pylint: disable=no-name-in-module
    glob_pattern = os.path.join(root, '*-*-of-*')
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        raise ValueError(
            'Found no files in --root matching: {}'.format(glob_pattern))

    # pylint: disable=no-name-in-module
    from tensorflow.python.data.experimental import parallel_interleave
    from lpot.experimental.data.transforms.imagenet_transform import ParseDecodeImagenet
    ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=False)
    ds = ds.apply(
        parallel_interleave(tf.data.TFRecordDataset,
                            cycle_length=len(file_names)))

    if transform is not None:
        # Work on a shallow copy with a fresh list: inserting into the
        # caller's own ``transform_list`` would stack one more parse step
        # every time a dataset is built from the same transform object.
        transform = copy.copy(transform)
        transform.transform_list = (
            [ParseDecodeImagenet()] + list(transform.transform_list))
    else:
        transform = ParseDecodeImagenet()

    ds = ds.map(transform, num_parallel_calls=None)
    ds = ds.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)  # this number can be tuned
    return ds
def __new__(cls, root, num_cores=28, transform=None, filter=None):
    """Build a ``tf.data`` pipeline over a detection TFRecord file.

    The first record of ``root`` is parsed up front to check that it carries
    object class annotations before the pipeline is assembled.

    Args:
        root: Path (or file pattern) of the TFRecord data.
        num_cores: ``cycle_length`` used for the parallel interleave.
        transform: Optional callable mapped over every record.
        filter: Optional predicate passed to ``Dataset.filter``.  The
            default is ``None``; it used to be the name ``filter`` itself,
            which resolves to the builtin unless the module rebinds it, so
            omitting the argument always installed an invalid predicate.

    Returns:
        The constructed dataset object (not an instance of ``cls``).

    Raises:
        ValueError: If the first record has neither
            ``image/object/class/text`` nor ``image/object/class/label``.
    """
    record_iterator = tf.compat.v1.python_io.tf_record_iterator(root)
    example = tf.train.SequenceExample()
    # Only the first record is inspected for the format check.
    for element in record_iterator:
        example.ParseFromString(element)
        break
    feature = example.context.feature
    has_text = len(feature['image/object/class/text'].bytes_list.value) != 0
    has_label = len(feature['image/object/class/label'].int64_list.value) != 0
    if not has_text and not has_label:
        # Adjacent literals keep the URL intact; backslash continuations
        # inside one literal would embed the source indentation in it.
        raise ValueError(
            "Tfrecord format is incorrect, please refer "
            "'https://github.com/tensorflow/models/blob/master/research/"
            "object_detection/dataset_tools/create_coco_tf_record.py' to "
            "create correct tfrecord")

    # pylint: disable=no-name-in-module
    from tensorflow.python.data.experimental import parallel_interleave
    tfrecord_paths = [root]
    ds = tf.data.TFRecordDataset.list_files(tfrecord_paths)
    ds = ds.apply(
        parallel_interleave(tf.data.TFRecordDataset,
                            cycle_length=num_cores,
                            block_length=5,
                            sloppy=True,
                            buffer_output_elements=10000,
                            prefetch_input_elements=10000))
    if transform is not None:
        ds = ds.map(transform, num_parallel_calls=None)
    if filter is not None:
        ds = ds.filter(filter)
    ds = ds.prefetch(buffer_size=1000)
    return ds
def __new__(cls, root, subset='validation', num_cores=28, transform=None,
            filter=None):
    """Build a prefetched ``tf.data`` pipeline over one subset's TFRecord shards.

    Args:
        root: Directory searched for files matching ``<subset>-*-of-*``.
        subset: Either ``'validation'`` or ``'train'``.
        num_cores: ``cycle_length`` used for the parallel interleave.
        transform: Optional callable mapped over every record.  When
            ``None`` the records are passed through unmapped.
        filter: Accepted for interface compatibility; not used here.

    Returns:
        The constructed dataset object (not an instance of ``cls``).

    Raises:
        AssertionError: If ``subset`` is not one of the supported names.
        ValueError: If no file under ``root`` matches the shard pattern.
    """
    assert subset in ('validation', 'train'), \
        'only support subset (validation, train)'

    from tensorflow.python.platform import gfile
    glob_pattern = os.path.join(root, '%s-*-of-*' % subset)
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        raise ValueError(
            'Found no files in --root matching: {}'.format(glob_pattern))

    from tensorflow.python.data.experimental import parallel_interleave
    ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=False)
    ds = ds.apply(
        parallel_interleave(tf.data.TFRecordDataset, cycle_length=num_cores))
    # ``transform`` defaults to None, and mapping None over the dataset
    # fails, so only map when a transform was actually supplied.
    if transform is not None:
        ds = ds.map(transform, num_parallel_calls=None)
    ds = ds.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)  # this number can be tuned
    return ds
def __new__(cls, root, subset='validation', num_cores=28, transform=None,
            filter=None):
    """Build a parsed, prefetched pipeline over one subset's TFRecord shards.

    Logs a deprecation warning pointing at ``ImageRecord`` on every call.

    Args:
        root: Directory searched for files matching ``<subset>-*-of-*``.
        subset: Either ``'validation'`` or ``'train'``.
        num_cores: ``cycle_length`` used for the parallel interleave.
        transform: Optional callable mapped over every record.  When given
            it must expose a ``transform_list`` attribute; a
            ``ParseDecodeImagenet`` step is placed in front of that list.
            When ``None``, ``ParseDecodeImagenet`` alone is applied.
        filter: Accepted for interface compatibility; not used here.

    Returns:
        The constructed dataset object (not an instance of ``cls``).

    Raises:
        AssertionError: If ``subset`` is not one of the supported names.
        ValueError: If no file under ``root`` matches the shard pattern.
    """
    import copy

    assert subset in ('validation', 'train'), \
        'only support subset (validation, train)'
    logger.warning('This api is going to be deprecated, '
                   'please use ImageRecord instead')

    from tensorflow.python.platform import gfile
    glob_pattern = os.path.join(root, '%s-*-of-*' % subset)
    file_names = gfile.Glob(glob_pattern)
    if not file_names:
        raise ValueError(
            'Found no files in --root matching: {}'.format(glob_pattern))

    from tensorflow.python.data.experimental import parallel_interleave
    from lpot.experimental.data.transforms.imagenet_transform import ParseDecodeImagenet
    ds = tf.data.TFRecordDataset.list_files(file_names, shuffle=False)
    ds = ds.apply(
        parallel_interleave(tf.data.TFRecordDataset, cycle_length=num_cores))

    if transform is not None:
        # Shallow-copy before prepending so the caller's transform object
        # is left untouched and can be reused for another dataset.
        transform = copy.copy(transform)
        transform.transform_list = (
            [ParseDecodeImagenet()] + list(transform.transform_list))
    else:
        transform = ParseDecodeImagenet()

    ds = ds.map(transform, num_parallel_calls=None)
    ds = ds.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)  # this number can be tuned
    return ds
def accuracy_check(self):
    """Queue up ``COCO_NUM_VAL_IMAGES`` input fetches and run them one by one.

    Builds the data session, assembles a TFRecord pipeline from
    ``self.args.data_location``, collects one ``get_next()`` tuple per
    iteration into a list, then hands that list to
    ``run_ice_breaker_session`` once per iteration index.

    Raises:
        Exception: If ``self.args.data_location`` is not set.
    """
    print("Inference for accuracy check.")
    total_iter = COCO_NUM_VAL_IMAGES
    break_session = 251
    # Invert category_map: its values become keys and its keys the values.
    fm = dict(zip(list(category_map.values()), list(category_map.keys())))
    print('total iteration is {0}'.format(str(total_iter)))
    global model, graph
    result = []
    with tf.Session().as_default() as sess:
        if not self.args.data_location:
            raise Exception("no data location provided")
        self.build_data_sess(need_reshape=True)

        evaluator = CocoDetectionEvaluator()
        self.coord = tf.train.Coordinator()

        # File list -> interleaved records -> parsed batches.
        record_files = tf.data.TFRecordDataset.list_files(
            [self.args.data_location])
        interleave = parallel_interleave(tf.data.TFRecordDataset,
                                         cycle_length=28,
                                         block_length=5,
                                         buffer_output_elements=10000,
                                         prefetch_input_elements=10000)
        records = record_files.apply(interleave).prefetch(buffer_size=10000)
        batcher = map_and_batch(map_func=parse_and_preprocess,
                                batch_size=self.args.batch_size,
                                num_parallel_batches=1,
                                num_parallel_calls=None)
        batches = records.apply(batcher).prefetch(buffer_size=10000)
        ds_iterator = tf.data.make_one_shot_iterator(batches)

        self.ground_truth_dicts = {}
        self.detect_dicts = {}
        self.total_iter = total_iter
        self.image_id_gt_dict = {}
        self.weights = np.arange(1000)

        # data_location is known to be set here (checked above).
        for _ in tqdm(range(total_iter)):
            bbox, label, image_id, features = ds_iterator.get_next()
            result.append((bbox, label, image_id, features))
        for idx in tqdm(range(total_iter)):
            run_ice_breaker_session(result, self, fm, sess, total_iter,
                                    break_session, idx)
def __new__(cls, root, transform=None, filter=None):
    """Build a prefetched ``tf.data`` pipeline over the files matching ``root``.

    Args:
        root: Glob pattern handed to ``gfile.Glob`` to locate TFRecord files.
        transform: Optional callable mapped over every record.
        filter: Accepted for interface compatibility; not used here.

    Returns:
        The constructed dataset object (not an instance of ``cls``).

    Raises:
        ValueError: If ``root`` matches no file.
    """
    # pylint: disable=no-name-in-module
    from tensorflow.python.data.experimental import parallel_interleave
    from tensorflow.python.platform import gfile
    file_names = gfile.Glob(root)
    if not file_names:
        # Fail early with a clear message: an empty match would otherwise
        # reach the interleave below with cycle_length=0.
        raise ValueError(
            'Found no files in --root matching: {}'.format(root))

    ds = tf.data.Dataset.from_tensor_slices(file_names)
    ds = ds.apply(
        parallel_interleave(tf.data.TFRecordDataset,
                            cycle_length=len(file_names)))
    if transform is not None:
        ds = ds.map(transform, num_parallel_calls=None)
    ds = ds.prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)  # this number can be tuned
    return ds
def run_inference(self, params):
    """Queue up ``COCO_NUM_VAL_IMAGES`` input fetches and run them one by one.

    Assembles a TFRecord pipeline from ``self.args.data_location`` inside a
    fresh session/graph, collects one ``get_next()`` tuple per iteration,
    then calls ``run_ice_breaker_session`` once per iteration index.

    Args:
        params: Forwarded unchanged to ``run_ice_breaker_session``.
    """
    self.logfile.info("Inference for accuracy check.")
    total_iter = COCO_NUM_VAL_IMAGES
    # Invert category_map: its values become keys and its keys the values.
    fm = dict(zip(list(category_map.values()), list(category_map.keys())))
    self.logfile.info('total iteration is {0}'.format(str(total_iter)))
    result = []
    global model, graph
    # ``graph`` is the module-level global declared above.
    with tf.compat.v1.Session().as_default() as sess, \
            sess.graph.as_default() as graph:
        evaluator = CocoDetectionEvaluator()
        self.coord = tf.train.Coordinator()

        # File list -> interleaved records -> parsed batches.
        record_files = tf.data.TFRecordDataset.list_files(
            [self.args.data_location])
        interleave = parallel_interleave(tf.data.TFRecordDataset,
                                         cycle_length=1,
                                         block_length=1,
                                         buffer_output_elements=10000,
                                         prefetch_input_elements=10000)
        records = record_files.apply(interleave).prefetch(buffer_size=10000)
        batcher = map_and_batch(map_func=parse_and_preprocess,
                                batch_size=self.args.batch_size,
                                num_parallel_batches=1,
                                num_parallel_calls=None)
        batches = records.apply(batcher).prefetch(buffer_size=10000)
        ds_iterator = tf.compat.v1.data.make_one_shot_iterator(batches)

        self.ground_truth_dicts = {}
        self.detect_dicts = {}
        self.total_iter = total_iter
        self.image_id_gt_dict = {}

        if self.args.data_location:
            for _ in range(total_iter):
                bbox, label, image_id, features = ds_iterator.get_next()
                result.append((bbox, label, image_id, features))
            for idx in range(total_iter):
                run_ice_breaker_session(result, self, params, fm, sess,
                                        total_iter, idx)
def minibatch(self, dataset, subset, cache_data=False):
    """Return the ``(images, labels)`` tensors for one batch of ``subset``.

    Args:
        dataset: Object whose ``tf_record_pattern(subset)`` yields the glob
            pattern of the TFRecord files.
        subset: Subset name forwarded to ``dataset.tf_record_pattern``.
        cache_data: When true, only the first record is kept, cached and
            repeated indefinitely.

    Returns:
        ``(images, labels)`` with ``labels`` reshaped to ``[batch_size]``.

    Raises:
        ValueError: If the pattern matches no file.
    """
    with tf.compat.v1.name_scope('batch_processing'):
        pattern = dataset.tf_record_pattern(subset)
        shards = gfile.Glob(pattern)
        if not shards:
            raise ValueError(
                'Found no files in --data_dir matching: {}'.format(pattern))

        shard_ds = tf.data.TFRecordDataset.list_files(shards)
        # number of parallel open files and tfrecords should be tuned
        # according to different batch size
        interleave = parallel_interleave(tf.data.TFRecordDataset,
                                         cycle_length=28,
                                         block_length=5,
                                         sloppy=True,
                                         buffer_output_elements=10000,
                                         prefetch_input_elements=10000)
        records = shard_ds.apply(interleave)
        if cache_data:
            records = records.take(1).cache().repeat()
        records = records.prefetch(buffer_size=10000)

        # this number should be tuned
        batcher = map_and_batch(map_func=self.parse_and_preprocess,
                                batch_size=self.batch_size,
                                num_parallel_batches=56,
                                num_parallel_calls=None)
        # this number can be tuned
        batches = records.apply(batcher).prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)

        images, labels = tf.compat.v1.data.make_one_shot_iterator(
            batches).get_next()
        # reshape
        return images, tf.reshape(labels, [self.batch_size])
def minibatch(self, dataset, subset, cache_data=False):
    """Return the ``(images, labels, filename)`` tensors for one batch.

    Args:
        dataset: Object whose ``tf_record_pattern(subset)`` yields the glob
            pattern of the TFRecord files.
        subset: Subset name forwarded to ``dataset.tf_record_pattern``.
        cache_data: When true, only the first record is kept, cached and
            repeated indefinitely.

    Returns:
        ``(images, labels, filename)`` where ``labels`` and ``filename``
        are reshaped to ``[batch_size]``.

    Raises:
        ValueError: If the pattern matches no file.
    """
    with tf.name_scope('batch_processing'):
        pattern = dataset.tf_record_pattern(subset)
        shards = gfile.Glob(pattern)
        if not shards:
            raise ValueError(
                'Found no files in --data_dir matching: {}'.format(pattern))

        shard_ds = tf.data.TFRecordDataset.list_files(shards)
        interleave = parallel_interleave(tf.data.TFRecordDataset,
                                         cycle_length=self.num_cores,
                                         block_length=5,
                                         sloppy=True,
                                         buffer_output_elements=10000,
                                         prefetch_input_elements=10000)
        records = shard_ds.apply(interleave)
        if cache_data:
            records = records.take(1).cache().repeat()
        records = records.prefetch(buffer_size=10000)

        # num of parallel batches not greater than 56
        max_num_parallel_batches = min(56, 2 * self.num_cores)
        batcher = map_and_batch(map_func=self.parse_and_preprocess,
                                batch_size=self.batch_size,
                                num_parallel_batches=max_num_parallel_batches,
                                num_parallel_calls=None)
        batches = records.apply(batcher).prefetch(
            buffer_size=tf.contrib.data.AUTOTUNE)

        images, labels, filename = batches.make_one_shot_iterator().get_next()
        # reshape
        flat_shape = [self.batch_size]
        labels = tf.reshape(labels, flat_shape)
        filename = tf.reshape(filename, flat_shape)
        return images, labels, filename
def get_input(self):
    """Return the ``(images, bbox, label, image_id)`` tensors for one batch.

    The pipeline reads the TFRecord data at ``self.args.data_location``,
    interleaves the records, and parses/batches them with
    ``parse_and_preprocess`` at ``self.args.batch_size``.
    """
    record_files = tf.data.TFRecordDataset.list_files(
        [self.args.data_location])
    interleave = parallel_interleave(tf.data.TFRecordDataset,
                                     cycle_length=28,
                                     block_length=5,
                                     sloppy=True,
                                     buffer_output_elements=10000,
                                     prefetch_input_elements=10000)
    records = record_files.apply(interleave).prefetch(buffer_size=10000)

    batcher = map_and_batch(map_func=parse_and_preprocess,
                            batch_size=self.args.batch_size,
                            num_parallel_batches=28,
                            num_parallel_calls=None)
    batches = records.apply(batcher).prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

    images, bbox, label, image_id = tf.compat.v1.data.make_one_shot_iterator(
        batches).get_next()
    return images, bbox, label, image_id
def minibatch(self, dataset, subset, cache_data=False):
    """Return the ``(images, labels)`` tensors for one batch of ``subset``.

    Files are listed without shuffling and interleaved across
    ``self.num_cores`` readers.

    Args:
        dataset: Object whose ``tf_record_pattern(subset)`` yields the glob
            pattern of the TFRecord files.
        subset: Subset name forwarded to ``dataset.tf_record_pattern``.
        cache_data: When true, only the first record is kept, cached and
            repeated indefinitely.

    Returns:
        ``(images, labels)`` with ``labels`` reshaped to ``[batch_size]``.

    Raises:
        ValueError: If the pattern matches no file.
    """
    with tf.compat.v1.name_scope('batch_processing'):
        pattern = dataset.tf_record_pattern(subset)
        shards = gfile.Glob(pattern)
        if not shards:
            raise ValueError(
                'Found no files in --data_dir matching: {}'.format(pattern))

        shard_ds = tf.data.TFRecordDataset.list_files(shards, shuffle=False)
        records = shard_ds.apply(
            parallel_interleave(tf.data.TFRecordDataset,
                                cycle_length=self.num_cores))
        if cache_data:
            records = records.take(1).cache().repeat()
        records = records.prefetch(buffer_size=self.batch_size)

        # num of parallel batches not greater than 56
        max_num_parallel_batches = min(56, 2 * self.num_cores)
        batcher = map_and_batch(map_func=self.parse_and_preprocess,
                                batch_size=self.batch_size,
                                num_parallel_batches=max_num_parallel_batches,
                                num_parallel_calls=None)
        # this number can be tuned
        batches = records.apply(batcher).prefetch(
            buffer_size=tf.data.experimental.AUTOTUNE)

        images, labels = tf.compat.v1.data.make_one_shot_iterator(
            batches).get_next()
        # reshape
        return images, tf.reshape(labels, [self.batch_size])
def minibatch(self, dataset, subset, use_datasets, cache_data,
              shift_ratio=-1):
    """Return per-split lists of image and label tensors for one batch.

    Args:
        dataset: Object whose ``tf_record_pattern(subset)`` yields the glob
            pattern of the TFRecord files.
        subset: Subset name forwarded to ``dataset.tf_record_pattern``.
        use_datasets: When true, read through a ``tf.data`` pipeline;
            otherwise through ``data_flow_ops.RecordInput``.
        cache_data: With ``use_datasets``, keep only the first record,
            cached and repeated.
        shift_ratio: Passed to ``RecordInput``; a negative value selects
            ``self.shift_ratio``.

    Returns:
        ``(images, labels)``: two lists of length ``self.num_splits``.
        Each image tensor is cast to ``self.dtype`` and reshaped to
        ``[batch_size_per_split, height, width, 3]``; each label tensor is
        reshaped to ``[batch_size_per_split]``.

    Raises:
        ValueError: If ``use_datasets`` is set and no file matches.
    """
    if shift_ratio < 0:
        shift_ratio = self.shift_ratio
    with tf.compat.v1.name_scope('batch_processing'):
        # Build final results per split.
        images = [[] for _ in range(self.num_splits)]
        labels = [[] for _ in range(self.num_splits)]
        if use_datasets:
            glob_pattern = dataset.tf_record_pattern(subset)
            file_names = gfile.Glob(glob_pattern)
            if not file_names:
                raise ValueError(
                    'Found no files in --data_dir matching: {}'.format(
                        glob_pattern))
            ds = tf.data.TFRecordDataset.list_files(file_names)
            ds = ds.apply(
                parallel_interleave(tf.data.TFRecordDataset,
                                    cycle_length=10))
            if cache_data:
                ds = ds.take(1).cache().repeat()
            # Pair every record with a repeating 0..batch_size-1 counter;
            # parse_and_preprocess receives both values.
            counter = tf.data.Dataset.range(self.batch_size)
            counter = counter.repeat()
            ds = tf.data.Dataset.zip((ds, counter))
            ds = ds.prefetch(buffer_size=self.batch_size)
            ds = ds.shuffle(buffer_size=10000)
            ds = ds.repeat()
            ds = ds.apply(
                map_and_batch(map_func=self.parse_and_preprocess,
                              batch_size=self.batch_size_per_split,
                              num_parallel_batches=self.num_splits))
            ds = ds.prefetch(buffer_size=self.num_splits)
            ds_iterator = tf.compat.v1.data.make_one_shot_iterator(ds)
            # ``range`` instead of ``xrange`` so this also runs on
            # Python 3 without a compatibility shim.
            for d in range(self.num_splits):
                labels[d], images[d] = ds_iterator.get_next()
        else:
            record_input = data_flow_ops.RecordInput(
                file_pattern=dataset.tf_record_pattern(subset),
                seed=301,
                parallelism=64,
                buffer_size=10000,
                batch_size=self.batch_size,
                shift_ratio=shift_ratio,
                name='record_input')
            records = record_input.get_yield_op()
            records = tf.split(records, self.batch_size, 0)
            records = [tf.reshape(record, []) for record in records]
            for idx in range(self.batch_size):
                (label, image) = self.parse_and_preprocess(records[idx], idx)
                # Records are dealt round-robin across the splits.
                split_index = idx % self.num_splits
                labels[split_index].append(label)
                images[split_index].append(image)

        depth = 3
        for split_index in range(self.num_splits):
            if not use_datasets:
                # The RecordInput path collected Python lists of tensors;
                # merge each list into a single tensor first.
                images[split_index] = tf.parallel_stack(images[split_index])
                labels[split_index] = tf.concat(labels[split_index], 0)
            images[split_index] = tf.cast(images[split_index], self.dtype)
            images[split_index] = tf.reshape(
                images[split_index],
                shape=[self.batch_size_per_split, self.height, self.width,
                       depth])
            labels[split_index] = tf.reshape(labels[split_index],
                                             [self.batch_size_per_split])
        return images, labels
def run_benchmark(self):
    """Time image loading, box preprocessing and ArcFace prediction per batch.

    Reads up to 1000 batches from the TFRecord data at
    ``self.args.data_location``, loads the referenced images from
    ``self.args.imagesets_dir``, runs ``self.arcface_model.predict`` on the
    preprocessed crops and prints timing statistics.

    Raises:
        Exception: If ``self.args.data_location`` is not set.
    """
    def _as_text(value):
        # Values fetched from the session may be bytes; decode only those.
        return value if isinstance(value, str) else value.decode('utf-8')

    if self.args.data_location:
        print("Inference with real data.")
    else:
        print("Inference with dummy data.")
    global model, graph
    with tf.Session().as_default() as sess:
        with sess.graph.as_default() as graph:
            if self.args.data_location:
                self.build_data_sess()
            else:
                raise Exception("no data location provided")

            total_iter = 1000
            warmup_iter = 0
            ttime = 0.0
            print('total iteration is {0}'.format(str(total_iter)))
            print('warm up iteration is {0}'.format(str(warmup_iter)))
            total_samples = 0
            label = []

            tfrecord_paths = [self.args.data_location]
            ds = tf.data.TFRecordDataset.list_files(tfrecord_paths)
            ds = ds.apply(
                parallel_interleave(tf.data.TFRecordDataset,
                                    cycle_length=1,
                                    block_length=1,
                                    buffer_output_elements=10000,
                                    prefetch_input_elements=10000))
            ds = ds.prefetch(buffer_size=10000)
            ds = ds.apply(
                map_and_batch(map_func=parse_and_preprocess,
                              batch_size=self.args.batch_size,
                              num_parallel_batches=1,
                              num_parallel_calls=None))
            ds = ds.prefetch(buffer_size=10000)
            ds_iterator = tf.data.make_one_shot_iterator(ds)

            # Create the fetch ops once; calling get_next() inside the loop
            # would add new ops to the graph on every step.
            box_op, label_op, image_source_op, _ = ds_iterator.get_next()

            for step in range(total_iter):
                try:
                    # One run() call so all three values come from the SAME
                    # element; evaluating them separately advances the
                    # iterator between fetches.
                    box, label, image_source = sess.run(
                        [box_op, label_op, image_source_op])
                except tf.errors.OutOfRangeError:
                    # Input exhausted before total_iter steps.
                    break
                label = [_as_text(x) for x in label.flatten()]
                image_source = [
                    _as_text(image_id) for image_id in image_source.flatten()
                ]

                start_time = time.time()
                images = [
                    np.asarray(
                        PIL.Image.open(
                            os.path.join(self.args.imagesets_dir,
                                         image_id)).convert('RGB'))
                    for image_id in image_source
                ]
                images = self.preprocess_bounding_box_images(
                    images, box, image_source)
                total_samples += images.shape[0]
                arcface_features = self.arcface_model.predict(
                    np.vstack(images), verbose=1)
                duration = time.time() - start_time

                if (step + 1) % 10 == 0:
                    print('steps = {0}, {1} sec'.format(
                        str(step), str(duration)))
                if step + 1 > warmup_iter:
                    ttime += duration

            print('Batchsize: {0}'.format(str(self.args.batch_size)))
            # NOTE(review): the labels and formulas below are kept as they
            # were; "per BATCH" divides by the sample count and the
            # throughput multiplies the sample count by batch size - confirm
            # the intended metrics.
            if total_samples and ttime > 0:
                print('Time spent per BATCH: {0:10.4f} ms'.format(
                    ttime / total_samples * 1000))
                print('Total samples/sec: {0:10.4f} samples/s'.format(
                    total_samples * self.args.batch_size / ttime))
            # Counts "person" labels in the last batch that was read.
            print('Total labeled samples: {0} person'.format(
                int(np.count_nonzero(np.array(label) == "person"))))
def input_fn(params):
    """Build the batched input dataset; batch size comes from ``params``.

    Relies on names from the enclosing scope (``input_files``,
    ``is_training``, ``balance``, ``num_cpu_threads`` and the
    ``_decode_record`` arguments).
    """
    batch_size = params["batch_size"]
    weights = tf.constant([
        2.07500868, 1.03715683, 1.93284648, 1.21630542, 1.13895236,
        1.35432071, 1.09689163, 1.6204935, 1.16378624, 1.18903464, 1.,
        1.56280185, 1.35888667, 1.28946654, 1.37520308, 1.20126476,
        1.1674391, 1.22790734, 1.10949764, 2.07500868, 1.49259834,
        2.07500868
    ])
    tf.logging.info("Using {} threads".format(num_cpu_threads))

    # `cycle_length` is the number of parallel files that get read.
    cycle_length = len(input_files)

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.
    if is_training:
        tf.logging.info("There are {} files for training".format(
            len(input_files)))
        upsampling_factor = [get_upsampling_factor(fn) for fn in input_files]
        tf.logging.info(upsampling_factor)
        file_and_factor = (tf.constant(input_files),
                           tf.constant(upsampling_factor))
        d = tf.data.Dataset.from_tensor_slices(file_and_factor)
        d = d.shuffle(buffer_size=len(input_files))

        # Repeat data in the file for unlimited number. This solves class
        # imbalance problem.
        def get_tfrecord_dataset(filename, upsampling_factor):
            records = tf.data.TFRecordDataset(filename,
                                              compression_type='GZIP')
            if not balance:
                return records
            return records.repeat(tf.cast(upsampling_factor, dtype=tf.int64))

        # `sloppy` mode means that the interleaving is not exact. This adds
        # even more randomness to the training pipeline.
        d = d.apply(
            parallel_interleave(get_tfrecord_dataset,
                                sloppy=is_training,
                                cycle_length=cycle_length))
        d = d.shuffle(buffer_size=200000)
        d = d.repeat()
    else:
        def read_gzip_records(filename):
            return tf.data.TFRecordDataset(filename, compression_type='GZIP')

        d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
        d = d.apply(
            parallel_interleave(read_gzip_records,
                                sloppy=is_training,
                                cycle_length=cycle_length))
        # Since we evaluate for a fixed number of steps we don't want to
        # encounter out-of-range exceptions.
        d = d.shuffle(buffer_size=10000)

    # We must `drop_remainder` on training because the TPU requires fixed
    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
    # and we *don't* want to drop the remainder, otherwise we wont cover
    # every sample.
    def decode(record):
        return _decode_record(record, max_seq_length,
                              max_predictions_per_seq, vocab_size,
                              is_training, weights)

    batcher = map_and_batch(decode,
                            batch_size=batch_size,
                            num_parallel_batches=num_cpu_threads,
                            drop_remainder=True)
    d = d.apply(batcher).prefetch(batch_size)
    return d
def accuracy_check(self):
    """Collect ground-truth and detection dictionaries for accuracy scoring.

    For each step one batch is read from the TFRecord data at
    ``self.args.data_location``; the first image of the batch is loaded
    from ``self.args.imagesets_dir``, passed through ``self.face_detector``
    and, when boxes survive filtering, through ``self.arcface_model``.
    Results are stored per step in ``self.ground_truth_dicts``,
    ``self.image_id_gt_dict`` and ``self.detect_dicts``.

    Raises:
        Exception: If ``self.args.data_location`` is not set.
    """
    def _as_text(value):
        # Values fetched from the session may be bytes; decode only those.
        return value if isinstance(value, str) else value.decode('utf-8')

    print("Inference for accuracy check.")
    total_iter = COCO_NUM_VAL_IMAGES
    # Invert category_map: its values become keys and its keys the values.
    fm = dict(zip(list(category_map.values()), list(category_map.keys())))
    print('total iteration is {0}'.format(str(total_iter)))
    global model, graph
    with tf.Session().as_default() as sess:
        if self.args.data_location:
            network = self.build_data_sess()
        else:
            raise Exception("no data location provided")

        self.coord = tf.train.Coordinator()
        tfrecord_paths = [self.args.data_location]
        ds = tf.data.TFRecordDataset.list_files(tfrecord_paths)
        ds = ds.apply(
            parallel_interleave(tf.data.TFRecordDataset,
                                cycle_length=1,
                                block_length=1,
                                buffer_output_elements=10000,
                                prefetch_input_elements=10000))
        ds = ds.prefetch(buffer_size=10000)
        ds = ds.apply(
            map_and_batch(map_func=parse_and_preprocess,
                          batch_size=self.args.batch_size,
                          num_parallel_batches=1,
                          num_parallel_calls=None))
        ds = ds.prefetch(buffer_size=10000)
        ds_iterator = tf.data.make_one_shot_iterator(ds)

        self.ground_truth_dicts = {}
        self.detect_dicts = {}
        self.total_iter = total_iter
        self.image_id_gt_dict = {}

        # Create the fetch ops once; calling get_next() inside the loop
        # would add new ops to the graph on every step.
        bbox_op, label_op, image_id_op, _ = ds_iterator.get_next()

        for step in range(total_iter):
            bbox, label, image_id = sess.run(
                [bbox_op, label_op, image_id_op])

            # ground truth of bounding boxes from pascal voc
            ground_truth = {'boxes': np.asarray(bbox[0])}
            label_gt = [fm[_as_text(l)] for l in label]
            image_id_gt = [_as_text(i) for i in image_id]
            ground_truth['classes'] = np.array(
                label_gt * len(ground_truth['boxes']))

            # saving all ground truth dictionaries
            self.ground_truth_dicts[step] = ground_truth
            self.image_id_gt_dict[step] = image_id_gt[0]

            image_path = os.path.join(self.args.imagesets_dir, image_id_gt[0])
            images = np.asarray(PIL.Image.open(image_path).convert('RGB'))
            # face detection
            boxes, confidences, classIds = self.face_detector(
                images, image_id_gt, self.args.batch_size)
            # Reload as grayscale for the recognition stage.
            images = np.asarray(PIL.Image.open(image_path).convert('L'))
            boxes = self.filter_conventional_box_images(boxes)

            if len(boxes) > 0:
                # ground truth bounding box
                # NOTE(review): the result of these two calls is not used
                # afterwards; they are kept in case they have side effects.
                images_new = self.preprocess_bounding_box_images(
                    images, bbox[0], image_id_gt, grayscale=True)
                images_new = self.augment_images(images_new, grayscale=True)

                # detected conventional bounding box
                images_sync = self.preprocess_conventional_box_images(
                    images, boxes, image_id_gt, grayscale=True)
                images_sync = self.augment_images(images_sync, grayscale=True)

                # Add a channel axis, move it in front of the spatial axes
                # and copy at most 100 crops into a zero-filled buffer.
                images_sync = np.expand_dims(images_sync, 3)
                images_sync = images_sync.transpose((0, 3, 1, 2))
                padded = np.zeros((1, 100, 1, 160, 160))
                count = min(100, len(images_sync))
                padded[:, 0:count, :, :] = images_sync[0:count]
                images_sync = padded

                # detection for bounding boxes from pascal voc
                detect = {}
                detect['boxes'] = np.asarray(boxes)
                detect['classes'] = np.asarray(
                    label_gt * len(detect['boxes']))
                features = self.arcface_model.predict(
                    network, images_sync, 0.5, verbose=1)
                measures = self.measure(features)

                # The score is the mean of measures[0][1] when measures[0][0]
                # is set or that mean is positive, otherwise 0.
                mean_score = np.mean(np.asarray(measures[0][1]))
                if measures[0][0] or mean_score > 0:
                    score = mean_score
                else:
                    score = np.asarray([0])
                detect['scores'] = np.broadcast_to(score,
                                                   len(detect['boxes']))
                self.detect_dicts[step] = detect

            if (step + 1) % 10 == 0:
                print('steps = {0} step'.format(str(step)))
            # NOTE(review): hard stop after 71 steps regardless of
            # total_iter - looks like a debugging limit; confirm.
            if step == 70:
                break