def block_get(self, ids, dtype=None, show_progress=False):
    indices = [
        self._id_to_idx[id]
        for id in progress_bar(ids, show_progress=show_progress)
    ]
    ret = self._data[indices, :]
    if dtype:
        ret = ret.astype(dtype)

    return ret
def download_images(item_type, item_ids, img_attr, image_paths, format=None,
                    dimensions=None):
    '''Downloads all images and saves them to the specified paths. The
    downloads are dispatched to Celery in batches; this function polls the
    queue and returns once all batches have been processed.

    :param item_type: The type of the model class for the items which contain
    the image data we want to download. This class should have an `img_attr`
    attribute/property.

    :param item_ids: A list of IDs of database objects which have the location
    of the images to download.

    :param img_attr: The name of the attribute of the database object which
    stores the image to be downloaded. Note: This field has to be a Django
    ImageField (or behave like one).

    :param image_paths: A list of image paths where the images will be
    downloaded.

    :param format: The image format (e.g. 'JPEG') to save the downloaded
    images in; passed on to the download task.

    :param dimensions: The dimensions to resize the downloaded images to. If
    None, keep the original size.
    '''
    from cnntools.tasks import download_image_batch_task

    print 'Dispatching tasks...'
    batch_num = 0
    for batch in iter_batch(progress_bar(zip(item_ids, image_paths)), n=1024):
        item_ids_batch, image_paths_batch = zip(*batch)
        download_image_batch_task.delay(
            item_type,
            item_ids_batch,
            image_paths_batch,
            img_attr,
            format,
            dimensions,
        )
        batch_num += 1

    # TODO: Maybe put a wait here to let all the tasks be dispatched, because
    # we may quit before they are all finished...
    print 'Waiting for tasks to finish...'
    pbar = ProgressBar(
        widgets=progress_bar_widgets(),
        maxval=batch_num,
    )
    pbar.start()

    while True:
        time.sleep(5)
        queue_info = get_queue_info('artifact')
        pbar.update(batch_num - queue_info.message_count)

        if queue_info.message_count == 0:
            break

    pbar.finish()
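# Illustrative usage sketch (not part of the original module). The model
# class, attribute name and paths below are hypothetical; any Django model
# with an ImageField-like attribute should work.
def _example_download_images():
    from photos.models import Photo  # hypothetical import path

    download_images(
        item_type=Photo,
        item_ids=[1, 2, 3],
        img_attr='image_orig',  # assumed ImageField attribute name
        image_paths=['tmp/1.jpg', 'tmp/2.jpg', 'tmp/3.jpg'],
        format='JPEG',
        dimensions=(256, 256),
    )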
def block_get(self, ids, dtype=None, ret=None, batchsize=512,
              show_progress=False):
    """
    Efficiently fetch a block of descriptors by IDs, optionally converting
    them to another dtype.
    """
    try:
        if ret is None:
            ret_dtype = dtype if dtype else self._data.dtype
            ret = np.empty((len(ids), self._data.shape[1]), dtype=ret_dtype)

        if len(ids):
            # Sorting and batching is necessary due to the requirements of
            # "fancy indexing"
            # (http://docs.h5py.org/en/latest/high/dataset.html#fancy-indexing).
            indices = np.array([self._id_to_idx[id] for id in ids])
            order = np.argsort(indices)

            for i in progress_bar(xrange(0, len(ids), batchsize),
                                  show_progress=show_progress):
                sub_order = order[i:i+batchsize]
                ret[sub_order, :] = self._data[indices[sub_order], :]
    except:
        print 'Unhandled exception in block_get!'
        print 'File path: ', self._path
        raise

    return ret
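# Illustrative usage sketch (not part of the original module). `store` is
# assumed to be an instance of the HDF5-backed descriptor store class that
# defines block_get above; `np` (numpy) is already used by that method.
def _example_block_get(store, ids):
    # Fetch the descriptors for the given IDs as float32, with a progress bar.
    fets = store.block_get(ids, dtype=np.float32, show_progress=True)
    # The result has one row per requested ID, in the same order as `ids`.
    assert fets.shape[0] == len(ids)
    return fets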
def download_photos(photo_ids, image_paths, format=None, dimensions=None):
    '''This is a specialized function which handles only Photo objects.
    Downloads all images and saves them to the specified paths. The downloads
    are dispatched to Celery in batches; this function polls the queue and
    returns once all batches have been processed.

    :param photo_ids: A list of Photo IDs to download.

    :param image_paths: A list of image paths where the images will be
    downloaded.
    '''
    from photos.tasks import download_photo_batch_task

    print 'Dispatching tasks...'
    batch_num = 0
    for batch in iter_batch(progress_bar(zip(photo_ids, image_paths)), n=1024):
        photo_ids_batch, image_paths_batch = zip(*batch)
        download_photo_batch_task.delay(
            photo_ids_batch,
            image_paths_batch,
            format,
            dimensions,
        )
        batch_num += 1

    # TODO: Maybe put a wait here to let all the tasks be dispatched, because
    # we may quit before they are all finished...
    print 'Waiting for tasks to finish...'
    pbar = ProgressBar(
        widgets=progress_bar_widgets(),
        maxval=batch_num,
    )
    pbar.start()

    while True:
        time.sleep(5)
        queue_info = get_queue_info('artifact')
        pbar.update(batch_num - queue_info.message_count)

        if queue_info.message_count == 0:
            break

    pbar.finish()
def make_trainingfiles(rel_root_path, filename_suffix, item_type, item_ids,
                       skip_test, split_attr, gen_image_specs_func,
                       gen_line_func, trafo_image_func,
                       gen_line_extra_kwargs=None,
                       trafo_image_extra_kwargs=None, img_obj_type=None,
                       img_attr=None, dimensions=(256, 256),
                       max_valset_size=10000):
    """
    This function creates the training text files which are used for CNN
    training with Caffe. It also downloads all photos which are part of the
    dataset. This is a general function which can be used for lots of
    different layers depending on the gen_line_func function.

    :param rel_root_path: The root path of the photos and generated training
    files relative to the Caffe root path.

    :param filename_suffix: Suffix added to the generated training file names.

    :param item_type: The type of the model class for the items which are
    classified (e.g. FgPhoto). This class should have 'photo',
    'matclass_dataset_split' attributes/properties. The photo attribute should
    have most of the Photo model's fields. It is advised to use an actual
    Photo instance here. The matclass_dataset_split attribute should indicate
    which dataset split this item is in. The possible dataset splits are 'E'
    (test), 'V' (validation), 'R' (training).

    :param item_ids: List (or numpy array) of ids into the :ref:`item_type`
    table. It should contain the training, validation and test set.

    :param skip_test: If true, skip generating the file and downloading images
    for the test split.

    :param split_attr: The attribute name which represents the dataset split
    in the database. It should be one character, 'E' meaning test, 'V' meaning
    validation, 'R' meaning training.

    :param gen_image_specs_func: Function which generates an id, photo id,
    image path triplet for each item which we later use to download the
    images.

    :param gen_line_func: Function which generates a line of the training text
    file given the image paths and the extra parameters.
    :ref:`gen_line_extra_kwargs` will be passed as extra parameters to this
    function.

    :param trafo_image_func: If None, we don't apply any transformation on the
    images. Otherwise a function which transforms an image given the image
    path and the extra parameters; it should return the path of the
    transformed image, which can be the original image path or a new path.
    :ref:`trafo_image_extra_kwargs` will be passed as extra parameters to this
    function.

    :param gen_line_extra_kwargs: Extra keyword arguments which will be passed
    to the :ref:`gen_line_func` function. All of them should be lists which
    have the same order as :ref:`item_ids`.

    :param trafo_image_extra_kwargs: Extra keyword arguments which will be
    passed to the :ref:`trafo_image_func` function. All of them should be
    lists which have the same order as :ref:`item_ids`.

    :param img_obj_type: The type of the model class which holds an image.

    :param img_attr: The attribute of `img_obj_type` which holds the image.

    :param dimensions: The dimensions to resize the downloaded images to. If
    None, keep the image at its original size.

    :param max_valset_size: The maximum size for the validation set.
    """
    image_data = process_images(
        rel_root_path=rel_root_path,
        item_type=item_type,
        item_ids=item_ids,
        skip_test=skip_test,
        split_attr=split_attr,
        gen_image_specs_func=gen_image_specs_func,
        trafo_image_func=trafo_image_func,
        trafo_image_extra_kwargs=trafo_image_extra_kwargs,
        img_obj_type=img_obj_type,
        img_attr=img_attr,
        dimensions=dimensions,
        max_valset_size=max_valset_size,
    )

    abbr, fnames = get_abbr_fname(skip_test)

    for mc_ds_s, fname in zip(abbr, fnames):
        splitfile_path = os.path.join(
            rel_root_path,
            '{}{}.txt'.format(fname, filename_suffix)
        )

        print 'Writing Caffe {} text file...'.format(fname)
        with open(os.path.join(settings.CAFFE_ROOT, splitfile_path),
                  mode='w') as splitfile:
            item_idxs, image_paths_list = image_data[mc_ds_s]
            for item_idx, image_paths in progress_bar(
                    zip(item_idxs, image_paths_list)):
                line = gen_line_func(
                    image_paths,
                    **index_kwargs(gen_line_extra_kwargs, item_idx)
                )
                splitfile.write('{}\n'.format(line))
def process_images(rel_root_path, item_type, item_ids, skip_test, split_attr,
                   gen_image_specs_func, trafo_image_func,
                   trafo_image_extra_kwargs=None, img_obj_type=None,
                   img_attr=None, dimensions=(256, 256),
                   max_valset_size=10000):
    """
    This function downloads all photos which are part of the dataset. This is
    a general function which can be used for lots of different layers. It
    returns a dictionary which contains the downloaded image paths.
    Key: dataset split identifier, can be 'E', 'V', 'R'
    Value: tuple of (item indexes in the item_ids array, corresponding image
    paths)

    :param rel_root_path: The root path of the photos and generated training
    files relative to the Caffe root path.

    :param item_type: The type of the model class for the items which are
    classified (e.g. FgPhoto). This class should have 'photo',
    'matclass_dataset_split' attributes/properties. The photo attribute should
    have most of the Photo model's fields. It is advised to use an actual
    Photo instance here. The matclass_dataset_split attribute should indicate
    which dataset split this item is in. The possible dataset splits are 'E'
    (test), 'V' (validation), 'R' (training).

    :param item_ids: List (or numpy array) of ids into the :ref:`item_type`
    table. It should contain the training, validation and test set.

    :param skip_test: If true, skip generating the file and downloading images
    for the test split.

    :param split_attr: The attribute name which represents the dataset split
    in the database. It should be one character, 'E' meaning test, 'V' meaning
    validation, 'R' meaning training.

    :param gen_image_specs_func: Function which generates an id, photo id,
    image path triplet for each item which we later use to download the
    images.

    :param trafo_image_func: If None, we don't apply any transformation on the
    images. Otherwise a function which transforms an image given the image
    path and the extra parameters; it should return the path of the
    transformed image, which can be the original image path or a new path.
    :ref:`trafo_image_extra_kwargs` will be passed as extra parameters to this
    function.

    :param trafo_image_extra_kwargs: Extra keyword arguments which will be
    passed to the :ref:`trafo_image_func` function. All of them should be
    lists which have the same order as :ref:`item_ids`.

    :param img_obj_type: The type of the model class which holds an image.

    :param img_attr: The attribute of `img_obj_type` which holds the image.

    :param dimensions: The dimensions to resize the downloaded images to. If
    None, keep the image at its original size.

    :param max_valset_size: The maximum size for the validation set.
    """
    item_id_to_idx = {id: idx for idx, id in enumerate(item_ids)}
    abbr, fnames = get_abbr_fname(skip_test)

    # The return value
    image_data = {}

    for mc_ds_s, fname in zip(abbr, fnames):
        data_path = os.path.join(rel_root_path, 'data')
        ensuredir(os.path.join(settings.CAFFE_ROOT, data_path))

        print 'Generating split file and downloading images for {} split...'.format(fname)
        print 'Generating a list of images to download...'
        image_specs = []
        for item_ids_batch in progress_bar(iter_batch(item_ids, 10000)):
            # Note that the order is not going to be the same as
            # item_ids_batch, so we expect the data layer to shuffle the data!
            items_split = (
                item_type.objects.
                filter(**{split_attr: mc_ds_s}).
                filter(id__in=item_ids_batch).
                order_by()
            )
            # A list of item_id, image_url, image_path tuples
            image_specs += gen_image_specs_func(data_path, items_split)

        if not image_specs:
            image_data[mc_ds_s] = ([], [])
            continue

        # We want the validation step to finish in tractable time, so we have
        # a maximum threshold on the validation set size
        if mc_ds_s == 'V' and len(image_specs) > max_valset_size:
            print 'Sampling {} images to reduce the size of the validation set...'.format(max_valset_size)
            # For reproducibility
            random.seed(125)
            image_specs = random.sample(image_specs, max_valset_size)

        item_ids_perm, img_obj_ids, image_paths_list = zip(*image_specs)

        # A corresponding list of indices into the item_ids array
        item_idxs = [item_id_to_idx[item_id] for item_id in item_ids_perm]

        # Add caffe root to all paths for downloading
        full_image_paths_list = [
            [os.path.join(settings.CAFFE_ROOT, ip) for ip in ipl]
            for ipl in image_paths_list
        ]

        # Downloading images
        download_images(
            item_type=img_obj_type,
            item_ids=list(itertools.chain.from_iterable(img_obj_ids)),
            img_attr=img_attr,
            image_paths=list(
                itertools.chain.from_iterable(full_image_paths_list)
            ),
            format='JPEG',
            dimensions=dimensions,
        )

        if trafo_image_func:
            print 'Transforming images...'
            new_image_paths_list = []
            new_item_idxs = []
            for item_idx, image_paths, full_image_paths in progress_bar(
                    zip(item_idxs, image_paths_list, full_image_paths_list)):
                new_image_paths = trafo_image_func(
                    image_paths,
                    full_image_paths,
                    **index_kwargs(trafo_image_extra_kwargs, item_idx)
                )
                if not new_image_paths:
                    print 'Skipping failed image transformation: {}'.format(full_image_paths)
                    continue

                new_image_paths_list.append(new_image_paths)
                new_item_idxs.append(item_idx)

            image_paths_list = new_image_paths_list
            item_idxs = new_item_idxs

        image_data[mc_ds_s] = (item_idxs, image_paths_list)

    return image_data
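# Illustrative sketch of a gen_image_specs_func (not part of the original
# module). process_images calls it as gen_image_specs_func(data_path, items)
# and expects a list of (item_id, [image object ids], [relative image paths])
# tuples; the `.photo` relation and the file naming below are hypothetical.
def example_gen_image_specs(data_path, items):
    specs = []
    for item in items:
        # One image per item, stored under the split's data directory.
        rel_path = os.path.join(data_path, '{}.jpg'.format(item.photo.id))
        specs.append((item.id, [item.photo.id], [rel_path]))
    return specs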
def compute_cnn_features_gpu_task(
    item_type,
    task_id,
    batch_id,
    id_list,
    feature_name_list,
    kwa,
):
    '''
    Computes the features for a list of model_class type objects (for the
    associated images), then sends the computed features to the redis server.
    The accumulator script will collect these and save them as a numpy array
    on disk.

    :param item_type: The class for the model which holds the information
    which we use to retrieve the images the features will be computed on. This
    can also be 'redis', which means that we will fetch the item information
    from redis.

    :param task_id: ID of the task which will be used as a key to put the
    batch_id as a completed ID in redis.

    :param batch_id: ID of the batch which will be used as a key to put the
    results in redis.

    :param id_list: List of item_type IDs. If ``item_type`` is 'redis', these
    should be the redis keys corresponding to the items.

    :param feature_name_list: The names of the features (blobs) in the network
    which will be extracted.

    :param kwa: The parameters to pass to the feature computer function.
    '''
    # Change working directory to Caffe
    os.chdir(settings.CAFFE_ROOT)

    device_id = get_worker_gpu_device_id()
    deployfile_relpath, weights_relpath = download_snapshot(
        kwa['snapshot_id'], kwa['transfer_weights']
    )
    caffe, fet_extractor = load_fet_extractor(
        deployfile_relpath,
        weights_relpath,
        kwa['do_preprocessing'],
        kwa['image_dims'],
        kwa['mean'],
        device_id,
    )

    # This doesn't preserve order!
    if item_type == 'redis':
        client = redis.StrictRedis(**settings.REDIS_AGGRO_LOCAL_CONFIG)
        redis_vals = client.mget(*id_list)
        client.delete(*id_list)
        items = [RedisItem(key, value)
                 for key, value in zip(id_list, redis_vals)]
    else:
        items = item_type.objects.in_bulk(id_list).values()

    fet_trafo_types = get_fet_trafo_types()

    fets = []
    print 'Computing features for {} items...'.format(len(items))
    show_progress = False
    if show_progress:
        items = progress_bar(items)

    for item in items:
        if kwa['input_trafo_func_name']:
            with Timer('Input transformation'):
                input_trafo = import_function(kwa['input_trafo_func_name'])
                inp = input_trafo(item, **kwa['input_trafo_kwargs'])
        else:
            inp = caffe.io.load_image(item.photo.image_300)

        if 'grayscale' in kwa and kwa['grayscale']:
            inp_gray = np.mean(inp, axis=2)
            inp = np.zeros_like(inp)
            inp[:, :, :] = inp_gray[:, :, np.newaxis]

        with Timer('Feature extraction'):
            # feature_name_list can contain 'img', which means that we want to
            # save the img or some transformation of the image as a final
            # feature result. Of course the CNN doesn't need to compute this
            # feature.
            fnl = list(feature_name_list)
            if 'img' in feature_name_list:
                fnl.remove('img')

            fetdic = fet_extractor.extract_features(
                inp, blob_names=fnl, auto_reshape=kwa['auto_reshape']
            )

        if kwa['fet_trafo_type_id']:
            with Timer('Feature transformation'):
                # This might add 'img' to the feature list
                fetdic = fet_trafo_types[kwa['fet_trafo_type_id']](
                    item, inp, fetdic, feature_name_list,
                    **kwa['fet_trafo_kwargs']
                )

        for feature_name in feature_name_list:
            fetdic[feature_name] = np.ravel(np.squeeze(fetdic[feature_name]))

        fets.append((item.id, fetdic))

    # Save results in redis
    with Timer('Uploading to Redis'):
        batch_ready(task_id, batch_id, packer.packb(fets, settings.API_VERSION))
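# Illustrative call sketch (not part of the original module). The kwa keys
# below are exactly the ones read by compute_cnn_features_gpu_task; the
# concrete values (snapshot ID, image dims, mean, blob name, task/batch IDs)
# are hypothetical. In practice this function would be dispatched through
# Celery rather than called directly.
def _example_compute_features(item_type, id_list):
    kwa = {
        'snapshot_id': 42,               # hypothetical snapshot ID
        'transfer_weights': False,
        'do_preprocessing': True,
        'image_dims': (256, 256),
        'mean': (104, 117, 123),         # hypothetical channel mean
        'auto_reshape': False,
        'grayscale': False,
        'input_trafo_func_name': None,   # use caffe.io.load_image directly
        'input_trafo_kwargs': {},
        'fet_trafo_type_id': None,       # no feature transformation
        'fet_trafo_kwargs': {},
    }
    compute_cnn_features_gpu_task(
        item_type, 'task-0', 'batch-0', id_list,
        ['fc7'],  # hypothetical blob name to extract
        kwa,
    )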