def handle(self, message):
    image = message[0]
    if 'id' not in image:
        return

    local_name = image['id']
    local_image_path = '%s/%s' % (self.trainer.job_model.get_dataset_downloads_dir(self.dataset), local_name)

    try:
        ensure_dir(os.path.dirname(local_image_path))
    except Exception:
        pass

    if not os.path.isfile(local_image_path):
        if download_image(image['src'], local_image_path):
            try:
                img = Image.open(local_image_path)

                resize = bool(get_option(self.dataset['config'], 'resize', True))
                if resize:
                    os.remove(local_image_path)
                    size = (int(get_option(self.dataset['config'], 'resizeWidth', 512)),
                            int(get_option(self.dataset['config'], 'resizeHeight', 512)))
                    quality = int(float(get_option(self.dataset['config'], 'resizeCompression', 0.8)) * 100)
                    img.thumbnail(size, Image.ANTIALIAS)
                    local_image_path = os.path.splitext(local_image_path)[0] + '.jpg'
                    img.save(local_image_path, 'JPEG', quality=quality, optimize=True)
            except IOError:
                print("No valid image found %s" % (local_image_path,))
                os.remove(local_image_path)
            except KeyboardInterrupt:
                self.controller['running'] = False

    self.images[image['id']] = local_image_path
    self.trainer.set_status('LOAD IMAGE %d/%d' % (len(self.images), self.max))

def handle(self, message):
    image = message[0]
    if 'id' not in image:
        return

    local_name = image['id']
    local_image_path = '%s/%s' % (self.trainer.job_model.get_dataset_downloads_dir(self.dataset), local_name)

    try:
        ensure_dir(os.path.dirname(local_image_path))
    except Exception:
        pass

    if not os.path.isfile(local_image_path):
        if download_image(image['src'], local_image_path):
            try:
                with open(local_image_path, "rb") as fp:
                    img = Image.open(fp)
                    img.load()

                resize = bool(get_option(self.dataset['config'], 'resize', True))
                if resize:
                    os.remove(local_image_path)
                    size = (int(get_option(self.dataset['config'], 'resizeWidth', 512)),
                            int(get_option(self.dataset['config'], 'resizeHeight', 512)))
                    quality = int(float(get_option(self.dataset['config'], 'resizeCompression', 0.8)) * 100)
                    img.thumbnail(size, Image.ANTIALIAS)
                    local_image_path = os.path.splitext(local_image_path)[0] + '.jpg'
                    img.save(local_image_path, 'JPEG', quality=quality, optimize=True)
            except IOError:
                print("No valid image found %s" % (local_image_path,))
                if os.path.exists(local_image_path):
                    os.remove(local_image_path)
            except KeyboardInterrupt:
                self.controller['running'] = False

    self.images[image['id']] = local_image_path
    self.trainer.set_status('LOAD IMAGE %d/%d' % (len(self.images), self.max))

def handle(self, message):
    image = message[0]
    if 'id' not in image:
        return

    local_image_path = message[2]

    try:
        ensure_dir(os.path.dirname(local_image_path))
    except Exception:
        pass

    if not os.path.isfile(local_image_path):
        if download_image(image['src'], local_image_path):
            try:
                with open(local_image_path, "rb") as fp:
                    img = Image.open(fp)
                    img.load()

                resize = bool(get_option(self.dataset['config'], 'resize', True))
                if resize:
                    os.remove(local_image_path)
                    size = (int(get_option(self.dataset['config'], 'resizeWidth', 512)),
                            int(get_option(self.dataset['config'], 'resizeHeight', 512)))
                    quality = int(float(get_option(self.dataset['config'], 'resizeCompression', 0.8)) * 100)
                    img.thumbnail(size, Image.ANTIALIAS)
                    local_image_path = os.path.splitext(local_image_path)[0] + '.jpg'
                    img.save(local_image_path, 'JPEG', quality=quality, optimize=True)
            except SystemExit:
                self.controller['running'] = False
            except KeyboardInterrupt:
                self.controller['running'] = False
            except Exception:
                print("No valid image found %s" % (local_image_path,))
                if os.path.exists(local_image_path):
                    os.remove(local_image_path)

    self.images[image['id']] = local_image_path
    self.progress.advance(1)

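# -- Hypothetical sketch: the worker class that drives handle() above. --
# Only handle() appears in this section; the constructor signature below is
# inferred from the call `ImageDownloaderWorker(q, progress, dataset, max,
# images, controller)` in get_images() further down, and the run() loop is an
# assumption based on the q.put()/q.join() pattern used there.
from threading import Thread

class ImageDownloaderWorker(Thread):
    def __init__(self, q, progress, dataset, max, images, controller):
        Thread.__init__(self)
        self.q = q                    # work queue filled by get_images()
        self.progress = progress      # progress reporter, advanced in handle()
        self.dataset = dataset
        self.max = max
        self.images = images          # shared dict: image id -> local path
        self.controller = controller  # shared stop flag: {'running': bool}

    def run(self):
        while self.controller['running']:
            message = self.q.get()    # blocks until an [image, category, path] message arrives
            try:
                self.handle(message)
            finally:
                self.q.task_done()    # unblocks q.join() in get_images()
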
def get_datasets(self, trainer):
    datasets = {}

    from aetros.utils import get_option
    from .auto_dataset import get_images, read_images_keras_generator, read_images_in_memory

    # load placeholder, auto data
    config = self.job['config']
    for layer in self.layers[0]:
        if 'datasetId' in layer and layer['datasetId']:
            if layer['datasetId'] not in self.datasets:
                raise Exception('Dataset %s not found in datasets %s'
                                % (layer['datasetId'], simplejson.dumps(list(self.datasets.keys()))))

            dataset = self.datasets[layer['datasetId']]
            if not dataset:
                raise Exception('Dataset of id %s does not exist. Available %s'
                                % (layer['datasetId'], ','.join(list(self.datasets.keys()))))

            if dataset['type'] == 'images_upload' or dataset['type'] == 'images_search':
                connected_to_layer = self.get_connected_layer(self.layers, layer)
                if connected_to_layer is None:
                    # this input is not in use, so we don't need to calculate its dataset
                    continue

                datasets[layer['datasetId']] = get_images(self, dataset, layer, trainer)

            elif dataset['type'] == 'images_local':
                all_memory = get_option(dataset['config'], 'allMemory', False, 'bool')
                if all_memory:
                    datasets[layer['datasetId']] = read_images_in_memory(self, dataset, layer, trainer)
                else:
                    datasets[layer['datasetId']] = read_images_keras_generator(self, dataset, layer, trainer)

            elif dataset['type'] == 'python':
                name = dataset['id']
                try:
                    sys.path.append(os.path.abspath('./aetros/dataset/' + os.path.dirname(name)))
                    data_provider = __import__(os.path.basename(name))
                    print("Imported dataset provider from %s " % (os.path.abspath('./aetros/dataset/' + name + '.py'),))

                    import inspect
                    argSpec = inspect.getargspec(data_provider.get_data)

                    if len(argSpec.args) > 0:
                        datasets[dataset['id']] = data_provider.get_data(trainer)
                    else:
                        datasets[dataset['id']] = data_provider.get_data()
                except Exception:
                    trainer.logger.error('Could not import dataset code: ' + name + ' in ' + os.path.abspath('./aetros/dataset/'))
                    raise
                finally:
                    sys.path.pop()

    return datasets

def get_image_data_augmentor_from_dataset(dataset):
    from keras.preprocessing.image import ImageDataGenerator

    dataset_config = dataset['config']
    augShearRange = float(get_option(dataset_config, 'augShearRange', 0.1))
    augZoomRange = float(get_option(dataset_config, 'augZoomRange', 0.1))
    augHorizontalFlip = bool(get_option(dataset_config, 'augHorizontalFlip', False))
    augVerticalFlip = bool(get_option(dataset_config, 'augVerticalFlip', False))
    augRotationRange = float(get_option(dataset_config, 'augRotationRange', 0.2))

    return ImageDataGenerator(
        rotation_range=augRotationRange,
        shear_range=augShearRange,
        zoom_range=augZoomRange,
        horizontal_flip=augHorizontalFlip,
        vertical_flip=augVerticalFlip
    )

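# -- Usage sketch for the augmentor factory above. --
# The dataset dict here is hypothetical; it only needs a 'config' key, and
# get_option() is assumed to return the given default for keys missing from
# that config.
example_dataset = {'config': {'augHorizontalFlip': True, 'augRotationRange': 0.3}}
augmentor = get_image_data_augmentor_from_dataset(example_dataset)
# augShearRange and augZoomRange fall back to 0.1, augVerticalFlip to False;
# the result is a regular keras.preprocessing.image.ImageDataGenerator.
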
def get_datasets(self, trainer):
    datasets_dir = self.get_dataset_dir()
    datasets = {}

    from aetros.utils import get_option
    from .auto_dataset import get_images, read_images_keras_generator, read_images_in_memory

    # load placeholder, auto data
    config = self.job['config']
    for layer in config['layer'][0]:
        if 'datasetId' in layer and layer['datasetId']:
            dataset = config['datasets'][layer['datasetId']]
            if not dataset:
                raise Exception('Dataset of id %s does not exist. Available %s'
                                % (layer['datasetId'], ','.join(list(config['datasets'].keys()))))

            if dataset['type'] == 'images_upload' or dataset['type'] == 'images_search':
                connected_to_layer = self.get_connected_layer(config['layer'], layer)
                if connected_to_layer is None:
                    # this input is not in use, so we don't need to calculate its dataset
                    continue

                datasets[layer['datasetId']] = get_images(self, dataset, layer, trainer)

            elif dataset['type'] == 'images_local':
                all_memory = get_option(dataset['config'], 'allMemory', False, 'bool')
                if all_memory:
                    datasets[layer['datasetId']] = read_images_in_memory(self, dataset, layer, trainer)
                else:
                    datasets[layer['datasetId']] = read_images_keras_generator(self, dataset, layer, trainer)

            elif dataset['type'] == 'python':
                name = dataset['id'].replace('/', '__')

                sys.path.append(datasets_dir)
                data_provider = __import__(name, '')
                print("Imported dataset provider in %s " % (datasets_dir + '/' + name + '.py',))
                sys.path.pop()

                datasets[dataset['id']] = data_provider.get_data()

    return datasets

def get_datasets(self, trainer):
    datasets = {}

    from aetros.utils import get_option
    from .auto_dataset import get_images, read_images_keras_generator, read_images_in_memory

    # load placeholder, auto data
    config = self.job['config']
    for layer in self.layers[0]:
        if 'datasetId' in layer and layer['datasetId']:
            if layer['datasetId'] not in self.datasets:
                raise Exception('Dataset %s not found in datasets %s'
                                % (layer['datasetId'], json.dumps(list(self.datasets.keys()))))

            dataset = self.datasets[layer['datasetId']]
            if not dataset:
                raise Exception('Dataset of id %s does not exist. Available %s'
                                % (layer['datasetId'], ','.join(list(self.datasets.keys()))))

            if dataset['type'] == 'images_upload' or dataset['type'] == 'images_search':
                connected_to_layer = self.get_connected_layer(self.layers, layer)
                if connected_to_layer is None:
                    # this input is not in use, so we don't need to calculate its dataset
                    continue

                datasets[layer['datasetId']] = get_images(self, dataset, layer, trainer)

            elif dataset['type'] == 'images_local':
                all_memory = get_option(dataset['config'], 'allMemory', False, 'bool')
                if all_memory:
                    datasets[layer['datasetId']] = read_images_in_memory(self, dataset, layer, trainer)
                else:
                    datasets[layer['datasetId']] = read_images_keras_generator(self, dataset, layer, trainer)

            elif dataset['type'] == 'python':
                name = dataset['id']
                try:
                    sys.path.append(os.path.abspath('./aetros/dataset/' + os.path.dirname(name)))
                    data_provider = __import__(os.path.basename(name))
                    print("Imported dataset provider from %s " % (os.path.abspath('./aetros/dataset/' + name + '.py'),))

                    import inspect
                    argSpec = inspect.getargspec(data_provider.get_data)

                    if len(argSpec.args) > 0:
                        datasets[dataset['id']] = data_provider.get_data(trainer)
                    else:
                        datasets[dataset['id']] = data_provider.get_data()
                except Exception:
                    trainer.logger.error('Could not import dataset code: ' + name + ' in ' + os.path.abspath('./aetros/dataset/'))
                    raise
                finally:
                    sys.path.pop()

    return datasets

def get_images(job_model, dataset, node, trainer):
    concurrent = 15

    from PIL import ImageFile
    if hasattr(ImageFile, 'LOAD_TRUNCATED_IMAGES'):
        ImageFile.LOAD_TRUNCATED_IMAGES = True

    q = Queue(concurrent)

    config = dataset['config']

    dir = job_model.get_dataset_downloads_dir(dataset)
    ensure_dir(dir)

    if 'classes' not in config or not config['classes']:
        trainer.logger.warning("Dataset %s does not contain any classes." % (dataset['id'],))
        return {
            'X_train': np.array([]),
            'Y_train': np.array([]),
            'X_test': np.array([]),
            'Y_test': np.array([])
        }

    classes = config['classes']

    trainer.set_status('LOAD IMAGES')

    max = 0
    images = {}

    dataset_path = job_model.get_dataset_downloads_dir(dataset)
    meta_information_file = dataset_path + '/meta.json'

    classes_changed = False
    config_changed = False
    had_previous = False

    classes_md5 = hashlib.md5(simplejson.dumps(classes, default=invalid_json_values, sort_keys=True).encode('utf-8')).hexdigest()

    validationFactor = 0.2
    meta = {}

    if os.path.isdir(dataset_path):
        if os.path.isfile(meta_information_file):
            with open(meta_information_file) as f:
                meta = simplejson.load(f)

            if meta:
                had_previous = True

                if 'classes_md5' in meta and meta['classes_md5'] != classes_md5:
                    classes_changed = True

                trigger_changed = ['resize', 'resizeWidth', 'resizeHeight', 'resizeCompression']
                for i in trigger_changed:
                    if i in meta['config'] and i in config and meta['config'][i] != config[i]:
                        config_changed = True
            else:
                config_changed = True
        else:
            config_changed = True

    need_download = classes_changed or config_changed

    if need_download:
        if had_previous:
            trainer.logger.info("Reset dataset and re-download images to " + dir)
            if classes_changed:
                trainer.logger.info(" .. because classes changed in %s (%s != %s)"
                                    % (meta_information_file, meta['classes_md5'], classes_md5))
            if config_changed:
                trainer.logger.info(" .. because settings changed in " + meta_information_file)
        else:
            trainer.logger.info("Download images to " + dir)

        resize = bool(get_option(config, 'resize', True))
        if resize:
            resizeSize = (int(get_option(config, 'resizeWidth', 64)),
                          int(get_option(config, 'resizeHeight', 64)))
            trainer.logger.info(" .. with resizing to %dx%d " % resizeSize)

        # we need to download all images
        shutil.rmtree(dataset_path)

        controller = {'running': True}
        try:
            for category in classes:
                max += len(category['images'])

            progress = trainer.job_backend.create_progress('dataset-download-images', max)
            progress.label('Download dataset images')

            for i in range(concurrent):
                t = ImageDownloaderWorker(q, progress, dataset, max, images, controller)
                t.daemon = True
                t.start()

            for category_idx, category in enumerate(classes):
                for image in category['images']:
                    local_name = image['id']
                    local_path = '%s/%s' % (trainer.job_model.get_dataset_downloads_dir(dataset), local_name)
                    q.put([image, category_idx, local_path])

            q.join()
            controller['running'] = False

            def move_image(image, category='training'):
                if image['id'] in images and os.path.isfile(images[image['id']]):
                    target_path = dataset_path + \
                        '/%s/category_%s/%s' % (category, category_idx, os.path.basename(images[image['id']]))
                    ensure_dir(os.path.dirname(target_path))
                    os.rename(images[image['id']], target_path)

            for category_idx, category in enumerate(classes):
                random.shuffle(category['images'])

                position = int(math.ceil(len(category['images']) * validationFactor))

                ensure_dir(dataset_path + '/training')
                ensure_dir(dataset_path + '/validation')

                for image in category['images'][position:]:  # training data
                    if image['id'] in images and os.path.isfile(images[image['id']]):
                        move_image(image, 'training')

                for image in category['images'][:position]:  # validation data
                    if image['id'] in images and os.path.isfile(images[image['id']]):
                        move_image(image, 'validation')

            with open(meta_information_file, 'w') as f:
                meta = {
                    'loaded_at': classes_md5,
                    'classes_md5': classes_md5,
                    'config': config
                }
                simplejson.dump(meta, f, default=invalid_json_values)
        except KeyboardInterrupt:
            controller['running'] = False
            sys.exit(1)
    else:
        trainer.logger.info("Downloaded images up to date in " + dir)
        trainer.logger.info(" - Remove this directory if you want to re-download all images of your dataset and re-shuffle training/validation images.")

    trainer.output_size = len(classes)

    # change to type local_images
    dataset_transformed = dataset.copy()
    dataset_transformed['config']['path'] = dir

    all_memory = get_option(dataset['config'], 'allMemory', False, 'bool')

    if all_memory:
        return read_images_in_memory(job_model, dataset_transformed, node, trainer)
    else:
        return read_images_keras_generator(job_model, dataset_transformed, node, trainer)

def read_images_keras_generator(job_model, dataset, node, trainer):
    from keras.preprocessing.image import ImageDataGenerator

    size = (int(node['width']), int(node['height']))

    grayscale = False
    if node['inputType'] == 'image':
        grayscale = True

    dataset_config = dataset['config']

    trainer.logger.info("Generate image iterator in folder %s " % (dataset_config['path'],))

    augmentation = bool(get_option(dataset_config, 'augmentation', False))
    if augmentation:
        train_datagen = get_image_data_augmentor_from_dataset(dataset)
    else:
        train_datagen = ImageDataGenerator()

    if 'imageScale' not in node:
        node['imageScale'] = 255

    if float(node['imageScale']) > 0:
        train_datagen.rescale = 1.0 / float(node['imageScale'])

    train_generator = train_datagen.flow_from_directory(
        directory=os.path.join(dataset_config['path'], 'training'),
        target_size=size,
        batch_size=job_model.job['config']['batchSize'],
        color_mode='grayscale' if grayscale is True else 'rgb',
        class_mode='categorical')

    classes = []
    for folderName, outputNeuron in six.iteritems(train_generator.class_indices):
        if dataset['type'] == 'images_search' or dataset['type'] == 'images_upload':
            category_idx = int(folderName.replace('category_', ''))
            target_category = dataset_config['classes'][category_idx]
            classes.append(target_category['title'] or 'Category %s' % (category_idx,))
        else:
            classes.append(folderName)

    trainer.set_info('classes', classes)
    trainer.classes = classes

    # ensure_dir(dataset_config['path'] + '/preview')
    test_datagen = ImageDataGenerator()
    if float(node['imageScale']) > 0:
        test_datagen.rescale = 1.0 / float(node['imageScale'])

    validation_generator = test_datagen.flow_from_directory(
        directory=os.path.join(dataset_config['path'], 'validation'),
        # save_to_dir=dataset_config['path'] + '/preview',
        target_size=size,
        batch_size=job_model.get_batch_size(),
        color_mode='grayscale' if grayscale is True else 'rgb',
        class_mode='categorical')

    validation_samples = 0
    train_samples = 0

    # Keras 2
    if hasattr(train_generator, 'num_class'):
        trainer.output_size = train_generator.num_class
    if hasattr(train_generator, 'samples'):
        train_samples = train_generator.samples
    if hasattr(validation_generator, 'samples'):
        validation_samples = validation_generator.samples

    # Keras 1
    if hasattr(train_generator, 'nb_class'):
        trainer.output_size = train_generator.nb_class
    if hasattr(train_generator, 'nb_sample'):
        train_samples = train_generator.nb_sample
    if hasattr(validation_generator, 'nb_sample'):
        validation_samples = validation_generator.nb_sample

    trainer.set_info('Dataset size', {'training': train_samples, 'validation': validation_samples})
    trainer.set_generator_validation_nb(validation_samples)
    trainer.set_generator_training_nb(train_samples)

    trainer.logger.info("Found %d classes, %d images (%d in training [%saugmented], %d in validation) in %s "
                        % (len(classes), validation_samples + train_samples, train_samples,
                           'not ' if augmentation is False else '', validation_samples, dataset_config['path']))

    if trainer.output_size == 0:
        trainer.logger.warning("Could not find any classes. Does the directory contain images?")
        sys.exit(1)

    trainer.logger.debug(str(train_generator.class_indices))
    trainer.logger.debug(str(classes))

    return {
        'X_train': train_generator,
        'Y_train': train_generator,
        'X_test': validation_generator,
        'Y_test': validation_generator,
    }

def read_images_in_memory(job_model, dataset, node, trainer):
    """
    Reads all images into memory and applies augmentation if enabled
    """
    concurrent = psutil.cpu_count()

    dataset_config = dataset['config']
    controller = {'running': True}
    q = Queue(concurrent)

    result = {
        'X_train': [],
        'Y_train': [],
        'X_test': [],
        'Y_test': []
    }

    images = []
    max = 0

    path = job_model.get_dataset_downloads_dir(dataset)
    if 'path' in dataset['config']:
        path = dataset['config']['path']

    classes_count = 0
    category_map = {}
    classes = []

    trainer.set_status('LOAD IMAGES INTO MEMORY')

    try:
        for i in range(concurrent):
            t = ImageReadWorker(q, job_model, node, path, images, controller)
            t.daemon = True
            t.start()

        for validation_or_training in ['validation', 'training']:
            if os.path.isdir(os.path.normpath(path + '/' + validation_or_training)):
                for category_name in os.listdir(os.path.normpath(path + '/' + validation_or_training)):
                    if os.path.isdir(os.path.normpath(path + '/' + validation_or_training + '/' + category_name)):

                        if category_name not in category_map:
                            category_map[category_name] = classes_count
                            if 'classes' in dataset_config and 'category_' in category_name:
                                category_idx = int(category_name.replace('category_', ''))
                                category_map[category_name] = category_idx
                                target_category = dataset_config['classes'][category_idx]
                                classes.append(target_category['title'] or 'Class %s' % (category_idx,))
                            else:
                                classes.append(category_name)

                            classes_count += 1

                        for id in os.listdir(os.path.normpath(path + '/' + validation_or_training + '/' + category_name)):
                            file_path = os.path.join(path, validation_or_training, category_name, id)
                            q.put([file_path, validation_or_training == 'validation', category_name])
                            max += 1

        q.join()
        controller['running'] = False

        train_images = []
        test_images = []

        for v in images:
            image, validation, category_dir = v
            if validation is True:
                test_images.append([image, category_map[category_dir]])
            else:
                train_images.append([image, category_map[category_dir]])

        train_datagen = None
        augmentation = bool(get_option(dataset_config, 'augmentation', False))
        if augmentation:
            train_datagen = get_image_data_augmentor_from_dataset(dataset)

        train = InMemoryDataGenerator(train_datagen, train_images, classes_count, job_model.job['config']['batchSize'])
        test = InMemoryDataGenerator(None, test_images, classes_count, job_model.job['config']['batchSize'])

        nb_sample = len(train_images)
        trainer.set_info('Dataset size', {'training': nb_sample, 'validation': len(test_images)})
        trainer.set_generator_training_nb(nb_sample)
        trainer.set_generator_validation_nb(len(test_images))

        trainer.logger.info("Found %d classes, %d images (%d in training [%saugmented], %d in validation). Read all images into memory from %s"
                            % (classes_count, max, len(train_images), 'not ' if augmentation is False else '', len(test_images), path))

        if classes_count == 0:
            trainer.logger.warning("Could not find any classes. Does the directory contain images?")
            sys.exit(1)

        trainer.output_size = classes_count
        trainer.set_info('classes', classes)
        trainer.classes = classes

        result['X_train'] = train
        result['Y_train'] = train
        result['X_test'] = test
        result['Y_test'] = test

        return result
    except KeyboardInterrupt:
        controller['running'] = False
        sys.exit(1)

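# -- Hypothetical sketch of the InMemoryDataGenerator consumed above. --
# The real class is defined elsewhere in the package; this sketch only
# illustrates the contract implied by the calls above (constructed with an
# optional augmentor, a list of [image, class_index] pairs, the class count
# and a batch size, then handed to Keras as an endless batch iterator).
# Every implementation detail below is an assumption.
import random

import numpy as np

class InMemoryDataGenerator(object):
    def __init__(self, datagen, images, classes_count, batch_size):
        self.datagen = datagen            # optional ImageDataGenerator for augmentation
        self.images = images              # list of [image_array, class_index]
        self.classes_count = classes_count
        self.batch_size = batch_size

    def __iter__(self):
        return self

    def __next__(self):
        batch_x, batch_y = [], []
        for image, class_idx in random.sample(self.images, min(self.batch_size, len(self.images))):
            if self.datagen is not None:
                # random_transform() exists on Keras 1/2 ImageDataGenerator
                image = self.datagen.random_transform(image)
            y = np.zeros(self.classes_count)
            y[class_idx] = 1.0            # one-hot encode the label
            batch_x.append(image)
            batch_y.append(y)
        return np.array(batch_x), np.array(batch_y)

    next = __next__  # Python 2 compatibility
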
def get_images(job_model, dataset, node, trainer):
    concurrent = 15

    from PIL import ImageFile
    if hasattr(ImageFile, 'LOAD_TRUNCATED_IMAGES'):
        ImageFile.LOAD_TRUNCATED_IMAGES = True

    q = Queue(concurrent)

    config = dataset['config']

    dir = trainer.job_model.get_dataset_downloads_dir(dataset)
    ensure_dir(dir)

    if 'classes' not in config or not config['classes']:
        print("Dataset %s does not contain any classes." % (dataset['id'],))
        return {
            'X_train': np.array([]),
            'Y_train': np.array([]),
            'X_test': np.array([]),
            'Y_test': np.array([])
        }

    classes = config['classes']

    trainer.set_status('PREPARE_IMAGES')

    max = 0
    images = {}

    dataset_path = trainer.job_model.get_dataset_downloads_dir(dataset)
    meta_information_file = dataset_path + '/meta.json'

    classes_changed = False
    config_changed = False
    had_previous = False

    classes_md5 = hashlib.md5(json.dumps(classes, default=invalid_json_values).encode('utf-8')).hexdigest()

    validationFactor = 0.2

    if os.path.isdir(dataset_path):
        if os.path.isfile(meta_information_file):
            with open(meta_information_file) as f:
                meta = json.load(f)

            if meta:
                had_previous = True

                if 'classes_md5' in meta and meta['classes_md5'] != classes_md5:
                    classes_changed = True

                trigger_changed = ['resize', 'resizeWidth', 'resizeHeight', 'resizeCompression']
                for i in trigger_changed:
                    if i in meta['config'] and i in config and meta['config'][i] != config[i]:
                        config_changed = True
            else:
                config_changed = True
        else:
            config_changed = True

    need_download = classes_changed or config_changed

    if need_download:
        if had_previous:
            print("Reset dataset and re-download images to " + dir)
            if classes_changed:
                print(" .. because classes changed")
            if config_changed:
                print(" .. because settings changed")
        else:
            print("Download images to " + dir)

        resize = bool(get_option(config, 'resize', True))
        if resize:
            resizeSize = (int(get_option(config, 'resizeWidth', 64)),
                          int(get_option(config, 'resizeHeight', 64)))
            print(" .. with resizing to %dx%d " % resizeSize)

        # we need to download all images
        shutil.rmtree(dataset_path)

        controller = {'running': True}
        try:
            for category in classes:
                max += len(category['images'])

            for i in range(concurrent):
                t = ImageDownloaderWorker(q, trainer, dataset, max, images, controller)
                t.daemon = True
                t.start()

            for category_idx, category in enumerate(classes):
                for image in category['images']:
                    q.put([image, category_idx])

            q.join()
            controller['running'] = False

            def move_image(image, category='training'):
                if image['id'] in images and os.path.isfile(images[image['id']]):
                    target_path = dataset_path + \
                        '/%s/category_%s/%s' % (category, category_idx, os.path.basename(images[image['id']]))
                    ensure_dir(os.path.dirname(target_path))
                    os.rename(images[image['id']], target_path)

            for category_idx, category in enumerate(classes):
                random.shuffle(category['images'])

                position = int(math.ceil(len(category['images']) * validationFactor))

                ensure_dir(dataset_path + '/training')
                ensure_dir(dataset_path + '/validation')

                for image in category['images'][position:]:  # training data
                    if image['id'] in images and os.path.isfile(images[image['id']]):
                        move_image(image, 'training')

                for image in category['images'][:position]:  # validation data
                    if image['id'] in images and os.path.isfile(images[image['id']]):
                        move_image(image, 'validation')

            with open(meta_information_file, 'w') as f:
                meta = {
                    'loaded_at': classes_md5,
                    'classes_md5': classes_md5,
                    'config': config
                }
                json.dump(meta, f, default=invalid_json_values)
        except KeyboardInterrupt:
            controller['running'] = False
            sys.exit(1)
    else:
        print("Downloaded images up to date in " + dir)
        print(" - Remove this directory if you want to re-download all images of your dataset and re-shuffle training/validation images.")

    trainer.output_size = len(classes)
    trainer.set_status('LOAD IMAGE DONE')

    # change to type local_images
    dataset_transformed = dataset.copy()
    dataset_transformed['config']['path'] = dir

    all_memory = get_option(dataset['config'], 'allMemory', False, 'bool')

    if all_memory:
        return read_images_in_memory(job_model, dataset_transformed, node, trainer)
    else:
        return read_images_keras_generator(job_model, dataset_transformed, node, trainer)

def read_images_keras_generator(job_model, dataset, node, trainer):
    from keras.preprocessing.image import ImageDataGenerator

    size = (int(node['width']), int(node['height']))

    grayscale = False
    if node['inputType'] == 'image':
        grayscale = True

    dataset_config = dataset['config']

    print("Generate image iterator in folder %s " % (dataset_config['path'],))

    augmentation = bool(get_option(dataset_config, 'augmentation', False))
    if augmentation:
        train_datagen = get_image_data_augmentor_from_dataset(dataset)

        if 'imageScale' not in node:
            node['imageScale'] = 255

        if float(node['imageScale']) > 0:
            train_datagen.rescale = 1.0 / float(node['imageScale'])
    else:
        train_datagen = ImageDataGenerator()

    train_generator = train_datagen.flow_from_directory(
        directory=os.path.join(dataset_config['path'], 'training'),
        target_size=size,
        batch_size=job_model.job['config']['settings']['batchSize'],
        color_mode='grayscale' if grayscale is True else 'rgb',
        class_mode='categorical')

    classes = []
    for folderName, outputNeuron in six.iteritems(train_generator.class_indices):
        if dataset['type'] == 'images_search' or dataset['type'] == 'images_upload':
            category_idx = int(folderName.replace('category_', ''))
            target_category = dataset_config['classes'][category_idx]
            classes.append(target_category['title'] or 'Category %s' % (category_idx,))
        else:
            classes.append(folderName)

    trainer.set_job_system_info('classes', classes)
    trainer.classes = classes

    # ensure_dir(dataset_config['path'] + '/preview')
    test_datagen = ImageDataGenerator(rescale=1. / 255)
    validation_generator = test_datagen.flow_from_directory(
        directory=os.path.join(dataset_config['path'], 'validation'),
        # save_to_dir=dataset_config['path'] + '/preview',
        target_size=size,
        batch_size=trainer.get_batch_size(),
        color_mode='grayscale' if grayscale is True else 'rgb',
        class_mode='categorical')

    validation_samples = 0
    train_samples = 0

    # Keras 2
    if hasattr(train_generator, 'num_class'):
        trainer.output_size = train_generator.num_class
    if hasattr(train_generator, 'samples'):
        train_samples = train_generator.samples
    if hasattr(validation_generator, 'samples'):
        validation_samples = validation_generator.samples

    # Keras 1
    if hasattr(train_generator, 'nb_class'):
        trainer.output_size = train_generator.nb_class
    if hasattr(train_generator, 'nb_sample'):
        train_samples = train_generator.nb_sample
    if hasattr(validation_generator, 'nb_sample'):
        validation_samples = validation_generator.nb_sample

    trainer.set_generator_validation_nb(validation_samples)
    trainer.set_generator_training_nb(train_samples)

    print("Found %d classes, %d images (%d in training [%saugmented], %d in validation) in %s "
          % (len(classes), validation_samples + train_samples, train_samples,
             'not ' if augmentation is False else '', validation_samples, dataset_config['path']))

    if trainer.output_size == 0:
        print("Could not find any classes. Does the directory contain images?")
        sys.exit(1)

    pprint(train_generator.class_indices)
    pprint(classes)

    return {
        'X_train': train_generator,
        'Y_train': train_generator,
        'X_test': validation_generator,
        'Y_test': validation_generator,
    }

def read_images_in_memory(job_model, dataset, node, trainer):
    """
    Reads all images into memory and applies augmentation if enabled
    """
    concurrent = psutil.cpu_count()

    dataset_config = dataset['config']
    controller = {'running': True}
    q = Queue(concurrent)

    result = {'X_train': [], 'Y_train': [], 'X_test': [], 'Y_test': []}

    images = []
    max = 0

    path = trainer.job_model.get_dataset_downloads_dir(dataset)
    if 'path' in dataset['config']:
        path = dataset['config']['path']

    classes_count = 0
    category_map = {}
    classes = []

    try:
        for i in range(concurrent):
            t = ImageReadWorker(q, job_model, node, path, images, controller)
            t.daemon = True
            t.start()

        for validation_or_training in ['validation', 'training']:
            if os.path.isdir(path + '/' + validation_or_training):
                for category_name in os.listdir(path + '/' + validation_or_training):
                    if os.path.isdir(path + '/' + validation_or_training + '/' + category_name):

                        if category_name not in category_map:
                            category_map[category_name] = classes_count
                            if 'classes' in dataset_config and 'category_' in category_name:
                                category_idx = int(category_name.replace('category_', ''))
                                category_map[category_name] = category_idx
                                target_category = dataset_config['classes'][category_idx]
                                classes.append(target_category['title'] or 'Class %s' % (category_idx,))
                            else:
                                classes.append(category_name)

                            classes_count += 1

                        for id in os.listdir(path + '/' + validation_or_training + '/' + category_name):
                            file_path = os.path.join(path, validation_or_training, category_name, id)
                            q.put([file_path, validation_or_training == 'validation', category_name])
                            max += 1

        q.join()
        controller['running'] = False

        train_images = []
        test_images = []

        for v in images:
            image, validation, category_dir = v
            if validation is True:
                test_images.append([image, category_map[category_dir]])
            else:
                train_images.append([image, category_map[category_dir]])

        train_datagen = None
        augmentation = bool(get_option(dataset_config, 'augmentation', False))
        if augmentation:
            train_datagen = get_image_data_augmentor_from_dataset(dataset)

        train = InMemoryDataGenerator(train_datagen, train_images, classes_count,
                                      job_model.job['config']['settings']['batchSize'])
        test = InMemoryDataGenerator(None, test_images, classes_count,
                                     job_model.job['config']['settings']['batchSize'])

        nb_sample = len(train_images)
        trainer.set_generator_training_nb(nb_sample)
        trainer.set_generator_validation_nb(len(test_images))

        print("Found %d classes, %d images (%d in training [%saugmented], %d in validation). Read all images into memory from %s"
              % (classes_count, max, len(train_images), 'not ' if augmentation is False else '', len(test_images), path))

        if classes_count == 0:
            print("Could not find any classes. Does the directory contain images?")
            sys.exit(1)

        trainer.output_size = classes_count
        trainer.set_job_system_info('classes', classes)
        trainer.classes = classes

        result['X_train'] = train
        result['Y_train'] = train
        result['X_test'] = test
        result['Y_test'] = test

        return result
    except KeyboardInterrupt:
        controller['running'] = False
        sys.exit(1)

def get_images(job_model, dataset, node, trainer):
    concurrent = 15

    q = Queue(concurrent)

    config = dataset['config']

    dir = trainer.job_model.get_dataset_downloads_dir(dataset)
    ensure_dir(dir)

    classes = config['classes']

    trainer.set_status('PREPARE_IMAGES')

    max = 0
    images = {}

    dataset_path = trainer.job_model.get_dataset_downloads_dir(dataset)
    meta_information_file = dataset_path + '/meta.json'

    classes_changed = False
    config_changed = False
    had_previous = False

    classes_md5 = hashlib.md5(json.dumps(classes).encode('utf-8')).hexdigest()

    validationFactor = 0.2

    if os.path.isdir(dataset_path):
        if os.path.isfile(meta_information_file):
            with open(meta_information_file) as f:
                meta = json.load(f)

            if meta:
                had_previous = True

                if 'classes_md5' in meta and meta['classes_md5'] != classes_md5:
                    classes_changed = True

                trigger_changed = ['resize', 'resizeWidth', 'resizeHeight', 'resizeCompression']
                for i in trigger_changed:
                    if i in meta['config'] and i in config and meta['config'][i] != config[i]:
                        config_changed = True
            else:
                config_changed = True
        else:
            config_changed = True

    need_download = classes_changed or config_changed

    if need_download:
        if had_previous:
            print("Reset dataset and re-download images to " + dir)
            if classes_changed:
                print(" .. because classes changed")
            if config_changed:
                print(" .. because settings changed")
        else:
            print("Download images to " + dir)

        resize = bool(get_option(config, 'resize', True))
        if resize:
            resizeSize = (int(get_option(config, 'resizeWidth', 64)),
                          int(get_option(config, 'resizeHeight', 64)))
            print(" .. with resizing to %dx%d " % resizeSize)

        # we need to download all images
        shutil.rmtree(dataset_path)

        controller = {'running': True}
        try:
            for category in classes:
                max += len(category['images'])

            for i in range(concurrent):
                t = ImageDownloaderWorker(q, trainer, dataset, max, images, controller)
                t.daemon = True
                t.start()

            for category_idx, category in enumerate(classes):
                for image in category['images']:
                    q.put([image, category_idx])

            q.join()
            controller['running'] = False

            def move_image(image, category='training'):
                if image['id'] in images and os.path.isfile(images[image['id']]):
                    target_path = dataset_path + '/%s/category_%s/%s' % (category, category_idx, os.path.basename(images[image['id']]))
                    ensure_dir(os.path.dirname(target_path))
                    os.rename(images[image['id']], target_path)

            for category_idx, category in enumerate(classes):
                random.shuffle(category['images'])

                position = int(math.ceil(len(category['images']) * validationFactor))

                ensure_dir(dataset_path + '/training')
                ensure_dir(dataset_path + '/validation')

                for image in category['images'][position:]:  # training data
                    if image['id'] in images and os.path.isfile(images[image['id']]):
                        move_image(image, 'training')

                for image in category['images'][:position]:  # validation data
                    if image['id'] in images and os.path.isfile(images[image['id']]):
                        move_image(image, 'validation')

            with open(meta_information_file, 'w') as f:
                meta = {
                    'loaded_at': classes_md5,
                    'classes_md5': classes_md5,
                    'config': config
                }
                json.dump(meta, f)
        except KeyboardInterrupt:
            controller['running'] = False
            sys.exit(1)
    else:
        print("Downloaded images up to date in " + dir)
        print(" - Remove this directory if you want to re-download all images of your dataset and re-shuffle training/validation images.")

    trainer.output_size = len(classes)
    trainer.set_status('LOAD IMAGE DONE')

    # change to type local_images
    dataset_transformed = dataset.copy()
    dataset_transformed['config']['path'] = dir

    all_memory = get_option(dataset['config'], 'allMemory', False, 'bool')

    if all_memory:
        return read_images_in_memory(job_model, dataset_transformed, node, trainer)
    else:
        return read_images_keras_generator(job_model, dataset_transformed, node, trainer)