def can_accumulate_gradients(self):
    """
    Return True if the configured Caffe build supports gradient accumulation.

    BVLC Caffe always supports it; NVIDIA Caffe only after 0.14.0-alpha.

    Raises:
        ValueError -- if the configured flavor is neither 'NVIDIA' nor 'BVLC'
    """
    flavor = config_value('caffe_root')['flavor']
    if flavor == 'BVLC':
        return True
    elif flavor == 'NVIDIA':
        # Bug fix: the stored version is a raw string and must be parsed
        # before comparing against a parsed version object, otherwise the
        # comparison is not a semantic-version comparison (see the corrected
        # sibling implementation of this method elsewhere in the project).
        return (parse_version(config_value('caffe_root')['version'])
                > parse_version('0.14.0-alpha'))
    else:
        raise ValueError('Unknown flavor. Support NVIDIA and BVLC flavors only.')
def save_weights(network_path, weights_path, gpu=None, logger=None):
    """
    Dump the weights of a Torch snapshot by running tools/torch/getWeights.lua.

    Arguments:
    network_path -- path to the .lua network definition
    weights_path -- path to the snapshot file to read

    Keyword arguments:
    gpu -- id of the GPU to use (None means CPU / float tensors)
    logger -- unused here; kept for interface compatibility with callers
    """
    if config_value('torch_root') == '<PATHS>':
        torch_bin = 'th'
    else:
        torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

    args = [
        torch_bin,
        os.path.join(os.path.dirname(os.path.dirname(digits.__file__)),
                     'tools', 'torch', 'wrapper.lua'),
        'getWeights.lua',
        '--network=%s' % os.path.basename(network_path).split(".")[0],
        '--networkDirectory=%s' % os.path.split(network_path)[0],
        '--snapshot=%s' % os.path.split(weights_path)[1],
        '--save=%s' % ".",
    ]
    # Convert them all to strings
    args = [str(x) for x in args]

    env = os.environ.copy()

    # Bug fix: the 'gpu' argument used to be ignored and '--type=float' was
    # always passed; honor it the same way save_max_activations() does.
    if gpu is not None:
        args.append('--type=cuda')
        # make only the selected GPU visible
        env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
    else:
        args.append('--type=float')

    p = subprocess.Popen(args,
                         cwd=os.path.split(network_path)[0],
                         close_fds=True,
                         env=env)
    p.wait()
def can_accumulate_gradients(self):
    """
    Tell whether the configured Caffe build can accumulate gradients.

    BVLC builds always can; NVIDIA builds can from 0.14.0-alpha onwards.
    Raises ValueError for any other flavor.
    """
    flavor = config_value('caffe')['flavor']
    if flavor == 'BVLC':
        return True
    if flavor == 'NVIDIA':
        installed = parse_version(config_value('caffe')['version'])
        return installed > parse_version('0.14.0-alpha')
    raise ValueError('Unknown flavor. Support NVIDIA and BVLC flavors only.')
class DistributedCaffeFramework(CaffeFramework):
    """
    Subclass of CaffeFramework used for training on a remote (long-distance)
    server.

    Defines required methods to interact with the Caffe framework.
    This class can be instantiated as many times as there are compatible
    instances of Caffe.
    """

    # short descriptive name
    NAME = 'Caffe'

    # identifier of framework class (intended to be the same across
    # all instances of this class)
    CLASS = 'caffe'

    # whether this framework can shuffle data during training
    CAN_SHUFFLE_DATA = False
    SUPPORTS_PYTHON_LAYERS_FILE = True
    SUPPORTS_TIMELINE_TRACING = False

    # solver support depends on the installed Caffe flavor/version
    if config_value('caffe')['flavor'] == 'NVIDIA':
        if parse_version(config_value('caffe')['version']) > parse_version('0.14.0-alpha'):
            SUPPORTED_SOLVER_TYPES = ['SGD', 'NESTEROV', 'ADAGRAD',
                                      'RMSPROP', 'ADADELTA', 'ADAM']
        else:
            SUPPORTED_SOLVER_TYPES = ['SGD', 'NESTEROV', 'ADAGRAD']
    elif config_value('caffe')['flavor'] == 'BVLC':
        SUPPORTED_SOLVER_TYPES = ['SGD', 'NESTEROV', 'ADAGRAD',
                                  'RMSPROP', 'ADADELTA', 'ADAM']
    else:
        raise ValueError(
            'Unknown flavor. Support NVIDIA and BVLC flavors only.')

    SUPPORTED_DATA_TRANSFORMATION_TYPES = ['MEAN_SUBTRACTION', 'CROPPING']
    SUPPORTED_DATA_AUGMENTATION_TYPES = []

    @override
    def __init__(self):
        # Bug fix: super(CaffeFramework, self) skipped CaffeFramework.__init__
        # entirely; chain from this class so the full MRO is initialized.
        super(DistributedCaffeFramework, self).__init__()
        self.framework_id = self.CLASS

    @override
    def create_train_task(self, **kwargs):
        """
        create train task
        """
        # (removed leftover debug print statement)
        return DistributedTrainTask(framework_id=self.framework_id, **kwargs)
def setup_logging():
    """
    Configure the 'digits' and 'digits.webapp' loggers.

    Returns a JobIdLoggerAdapter around the webapp logger when a log file
    is configured, otherwise around the main digits logger.
    """
    socketio_logger = logging.getLogger('socketio')
    socketio_logger.addHandler(logging.StreamHandler(sys.stdout))

    # Set custom logger
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
        fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
        datefmt=DATE_FORMAT,
    )

    ### digits logger

    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)

    # Log to stdout
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(formatter)
    stdout_handler.setLevel(logging.DEBUG)
    main_logger.addHandler(stdout_handler)

    ### digits.webapp logger

    if not config_value('log_file'):
        print('WARNING: log_file config option not found - no log file is being saved')
        return JobIdLoggerAdapter(main_logger, {})

    webapp_logger = logging.getLogger('digits.webapp')
    webapp_logger.setLevel(logging.DEBUG)

    # Log to file
    file_handler = logging.handlers.RotatingFileHandler(
        config_value('log_file'),
        maxBytes=(1024 * 1024 * 10),  # 10 MB
        backupCount=10,
    )
    file_handler.setFormatter(formatter)

    # Map the configured level name onto the logging constant
    level_map = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
    }
    level = config_value('log_level')
    if level in level_map:
        file_handler.setLevel(level_map[level])
    webapp_logger.addHandler(file_handler)

    ### Useful shortcut for the webapp, which may set job_id
    return JobIdLoggerAdapter(webapp_logger, {})
def setup_logging():
    """
    Set up the 'digits' stdout logger and, when a log file is configured,
    a rotating file logger for 'digits.webapp'.

    Returns a JobIdLoggerAdapter for the most specific logger configured.
    """
    socketio_logger = logging.getLogger('socketio')
    socketio_logger.addHandler(logging.StreamHandler(sys.stdout))

    # Install the custom logger class before any getLogger() calls below
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
        fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
        datefmt=DATE_FORMAT,
    )

    ### digits logger

    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)

    # Everything goes to stdout at DEBUG
    to_stdout = logging.StreamHandler(sys.stdout)
    to_stdout.setFormatter(formatter)
    to_stdout.setLevel(logging.DEBUG)
    main_logger.addHandler(to_stdout)

    ### digits.webapp logger

    if not config_value('log_file'):
        print('WARNING: log_file config option not found - no log file is being saved')
        return JobIdLoggerAdapter(main_logger, {})

    webapp_logger = logging.getLogger('digits.webapp')
    webapp_logger.setLevel(logging.DEBUG)

    # Rotating file handler, 10 MB per file, 10 backups
    to_file = logging.handlers.RotatingFileHandler(
        config_value('log_file'),
        maxBytes=(1024 * 1024 * 10),  # 10 MB
        backupCount=10,
    )
    to_file.setFormatter(formatter)

    # Translate the configured level name; unknown names leave the handler
    # at its default level (same as the original if/elif chain)
    name_to_level = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
    }
    configured = config_value('log_level')
    if configured in name_to_level:
        to_file.setLevel(name_to_level[configured])
    webapp_logger.addHandler(to_file)

    ### Useful shortcut for the webapp, which may set job_id
    return JobIdLoggerAdapter(webapp_logger, {})
def write_deploy(self):
    """
    Export the Torch network layers to json (via tools/torch/toGraph.lua)
    for layer-wise graph visualization.
    """
    root = config_value('torch_root')
    torch_bin = 'th' if root == '<PATHS>' else os.path.join(root, 'bin', 'th')

    script = os.path.join(os.path.dirname(os.path.dirname(digits.__file__)),
                          'tools', 'torch', 'toGraph.lua')
    network_name = os.path.split(self.get_deploy_path())[1].split(".")[0]

    args = [
        torch_bin,
        script,
        '--network=%s' % network_name,
        '--output=%s' % self.get_model_def_path(True),
    ]
    env = os.environ.copy()
    # NOTE: the subprocess is not waited on here (fire-and-forget), matching
    # the original behavior.
    p = subprocess.Popen(args, cwd=self.job_dir, env=env)
def save_max_activations(network_path, weights_path, height, width, layer,
                         units=(-1,), mean_file_path=None, gpu=None, logger=None):
    """
    Run tools/torch/gradientOptimizer.lua to save inputs that maximally
    activate the given units of a layer.

    Arguments:
    network_path -- path to the .lua network definition
    weights_path -- path to the snapshot to load weights from
    height -- height of the generated input
    width -- width of the generated input
    layer -- name of the layer passed as '--chain'

    Keyword arguments:
    units -- unit indices to optimize; -1 presumably means 'all units'
             (TODO confirm against gradientOptimizer.lua).
             Bug fix: the default used to be the mutable list [-1]; an
             immutable tuple with the same contents avoids the shared
             mutable-default pitfall and is backward compatible.
    mean_file_path -- optional mean file forwarded to the Lua tool
    gpu -- id of the GPU to use (None means CPU / float tensors)
    logger -- unused here; kept for interface compatibility with callers
    """
    if config_value('torch_root') == '<PATHS>':
        torch_bin = 'th'
    else:
        torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

    args = [
        torch_bin,
        os.path.join(os.path.dirname(os.path.dirname(digits.__file__)),
                     'tools', 'torch', 'wrapper.lua'),
        'gradientOptimizer.lua',
        '--network=%s' % os.path.basename(network_path).split(".")[0],
        '--weights=%s' % os.path.split(weights_path)[1],
        '--networkDirectory=%s' % os.path.split(network_path)[0],
        '--height=%s' % height,
        '--width=%s' % width,
        '--chain=%s' % layer,
        '--units=%s' % (','.join(str(x) for x in units))
    ]
    # Convert them all to strings
    args = [str(x) for x in args]

    env = os.environ.copy()

    if mean_file_path is not None:
        args.append('--mean_file_path=%s' % mean_file_path)

    if gpu is not None:
        args.append('--type=cuda')
        # make only the selected GPU visible
        env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
    else:
        args.append('--type=float')

    p = subprocess.Popen(args,
                         cwd=os.path.split(network_path)[0],
                         close_fds=True,
                         env=env)
    p.wait()
def new(extension_id=None):
    """
    Return a form for a new GenericImageModelJob
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets(extension_id)
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    # Resolve the optional data-extension title up front
    if extension_id:
        extension_title = extensions.data.get_extension(extension_id).get_title()
    else:
        extension_title = None

    return flask.render_template(
        'models/images/generic/new.html',
        extension_id=extension_id,
        extension_title=extension_title,
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
        multi_gpu=config_value('caffe')['multi_gpu'],
    )
def new():
    """
    Return a form for a new ImageClassificationModelJob
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    template_args = dict(
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
        multi_gpu=config_value('caffe')['multi_gpu'],
    )
    return flask.render_template('models/images/classification/new.html',
                                 **template_args)
def new(extension_id=None):
    """
    Return a form for a new GenericImageModelJob
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets(extension_id)
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    extension = extensions.data.get_extension(extension_id) if extension_id else None
    title = extension.get_title() if extension else None

    return flask.render_template(
        'models/images/generic/new.html',
        extension_id=extension_id,
        extension_title=title,
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
        multi_gpu=config_value('caffe')['multi_gpu'],
    )
def test_inference_while_training(self):
    """
    Verify that single-image inference succeeds while a training job is
    occupying all configured GPUs (or alongside a normal training job
    when no GPUs are available).
    """
    # make sure we can do inference while all GPUs are in use for training
    # if no GPUs, just test inference during a normal training job

    # get number of GPUs
    if self.FRAMEWORK == 'tensorflow':
        raise unittest.SkipTest('Tensorflow CPU inference during training not supported')
    gpu_count = 1
    if (config_value('gpu_list') and
            config_value('caffe')['cuda_enabled'] and
            config_value('caffe')['multi_gpu']):
        gpu_count = len(config_value('gpu_list').split(','))

    # grab an image for testing
    category = self.imageset_paths.keys()[-1]
    image_path = self.imageset_paths[category][-1]
    image_path = os.path.join(self.imageset_folder, image_path)
    with open(image_path, 'rb') as infile:
        # StringIO wrapping is needed to simulate POST file upload.
        image_upload = (StringIO(infile.read()), 'image.png')

    # create a long-running training job (1000 epochs keeps it busy for
    # the duration of the inference request below)
    job2_id = self.create_model(
        select_gpu_count=gpu_count,
        batch_size=10 * gpu_count,
        train_epochs=1000,
    )
    try:
        # poll until the training job actually starts running
        while True:
            status = self.model_status(job2_id)
            if status in ['Initialized', 'Waiting']:
                time.sleep(0.01)
            elif status == 'Running':
                break
            else:
                raise RuntimeError('job status is %s' % status)

        # inference is issued against self.model_id (a previously trained
        # model), not against the job started above
        rv = self.app.post(
            '/models/images/classification/classify_one/json?job_id=%s' % self.model_id,
            data={'image_file': image_upload}
        )
        # response must at least be valid JSON
        json.loads(rv.data)
        assert rv.status_code == 200, 'POST failed with %s' % rv.status_code
    finally:
        # always clean up the long-running training job
        self.delete_model(job2_id)
def test_select_gpus(self):
    """
    Generate one sub-test per possible combination of the configured GPUs.
    """
    # test all possible combinations
    gpu_list = config_value('gpu_list').split(',')
    for size in xrange(1, len(gpu_list) + 1):
        for combination in itertools.combinations(gpu_list, size):
            if self.FRAMEWORK == 'torch' and len(combination) > 1:
                raise unittest.SkipTest('Torch not tested with multi-GPU')
            yield self.check_select_gpus, combination
def test_inference_while_training(self):
    """
    Verify that single-image inference succeeds while a training job is
    occupying all configured GPUs (or alongside a normal training job
    when no GPUs are available).
    """
    # make sure we can do inference while all GPUs are in use for training
    # if no GPUs, just test inference during a normal training job

    # get number of GPUs
    if self.FRAMEWORK == 'tensorflow':
        raise unittest.SkipTest(
            'Tensorflow CPU inference during training not supported')
    gpu_count = 1
    if (config_value('gpu_list') and
            config_value('caffe')['cuda_enabled'] and
            config_value('caffe')['multi_gpu']):
        gpu_count = len(config_value('gpu_list').split(','))

    # grab an image for testing
    category = self.imageset_paths.keys()[-1]
    image_path = self.imageset_paths[category][-1]
    image_path = os.path.join(self.imageset_folder, image_path)
    with open(image_path, 'rb') as infile:
        # StringIO wrapping is needed to simulate POST file upload.
        image_upload = (StringIO(infile.read()), 'image.png')

    # create a long-running training job (1000 epochs keeps it busy for
    # the duration of the inference request below)
    job2_id = self.create_model(
        select_gpu_count=gpu_count,
        batch_size=10 * gpu_count,
        train_epochs=1000,
    )
    try:
        # poll until the training job actually starts running
        while True:
            status = self.model_status(job2_id)
            if status in ['Initialized', 'Waiting']:
                time.sleep(0.01)
            elif status == 'Running':
                break
            else:
                raise RuntimeError('job status is %s' % status)

        # inference is issued against self.model_id (a previously trained
        # model), not against the job started above
        rv = self.app.post(
            '/models/images/classification/classify_one.json?job_id=%s' % self.model_id,
            data={'image_file': image_upload})
        # response must at least be valid JSON
        json.loads(rv.data)
        assert rv.status_code == 200, 'POST failed with %s' % rv.status_code
    finally:
        # always clean up the long-running training job
        self.delete_model(job2_id)
def test_select_gpus(self):
    """
    Yield one check per possible combination of the configured GPUs.
    """
    # test all possible combinations
    gpu_list = config_value('gpu_list').split(',')
    for combination_size in xrange(1, len(gpu_list) + 1):
        for combination in itertools.combinations(gpu_list, combination_size):
            # Torch is only exercised single-GPU here
            if self.FRAMEWORK == 'torch' and len(combination) > 1:
                raise unittest.SkipTest('Torch not tested with multi-GPU')
            yield self.check_select_gpus, combination
def test_select_gpus(self):
    """
    Yield one check per combination of configured GPUs, capped at 4 GPUs.
    """
    # test all possible combinations
    gpu_list = config_value('gpu_list').split(',')
    for size in range(1, len(gpu_list) + 1):
        for combination in itertools.combinations(gpu_list, size):
            # Don't test more than 4 GPUs
            if len(combination) > 4:
                continue
            yield self.check_select_gpus, combination
def setup_logging():
    """
    Configure the 'digits' and 'digits.webapp' loggers and return a
    JobIdLoggerAdapter for the most specific logger configured.
    """
    # Set custom logger
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
        fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
        datefmt=DATE_FORMAT,
    )

    # digits logger
    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)

    # Log to stdout
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(formatter)
    stdout_handler.setLevel(logging.DEBUG)
    main_logger.addHandler(stdout_handler)

    # digits.webapp logger
    log_file_config = config_value('log_file')
    logfile_filename = log_file_config['filename']
    logfile_level = log_file_config['level']

    if logfile_filename is None:
        print(
            'WARNING: log_file config option not found - no log file is being saved'
        )
        return JobIdLoggerAdapter(main_logger, {})

    webapp_logger = logging.getLogger('digits.webapp')
    webapp_logger.setLevel(logging.DEBUG)

    # Log to file
    file_handler = logging.handlers.RotatingFileHandler(
        logfile_filename,
        maxBytes=(1024 * 1024 * 10),  # 10 MB
        backupCount=10,
    )
    file_handler.setFormatter(formatter)
    # NOTE(review): the configured level is passed straight through --
    # presumably already a logging constant or an upper-case level name;
    # confirm against the config module.
    file_handler.setLevel(logfile_level)
    webapp_logger.addHandler(file_handler)

    # Useful shortcut for the webapp, which may set job_id
    return JobIdLoggerAdapter(webapp_logger, {})
def get_view_extensions():
    """
    return all enabled view extensions as an {id: title} mapping
    """
    return {extension.get_id(): extension.get_title()
            for extension in config_value('view_extension_list')}
def setUpClass(cls):
    """
    Skip the whole test class unless Torch is configured and enabled.
    """
    skipIfNotFramework('torch')
    if cls.FRAMEWORK == 'torch' and not config_value('torch')['enabled']:
        raise unittest.SkipTest('Torch not found')

    # Call super.setUpClass() unless we're the last in the class hierarchy
    parent = super(TorchMixin, cls)
    if hasattr(parent, 'setUpClass'):
        parent.setUpClass()
def setUpClass(cls):
    """
    Skip the whole test class unless Caffe is configured and loaded.
    """
    skipIfNotFramework('caffe')
    if cls.FRAMEWORK == 'caffe' and not config_value('caffe')['loaded']:
        raise unittest.SkipTest('Caffe not found')

    # Call super.setUpClass() unless we're the last in the class hierarchy
    parent = super(CaffeMixin, cls)
    if hasattr(parent, 'setUpClass'):
        parent.setUpClass()
def setup_logging():
    """
    Configure the 'digits' stdout logger and, when a log file is configured,
    a rotating file logger for 'digits.webapp'.

    Returns a JobIdLoggerAdapter for the most specific logger configured.
    """
    # Install the custom logger class before any getLogger() calls below
    logging.setLoggerClass(JobIdLogger)

    formatter = logging.Formatter(
        fmt="%(asctime)s%(job_id)s [%(levelname)-5s] %(message)s",
        datefmt=DATE_FORMAT,
    )

    # digits logger
    main_logger = logging.getLogger('digits')
    main_logger.setLevel(logging.DEBUG)

    # Log to stdout
    to_stdout = logging.StreamHandler(sys.stdout)
    to_stdout.setFormatter(formatter)
    to_stdout.setLevel(logging.DEBUG)
    main_logger.addHandler(to_stdout)

    # digits.webapp logger
    log_file_config = config_value('log_file')
    logfile_filename = log_file_config['filename']
    logfile_level = log_file_config['level']

    if logfile_filename is None:
        print('WARNING: log_file config option not found - no log file is being saved')
        return JobIdLoggerAdapter(main_logger, {})

    webapp_logger = logging.getLogger('digits.webapp')
    webapp_logger.setLevel(logging.DEBUG)

    # Rotating file handler, 10 MB per file, 10 backups
    to_file = logging.handlers.RotatingFileHandler(
        logfile_filename,
        maxBytes=(1024 * 1024 * 10),  # 10 MB
        backupCount=10,
    )
    to_file.setFormatter(formatter)
    # NOTE(review): the configured level is passed straight through --
    # presumably already a logging constant or an upper-case level name;
    # confirm against the config module.
    to_file.setLevel(logfile_level)
    webapp_logger.addHandler(to_file)

    # Useful shortcut for the webapp, which may set job_id
    return JobIdLoggerAdapter(webapp_logger, {})
def get_net(self, epoch=None):
    """
    Returns an instance of caffe.Net

    Keyword Arguments:
    epoch -- which snapshot to load (default is -1 to load the most
        recently generated snapshot)
    """
    if not self.has_model():
        return False

    file_to_load = None

    # resolve which snapshot file to load
    if not epoch:
        # no epoch given: fall back to the most recent snapshot
        epoch = self.snapshots[-1][1]
        file_to_load = self.snapshots[-1][0]
    else:
        for snapshot_file, snapshot_epoch in self.snapshots:
            if snapshot_epoch == epoch:
                file_to_load = snapshot_file
                break
    if file_to_load is None:
        raise Exception('snapshot not found for epoch "%s"' % epoch)

    # check if already loaded
    if (
        self.loaded_snapshot_file
        and self.loaded_snapshot_file == file_to_load
        and hasattr(self, "_caffe_net")
        and self._caffe_net is not None
    ):
        return self._caffe_net

    # use GPU mode only when CUDA is enabled and at least one GPU is configured
    if config_value("caffe_root")["cuda_enabled"] and config_value("gpu_list"):
        caffe.set_mode_gpu()

    # load a new model
    self._caffe_net = caffe.Net(self.path(self.deploy_file), file_to_load, caffe.TEST)

    # remember what was loaded so repeated calls can reuse the cached net
    self.loaded_snapshot_epoch = epoch
    self.loaded_snapshot_file = file_to_load

    return self._caffe_net
def get_net(self, epoch=None):
    """
    Returns an instance of caffe.Net

    Keyword Arguments:
    epoch -- which snapshot to load (default is -1 to load the most
        recently generated snapshot)
    """
    if not self.has_model():
        return False

    file_to_load = None

    # resolve which snapshot file to load
    if not epoch:
        # no epoch given: fall back to the most recent snapshot
        epoch = self.snapshots[-1][1]
        file_to_load = self.snapshots[-1][0]
    else:
        for snapshot_file, snapshot_epoch in self.snapshots:
            if snapshot_epoch == epoch:
                file_to_load = snapshot_file
                break
    if file_to_load is None:
        raise Exception('snapshot not found for epoch "%s"' % epoch)

    # check if already loaded
    if self.loaded_snapshot_file and self.loaded_snapshot_file == file_to_load \
            and hasattr(self, '_caffe_net') and self._caffe_net is not None:
        return self._caffe_net

    # use GPU mode only when CUDA is enabled and at least one GPU is configured
    if config_value('caffe_root')['cuda_enabled'] and\
            config_value('gpu_list'):
        caffe.set_mode_gpu()

    # load a new model
    self._caffe_net = caffe.Net(self.path(self.deploy_file), file_to_load, caffe.TEST)

    # remember what was loaded so repeated calls can reuse the cached net
    self.loaded_snapshot_epoch = epoch
    self.loaded_snapshot_file = file_to_load

    return self._caffe_net
def test_inference_while_training(self):
    """
    Verify that single-image inference succeeds while a training job is
    occupying all configured GPUs (or alongside a normal training job
    when no GPUs are available).
    """
    # make sure we can do inference while all GPUs are in use for training
    # if no GPUs, just test inference during a normal training job

    # get number of GPUs
    gpu_count = 1
    if config_value("gpu_list") and config_value("caffe")["cuda_enabled"] and config_value("caffe")["multi_gpu"]:
        gpu_count = len(config_value("gpu_list").split(","))

    # grab an image for testing
    category = self.imageset_paths.keys()[-1]
    image_path = self.imageset_paths[category][-1]
    image_path = os.path.join(self.imageset_folder, image_path)
    with open(image_path, "rb") as infile:
        # StringIO wrapping is needed to simulate POST file upload.
        image_upload = (StringIO(infile.read()), "image.png")

    # create a long-running training job (1000 epochs keeps it busy for
    # the duration of the inference request below)
    job2_id = self.create_model(select_gpu_count=gpu_count,
                                batch_size=10 * gpu_count,
                                train_epochs=1000)
    try:
        # poll until the training job actually starts running
        while True:
            status = self.model_status(job2_id)
            if status in ["Initialized", "Waiting"]:
                time.sleep(0.01)
            elif status == "Running":
                break
            else:
                raise RuntimeError("job status is %s" % status)

        # inference is issued against self.model_id (a previously trained
        # model), not against the job started above
        rv = self.app.post(
            "/models/images/classification/classify_one.json?job_id=%s" % self.model_id,
            data={"image_file": image_upload},
        )
        # response must at least be valid JSON
        data = json.loads(rv.data)
        assert rv.status_code == 200, "POST failed with %s" % rv.status_code
    finally:
        # always clean up the long-running training job
        self.delete_model(job2_id)
def task_arguments(self, resources):
    """
    Build the caffe 'train' command line for this task.
    """
    executable = config_value("caffe_root")["executable"]
    args = [executable, "train", "--solver=%s" % self.path(self.solver_file)]

    if "gpus" in resources:
        identifiers = [identifier for identifier, value in resources["gpus"]]
        if len(identifiers) == 1:
            args.append("--gpu=%s" % identifiers[0])
        elif len(identifiers) > 1:
            args.append("--gpus=%s" % ",".join(identifiers))
    if self.pretrained_model:
        args.append("--weights=%s" % self.path(self.pretrained_model))

    return args
def __init__(self, name):
    """
    Arguments:
    name -- name of this job
    """
    super(Job, self).__init__()

    # unique ID: creation timestamp plus two random hex-encoded bytes
    timestamp = time.strftime('%Y%m%d-%H%M%S')
    self._id = '%s-%s' % (timestamp, os.urandom(2).encode('hex'))
    self._dir = os.path.join(config_value('jobs_dir'), self._id)
    self._name = name
    self.pickver_job = PICKLE_VERSION
    self.tasks = []
    self.exception = None

    os.mkdir(self._dir)
def generic_image_model_new():
    """
    Return a form for a new GenericImageModelJob
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    return flask.render_template(
        'models/images/generic/new.html',
        form=form,
        previous_network_snapshots=prev_network_snapshots,
        multi_gpu=config_value('caffe_root')['multi_gpu'],
    )
def image_classification_model_new():
    """
    Return a form for a new ImageClassificationModelJob
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    return flask.render_template(
        'models/images/classification/new.html',
        form=form,
        previous_network_snapshots=prev_network_snapshots,
        multi_gpu=config_value('caffe_root')['multi_gpu'],
    )
def task_arguments(self, resources):
    """
    Build the command line used to launch caffe for this training task.
    """
    args = [
        config_value('caffe_root')['executable'],
        'train',
        '--solver=%s' % self.path(self.solver_file),
    ]

    if 'gpus' in resources:
        identifiers = [identifier for identifier, value in resources['gpus']]
        if len(identifiers) == 1:
            args.append('--gpu=%s' % identifiers[0])
        elif len(identifiers) > 1:
            args.append('--gpus=%s' % ','.join(identifiers))
    if self.pretrained_model:
        args.append('--weights=%s' % self.path(self.pretrained_model))

    return args
def __init__(self, name, workspace):
    """
    Arguments:
    name -- name of this job
    workspace -- name of workspace to which the new job belongs to
    """
    super(Job, self).__init__()

    # create a unique ID: timestamp plus two random hex-encoded bytes
    self._id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), os.urandom(2).encode('hex'))
    # NOTE(review): despite the docstring, 'workspace' is used here as a
    # dict carrying a 'workspace_id' key that resolves to an Organization
    # row -- confirm with callers.
    workspace = Organization.objects.get(id = workspace['workspace_id'])
    # record the job/workspace association; .save() after .create() looks
    # redundant for most ORMs but is kept as-is
    WorkspaceJob.objects.create(job_id = self._id, workspace = workspace).save()
    self._dir = os.path.join(config_value('jobs_dir'), self._id)
    self._name = name
    self.pickver_job = PICKLE_VERSION
    self.tasks = []
    self.exception = None

    os.mkdir(self._dir)
def load(cls, job_id):
    """
    Loads a Job in the given job_id
    Returns the Job or throws an exception

    Arguments:
    job_id -- the id of the job directory under jobs_dir
    """
    # local import to avoid a circular dependency at module load time
    from digits.model.tasks import TrainTask

    job_dir = os.path.join(config_value('jobs_dir'), job_id)
    filename = os.path.join(job_dir, cls.SAVE_FILE)
    with open(filename, 'rb') as savefile:
        # NOTE(review): unpickling a file from disk -- safe only as long as
        # jobs_dir contents are trusted
        job = pickle.load(savefile)
        # Reset this on load
        job._dir = job_dir
        for task in job.tasks:
            task.job_dir = job_dir
            if isinstance(task, TrainTask):
                # can't call this until the job_dir is set
                task.detect_snapshots()
        return job
def task_arguments(self, resources):
    """
    Assemble the caffe 'train' command line for this task.
    """
    args = [config_value('caffe_root')['executable'],
            'train',
            '--solver=%s' % self.path(self.solver_file)]

    if 'gpus' in resources:
        identifiers = []
        for identifier, value in resources['gpus']:
            identifiers.append(identifier)
        count = len(identifiers)
        if count == 1:
            args.append('--gpu=%s' % identifiers[0])
        elif count > 1:
            args.append('--gpus=%s' % ','.join(identifiers))
    if self.pretrained_model:
        args.append('--weights=%s' % self.path(self.pretrained_model))

    return args
def path(self, filename, relative=False):
    """
    Returns a path to the given file

    Arguments:
    filename -- the requested file

    Keyword arguments:
    relative -- If False, return an absolute path to the file
                If True, return a path relative to the jobs directory
    """
    if not filename:
        return None

    # absolute paths pass through; everything else is anchored in the job dir
    if os.path.isabs(filename):
        resolved = filename
    else:
        resolved = os.path.join(self._dir, filename)
    if relative:
        resolved = os.path.relpath(resolved, config_value('jobs_dir'))
    # normalize Windows separators for consistent display/storage
    return str(resolved).replace("\\", "/")
def image_classification_model_new():
    """
    Return a form for a new ImageClassificationModelJob
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    template_args = dict(
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        multi_gpu=config_value("caffe_root")["multi_gpu"],
    )
    return flask.render_template("models/images/classification/new.html",
                                 **template_args)
def __init__(self, name, username, persistent = True):
    """
    Arguments:
    name -- name of this job
    username -- creator of this job

    Keyword arguments:
    persistent -- stored as a flag on the instance (TODO confirm semantics
                  with the scheduler that reads it)
    """
    super(Job, self).__init__()

    # unique ID: creation timestamp plus two random hex-encoded bytes
    stamp = time.strftime('%Y%m%d-%H%M%S')
    self._id = '%s-%s' % (stamp, os.urandom(2).encode('hex'))
    self._dir = os.path.join(config_value('jobs_dir'), self._id)
    self._name = name
    self.username = username
    self.pickver_job = PICKLE_VERSION
    self.tasks = []
    self.exception = None
    self._notes = None
    self.event = threading.Event()
    self.persistent = persistent

    os.mkdir(self._dir)
def generic_image_model_new():
    """
    Return a form for a new GenericImageModelJob
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    return flask.render_template(
        'models/images/generic/new.html',
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        multi_gpu=config_value('caffe_root')['multi_gpu'],
    )
def __init__(self, name, username, group='', persistent=True):
    """
    Arguments:
    name -- name of this job
    username -- creator of this job

    Keyword arguments:
    group -- group name stored on the job
    persistent -- stored as a flag on the instance (TODO confirm semantics
                  with the scheduler that reads it)
    """
    super(Job, self).__init__()

    # create a unique ID: timestamp plus two random hex-encoded bytes.
    # Bug fix: codecs.encode(..., 'hex') returns bytes on Python 3, so the
    # '%s' interpolation would embed "b'..'" into the ID (and hence into the
    # on-disk directory name); decode to a str first. On Python 2 the decode
    # is a harmless no-op conversion.
    suffix = codecs.encode(os.urandom(2), 'hex').decode('ascii')
    self._id = '%s-%s' % (time.strftime('%Y%m%d-%H%M%S'), suffix)
    self._dir = os.path.join(config_value('jobs_dir'), self._id)
    self._name = name
    self.group = group
    self.username = username
    self.pickver_job = PICKLE_VERSION
    self.tasks = []
    self.exception = None
    self._notes = None
    self.event = threading.Event()
    self.persistent = persistent

    os.mkdir(self._dir)
def new():
    """
    Return a form for a new ImageClassificationModelJob
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    return flask.render_template(
        'models/images/classification/new.html',
        form=form,
        frameworks=frameworks.get_frameworks(),
        previous_network_snapshots=prev_network_snapshots,
        previous_networks_fullinfo=get_previous_networks_fulldetails(),
        pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
        multi_gpu=config_value('caffe')['multi_gpu'],
    )
def create():
    """
    Create a new ImageClassificationModelJob

    Returns JSON when requested: {job_id,name,status} or {errors:[]}

    Builds one job per entry in the cartesian product of the swept
    learning_rate and batch_size form values; on any failure the
    partially-created job is deleted and the exception re-raised.
    """
    form = ImageClassificationModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = get_standard_networks()
    form.standard_networks.default = get_default_standard_network()
    form.previous_networks.choices = get_previous_networks()
    form.pretrained_networks.choices = get_pretrained_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    # Is there a request to clone a job with ?clone=<job_id>
    fill_form_if_cloned(form)

    if not form.validate_on_submit():
        if request_wants_json():
            return flask.jsonify({'errors': form.errors}), 400
        else:
            return flask.render_template('models/images/classification/new.html',
                                         form=form,
                                         frameworks=frameworks.get_frameworks(),
                                         previous_network_snapshots=prev_network_snapshots,
                                         previous_networks_fullinfo=get_previous_networks_fulldetails(),
                                         pretrained_networks_fullinfo=get_pretrained_networks_fulldetails(),
                                         multi_gpu=config_value('caffe')['multi_gpu'],
                                         ), 400

    datasetJob = scheduler.get_job(form.dataset.data)
    if not datasetJob:
        raise werkzeug.exceptions.BadRequest(
            'Unknown dataset job_id "%s"' % form.dataset.data)

    # sweeps will be a list of the the permutations of swept fields
    # Get swept learning_rate
    sweeps = [{'learning_rate': v} for v in form.learning_rate.data]
    add_learning_rate = len(form.learning_rate.data) > 1

    # Add swept batch_size
    # NOTE: dict(s.items() + [...]) relies on Python 2 list-returning items()
    sweeps = [dict(s.items() + [('batch_size', bs)]) for bs in form.batch_size.data for s in sweeps[:]]
    add_batch_size = len(form.batch_size.data) > 1
    n_jobs = len(sweeps)

    jobs = []
    for sweep in sweeps:
        # Populate the form with swept data to be used in saving and
        # launching jobs.
        # NOTE(review): sweep['learning_rate'] is a scalar taken from the
        # original list, yet form.learning_rate.data[0] is indexed below —
        # presumably the form field re-wraps scalars as lists; TODO confirm.
        form.learning_rate.data = sweep['learning_rate']
        form.batch_size.data = sweep['batch_size']

        # Augment Job Name with the swept values so jobs are distinguishable
        extra = ''
        if add_learning_rate:
            extra += ' learning_rate:%s' % str(form.learning_rate.data[0])
        if add_batch_size:
            extra += ' batch_size:%d' % form.batch_size.data[0]

        job = None
        try:
            job = ImageClassificationModelJob(
                username=utils.auth.get_username(),
                name=form.model_name.data + extra,
                group=form.group_name.data,
                dataset_id=datasetJob.id(),
            )
            # get handle to framework object
            fw = frameworks.get_framework_by_id(form.framework.data)

            pretrained_model = None
            if form.method.data == 'standard':
                found = False

                # can we find it in standard networks?
                network_desc = fw.get_standard_network_desc(form.standard_networks.data)
                if network_desc:
                    found = True
                    network = fw.get_network_from_desc(network_desc)

                if not found:
                    raise werkzeug.exceptions.BadRequest(
                        'Unknown standard model "%s"' % form.standard_networks.data)
            elif form.method.data == 'previous':
                old_job = scheduler.get_job(form.previous_networks.data)
                if not old_job:
                    raise werkzeug.exceptions.BadRequest(
                        'Job not found: %s' % form.previous_networks.data)

                use_same_dataset = (old_job.dataset_id == job.dataset_id)
                network = fw.get_network_from_previous(old_job.train_task().network, use_same_dataset)

                for choice in form.previous_networks.choices:
                    if choice[0] == form.previous_networks.data:
                        # epoch 0 means "no pretrained weights";
                        # epoch -1 means "reuse the old job's pretrained model"
                        epoch = float(flask.request.form['%s-snapshot' % form.previous_networks.data])
                        if epoch == 0:
                            pass
                        elif epoch == -1:
                            pretrained_model = old_job.train_task().pretrained_model
                        else:
                            # verify snapshot exists
                            pretrained_model = old_job.train_task().get_snapshot(epoch, download=True)
                            if pretrained_model is None:
                                raise werkzeug.exceptions.BadRequest(
                                    "For the job %s, selected pretrained_model for epoch %d is invalid!"
                                    % (form.previous_networks.data, epoch))
                            # the first is the actual file if a list is returned, other should be meta data
                            if isinstance(pretrained_model, list):
                                pretrained_model = pretrained_model[0]

                            if not (os.path.exists(pretrained_model)):
                                raise werkzeug.exceptions.BadRequest(
                                    "Pretrained_model for the selected epoch doesn't exist. "
                                    "May be deleted by another user/process. "
                                    "Please restart the server to load the correct pretrained_model details.")
                            # get logical path
                            pretrained_model = old_job.train_task().get_snapshot(epoch)
                        break
            elif form.method.data == 'pretrained':
                pretrained_job = scheduler.get_job(form.pretrained_networks.data)
                model_def_path = pretrained_job.get_model_def_path()
                weights_path = pretrained_job.get_weights_path()

                network = fw.get_network_from_path(model_def_path)
                pretrained_model = weights_path
            elif form.method.data == 'custom':
                network = fw.get_network_from_desc(form.custom_network.data)
                pretrained_model = form.custom_network_snapshot.data.strip()
            else:
                raise werkzeug.exceptions.BadRequest(
                    'Unrecognized method: "%s"' % form.method.data)

            # Translate the selected learning-rate policy into the
            # parameter dict the training task expects
            policy = {'policy': form.lr_policy.data}
            if form.lr_policy.data == 'fixed':
                pass
            elif form.lr_policy.data == 'step':
                policy['stepsize'] = form.lr_step_size.data
                policy['gamma'] = form.lr_step_gamma.data
            elif form.lr_policy.data == 'multistep':
                policy['stepvalue'] = form.lr_multistep_values.data
                policy['gamma'] = form.lr_multistep_gamma.data
            elif form.lr_policy.data == 'exp':
                policy['gamma'] = form.lr_exp_gamma.data
            elif form.lr_policy.data == 'inv':
                policy['gamma'] = form.lr_inv_gamma.data
                policy['power'] = form.lr_inv_power.data
            elif form.lr_policy.data == 'poly':
                policy['power'] = form.lr_poly_power.data
            elif form.lr_policy.data == 'sigmoid':
                policy['stepsize'] = form.lr_sigmoid_step.data
                policy['gamma'] = form.lr_sigmoid_gamma.data
            else:
                raise werkzeug.exceptions.BadRequest(
                    'Invalid learning rate policy')

            # Resolve GPU selection: either an explicit list of GPU ids
            # or a count for the scheduler to satisfy (never both)
            if config_value('caffe')['multi_gpu']:
                if form.select_gpus.data:
                    selected_gpus = [str(gpu) for gpu in form.select_gpus.data]
                    gpu_count = None
                elif form.select_gpu_count.data:
                    gpu_count = form.select_gpu_count.data
                    selected_gpus = None
                else:
                    gpu_count = 1
                    selected_gpus = None
            else:
                if form.select_gpu.data == 'next':
                    gpu_count = 1
                    selected_gpus = None
                else:
                    selected_gpus = [str(form.select_gpu.data)]
                    gpu_count = None

            # Set up data augmentation structure
            data_aug = {}
            data_aug['flip'] = form.aug_flip.data
            data_aug['quad_rot'] = form.aug_quad_rot.data
            data_aug['rot'] = form.aug_rot.data
            data_aug['scale'] = form.aug_scale.data
            data_aug['noise'] = form.aug_noise.data
            data_aug['contrast'] = form.aug_contrast.data
            data_aug['whitening'] = form.aug_whitening.data
            data_aug['hsv_use'] = form.aug_hsv_use.data
            data_aug['hsv_h'] = form.aug_hsv_h.data
            data_aug['hsv_s'] = form.aug_hsv_s.data
            data_aug['hsv_v'] = form.aug_hsv_v.data

            # Python Layer File may be on the server or copied from the client.
            fs.copy_python_layer_file(
                bool(form.python_layer_from_client.data),
                job.dir(),
                (flask.request.files[form.python_layer_client_file.name]
                 if form.python_layer_client_file.name in flask.request.files
                 else ''),
                form.python_layer_server_file.data)

            job.tasks.append(fw.create_train_task(
                job=job,
                dataset=datasetJob,
                train_epochs=form.train_epochs.data,
                snapshot_interval=form.snapshot_interval.data,
                learning_rate=form.learning_rate.data[0],
                lr_policy=policy,
                gpu_count=gpu_count,
                selected_gpus=selected_gpus,
                batch_size=form.batch_size.data[0],
                batch_accumulation=form.batch_accumulation.data,
                val_interval=form.val_interval.data,
                traces_interval=form.traces_interval.data,
                pretrained_model=pretrained_model,
                crop_size=form.crop_size.data,
                use_mean=form.use_mean.data,
                network=network,
                random_seed=form.random_seed.data,
                solver_type=form.solver_type.data,
                rms_decay=form.rms_decay.data,
                shuffle=form.shuffle.data,
                data_aug=data_aug,
            )
            )

            # Save form data with the job so we can easily clone it later.
            save_form_to_job(job, form)

            jobs.append(job)
            scheduler.add_job(job)
            # With a single (non-swept) job, respond immediately with it
            if n_jobs == 1:
                if request_wants_json():
                    return flask.jsonify(job.json_dict())
                else:
                    return flask.redirect(flask.url_for('digits.model.views.show', job_id=job.id()))

        except:
            # clean up the partially-created job before propagating
            if job:
                scheduler.delete_job(job)
            raise

    if request_wants_json():
        return flask.jsonify(jobs=[j.json_dict() for j in jobs])

    # If there are multiple jobs launched, go to the home page.
    return flask.redirect('/')
def setUpClass(cls):
    """Skip the whole class when it targets Torch but Torch is unconfigured."""
    super(BaseViewsTest, cls).setUpClass()
    torch_unavailable = (cls.FRAMEWORK == 'torch' and
                         not config_value('torch_root'))
    if torch_unavailable:
        raise unittest.SkipTest('Torch not found')
def task_arguments(self, resources, env):
    """
    Build the command line (list of argv strings) that launches the Torch
    training wrapper for this task, and set CUDA_VISIBLE_DEVICES in `env`
    when GPUs were allocated in `resources`.
    """
    # '<PATHS>' is the sentinel meaning "torch is on PATH"
    if config_value('torch_root') == '<PATHS>':
        torch_bin = 'th'
    else:
        torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

    dataset_backend = self.dataset.get_backend()
    assert dataset_backend=='lmdb' or dataset_backend=='hdf5'

    args = [torch_bin,
            os.path.join(os.path.dirname(os.path.dirname(digits.__file__)), 'tools', 'torch', 'wrapper.lua'),
            'main.lua',
            '--network=%s' % self.model_file.split(".")[0],
            '--epoch=%d' % int(self.train_epochs),
            '--networkDirectory=%s' % self.job_dir,
            '--save=%s' % self.job_dir,
            '--snapshotPrefix=%s' % self.snapshot_prefix,
            '--snapshotInterval=%s' % self.snapshot_interval,
            '--learningRate=%s' % self.learning_rate,
            '--policy=%s' % str(self.lr_policy['policy']),
            '--dbbackend=%s' % dataset_backend
            ]

    if self.batch_size is not None:
        args.append('--batchSize=%d' % self.batch_size)

    if self.use_mean != 'none':
        filename = self.create_mean_file()
        args.append('--mean=%s' % filename)

    if hasattr(self.dataset, 'labels_file'):
        args.append('--labels=%s' % self.dataset.path(self.dataset.labels_file))

    # Feature/label databases; validation ones may be absent
    train_feature_db_path = self.dataset.get_feature_db_path(constants.TRAIN_DB)
    train_label_db_path = self.dataset.get_label_db_path(constants.TRAIN_DB)
    val_feature_db_path = self.dataset.get_feature_db_path(constants.VAL_DB)
    val_label_db_path = self.dataset.get_label_db_path(constants.VAL_DB)

    args.append('--train=%s' % train_feature_db_path)
    if train_label_db_path:
        args.append('--train_labels=%s' % train_label_db_path)
    if val_feature_db_path:
        args.append('--validation=%s' % val_feature_db_path)
    if val_label_db_path:
        args.append('--validation_labels=%s' % val_label_db_path)

    # learning rate policy input parameters
    if self.lr_policy['policy'] == 'fixed':
        pass
    elif self.lr_policy['policy'] == 'step':
        args.append('--gamma=%s' % self.lr_policy['gamma'])
        args.append('--stepvalues=%s' % self.lr_policy['stepsize'])
    elif self.lr_policy['policy'] == 'multistep':
        args.append('--stepvalues=%s' % self.lr_policy['stepvalue'])
        args.append('--gamma=%s' % self.lr_policy['gamma'])
    elif self.lr_policy['policy'] == 'exp':
        args.append('--gamma=%s' % self.lr_policy['gamma'])
    elif self.lr_policy['policy'] == 'inv':
        args.append('--gamma=%s' % self.lr_policy['gamma'])
        args.append('--power=%s' % self.lr_policy['power'])
    elif self.lr_policy['policy'] == 'poly':
        args.append('--power=%s' % self.lr_policy['power'])
    elif self.lr_policy['policy'] == 'sigmoid':
        args.append('--stepvalues=%s' % self.lr_policy['stepsize'])
        args.append('--gamma=%s' % self.lr_policy['gamma'])

    if self.shuffle:
        args.append('--shuffle=yes')

    if self.crop_size:
        args.append('--crop=yes')
        args.append('--croplen=%d' % self.crop_size)

    if self.use_mean == 'pixel':
        args.append('--subtractMean=pixel')
    elif self.use_mean == 'image':
        args.append('--subtractMean=image')
    else:
        args.append('--subtractMean=none')

    if self.random_seed is not None:
        args.append('--seed=%s' % self.random_seed)

    # Map DIGITS solver-type names to torch optimizer names
    if self.solver_type == 'SGD':
        args.append('--optimization=sgd')
    elif self.solver_type == 'NESTEROV':
        args.append('--optimization=nag')
    elif self.solver_type == 'ADAGRAD':
        args.append('--optimization=adagrad')
    elif self.solver_type == 'RMSPROP':
        args.append('--optimization=rmsprop')
    elif self.solver_type == 'ADADELTA':
        args.append('--optimization=adadelta')
    elif self.solver_type == 'ADAM':
        args.append('--optimization=adam')
    else:
        raise ValueError('Unknown solver_type %s' % self.solver_type)

    if self.val_interval > 0:
        args.append('--interval=%s' % self.val_interval)

    if 'gpus' in resources:
        identifiers = []
        for identifier, value in resources['gpus']:
            identifiers.append(identifier)
        # make all selected GPUs visible to the Torch 'th' process.
        # don't make other GPUs visible though since Torch will load
        # CUDA libraries and allocate memory on all visible GPUs by
        # default.
        env['CUDA_VISIBLE_DEVICES'] = ','.join(identifiers)
        # switch to GPU mode
        args.append('--type=cuda')
    else:
        # switch to CPU mode
        args.append('--type=float')

    if self.pretrained_model:
        filenames = self.pretrained_model.split(os.path.pathsep)
        if len(filenames) > 1:
            raise ValueError('Torch does not support multiple pretrained model files')
        args.append('--weights=%s' % self.path(filenames[0]))

    # Augmentations
    assert self.data_aug['flip'] in ['none', 'fliplr', 'flipud', 'fliplrud'], 'Bad or unknown flag "flip"'
    args.append('--augFlip=%s' % self.data_aug['flip'])

    assert self.data_aug['quad_rot'] in ['none', 'rot90', 'rot180', 'rotall'], 'Bad or unknown flag "quad_rot"'
    args.append('--augQuadRot=%s' % self.data_aug['quad_rot'])

    if self.data_aug['rot']:
        args.append('--augRot=%s' % self.data_aug['rot'])

    if self.data_aug['scale']:
        args.append('--augScale=%s' % self.data_aug['scale'])

    if self.data_aug['noise']:
        args.append('--augNoise=%s' % self.data_aug['noise'])

    if self.data_aug['hsv_use']:
        args.append('--augHSVh=%s' % self.data_aug['hsv_h'])
        args.append('--augHSVs=%s' % self.data_aug['hsv_s'])
        args.append('--augHSVv=%s' % self.data_aug['hsv_v'])
    else:
        args.append('--augHSVh=0')
        args.append('--augHSVs=0')
        args.append('--augHSVv=0')

    return args
def test_select_gpu(self):
    """Yield one sub-test per GPU index in the configured 'gpu_list'."""
    gpu_indices = config_value('gpu_list').split(',')
    for gpu_index in gpu_indices:
        yield self.check_select_gpu, gpu_index
def test_select_gpus(self):
    """Yield a sub-test for every non-empty combination of configured GPUs."""
    available = config_value('gpu_list').split(',')
    for size in xrange(1, len(available) + 1):
        for combo in itertools.combinations(available, size):
            yield self.check_select_gpus, combo
def generic_image_model_create():
    """
    Create a new GenericImageModelJob

    Returns JSON when requested: {job_id,name,status} or {errors:[]}

    On any failure after the job object is created, the job is deleted
    from the scheduler and the exception re-raised.
    """
    form = GenericImageModelForm()
    form.dataset.choices = get_datasets()
    form.standard_networks.choices = []
    form.previous_networks.choices = get_previous_networks()

    prev_network_snapshots = get_previous_network_snapshots()

    if not form.validate_on_submit():
        if request_wants_json():
            return flask.jsonify({'errors': form.errors}), 400
        else:
            return flask.render_template('models/images/generic/new.html',
                                         form = form,
                                         previous_network_snapshots = prev_network_snapshots,
                                         previous_networks_fullinfo = get_previous_networks_fulldetails(),
                                         multi_gpu = config_value('caffe_root')['multi_gpu'],
                                         ), 400

    datasetJob = scheduler.get_job(form.dataset.data)
    if not datasetJob:
        raise werkzeug.exceptions.BadRequest(
            'Unknown dataset job_id "%s"' % form.dataset.data)

    job = None
    try:
        job = GenericImageModelJob(
            name = form.model_name.data,
            dataset_id = datasetJob.id(),
        )

        # get framework (hard-coded to caffe for now)
        fw = frameworks.get_framework_by_id('caffe')

        pretrained_model = None
        # if form.method.data == 'standard':
        if form.method.data == 'previous':
            old_job = scheduler.get_job(form.previous_networks.data)
            if not old_job:
                raise werkzeug.exceptions.BadRequest(
                    'Job not found: %s' % form.previous_networks.data)

            network = fw.get_network_from_previous(old_job.train_task().network)

            for choice in form.previous_networks.choices:
                if choice[0] == form.previous_networks.data:
                    # epoch 0 -> no pretrained weights;
                    # epoch -1 -> reuse the old job's pretrained model
                    epoch = float(flask.request.form['%s-snapshot' % form.previous_networks.data])
                    if epoch == 0:
                        pass
                    elif epoch == -1:
                        pretrained_model = old_job.train_task().pretrained_model
                    else:
                        # look up the snapshot file for the requested epoch
                        for filename, e in old_job.train_task().snapshots:
                            if e == epoch:
                                pretrained_model = filename
                                break

                        if pretrained_model is None:
                            raise werkzeug.exceptions.BadRequest(
                                "For the job %s, selected pretrained_model for epoch %d is invalid!"
                                % (form.previous_networks.data, epoch))

                        # NOTE(review): user-facing message below contains a
                        # typo ("doesn't exists") — left as-is here since it
                        # is a runtime string
                        if not (os.path.exists(pretrained_model)):
                            raise werkzeug.exceptions.BadRequest(
                                "Pretrained_model for the selected epoch doesn't exists. May be deleted by another user/process. Please restart the server to load the correct pretrained_model details")
                    break

        elif form.method.data == 'custom':
            network = fw.get_network_from_desc(form.custom_network.data)
            pretrained_model = form.custom_network_snapshot.data.strip()
        else:
            raise werkzeug.exceptions.BadRequest(
                'Unrecognized method: "%s"' % form.method.data)

        # Translate the selected learning-rate policy into the
        # parameter dict the training task expects
        policy = {'policy': form.lr_policy.data}
        if form.lr_policy.data == 'fixed':
            pass
        elif form.lr_policy.data == 'step':
            policy['stepsize'] = form.lr_step_size.data
            policy['gamma'] = form.lr_step_gamma.data
        elif form.lr_policy.data == 'multistep':
            policy['stepvalue'] = form.lr_multistep_values.data
            policy['gamma'] = form.lr_multistep_gamma.data
        elif form.lr_policy.data == 'exp':
            policy['gamma'] = form.lr_exp_gamma.data
        elif form.lr_policy.data == 'inv':
            policy['gamma'] = form.lr_inv_gamma.data
            policy['power'] = form.lr_inv_power.data
        elif form.lr_policy.data == 'poly':
            policy['power'] = form.lr_poly_power.data
        elif form.lr_policy.data == 'sigmoid':
            policy['stepsize'] = form.lr_sigmoid_step.data
            policy['gamma'] = form.lr_sigmoid_gamma.data
        else:
            raise werkzeug.exceptions.BadRequest(
                'Invalid learning rate policy')

        # Resolve GPU selection: either a count for the scheduler to
        # satisfy or an explicit list of GPU ids (never both)
        if config_value('caffe_root')['multi_gpu']:
            if form.select_gpu_count.data:
                gpu_count = form.select_gpu_count.data
                selected_gpus = None
            else:
                selected_gpus = [str(gpu) for gpu in form.select_gpus.data]
                gpu_count = None
        else:
            if form.select_gpu.data == 'next':
                gpu_count = 1
                selected_gpus = None
            else:
                selected_gpus = [str(form.select_gpu.data)]
                gpu_count = None

        job.tasks.append(fw.create_train_task(
                    job_dir = job.dir(),
                    dataset = datasetJob,
                    train_epochs = form.train_epochs.data,
                    snapshot_interval = form.snapshot_interval.data,
                    learning_rate = form.learning_rate.data,
                    lr_policy = policy,
                    gpu_count = gpu_count,
                    selected_gpus = selected_gpus,
                    batch_size = form.batch_size.data,
                    val_interval = form.val_interval.data,
                    pretrained_model= pretrained_model,
                    crop_size = form.crop_size.data,
                    use_mean = bool(form.use_mean.data),
                    network = network,
                    random_seed = form.random_seed.data,
                    solver_type = form.solver_type.data,
                    )
                )

        scheduler.add_job(job)
        if request_wants_json():
            return flask.jsonify(job.json_dict())
        else:
            return flask.redirect(flask.url_for('models_show', job_id=job.id()))

    except:
        # clean up the partially-created job before propagating
        if job:
            scheduler.delete_job(job)
        raise
def get_network_visualization(self, desc):
    """
    return visualization of network

    Writes `desc` to a temp .py file, runs the mxnet train tool with
    --visualizeModel=yes, and collects the textual network definition
    printed between 'Network definition' markers. Returns escaped HTML
    wrapped in <pre>; raises NetworkVisualizationError when no
    definition is found.
    """
    # save network description to temporary file
    temp_network_handle, temp_network_path = tempfile.mkstemp(suffix='.py')
    os.write(temp_network_handle, desc)
    os.close(temp_network_handle)

    try:  # do this in a try..finally clause to make sure we delete the temp file
        # build command line
        mxnet_bin = config_value('mxnet')['executable']

        args = [mxnet_bin,
                os.path.join(os.path.dirname(digits.__file__), 'tools', 'mxnet', 'train'),
                '--network=%s' % os.path.splitext(os.path.basename(temp_network_path))[0],
                '--networkDirectory=%s' % os.path.dirname(temp_network_path),
                '--subtractMean=none',  # we are not providing a mean image
                '--visualizeModel=yes',
                '--type=float'
                ]

        # execute command
        p = subprocess.Popen(args,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             close_fds=True,
                             )

        # TODO: need to include regular expression for MAC color codes
        regex = re.compile('\x1b\[[0-9;]*m', re.UNICODE)

        # the network description will be accumulated from the command output
        # when collecting_net_definition==True
        collecting_net_definition = False
        desc = []
        unrecognized_output = []
        while p.poll() is None:
            for line in utils.nonblocking_readlines(p.stdout):
                if line is not None:
                    # Remove whitespace and color codes.
                    # Color codes are appended to beginning and end of line by
                    # mxnet binary i.e., 'th'. Check the below link for more
                    # information (NOTE(review): comment/link appear copied
                    # from the torch integration — verify for mxnet)
                    # https://groups.google.com/forum/#!searchin/mxnet7/color$20codes/mxnet7/8O_0lSgSzuA/Ih6wYg9fgcwJ  # noqa
                    line = regex.sub('', line)
                    timestamp, level, message = MxnetTrainTask.preprocess_output_mxnet(line.strip())
                    if message:
                        # toggle collection at each 'Network definition' marker
                        if message.startswith('Network definition'):
                            collecting_net_definition = not collecting_net_definition
                    else:
                        if collecting_net_definition:
                            desc.append(line)
                        elif len(line):
                            unrecognized_output.append(line)
                else:
                    time.sleep(0.05)

        if not len(desc):
            # we did not find a network description
            raise NetworkVisualizationError(''.join(unrecognized_output))
        else:
            output = flask.Markup('<pre>')
            for line in desc:
                output += flask.Markup.escape(line)
            output += flask.Markup('</pre>')
            return output
    finally:
        os.remove(temp_network_path)
# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. from __future__ import absolute_import from .framework import Framework from .torch_framework import TorchFramework from digits.config import config_value __all__ = [ 'Framework', 'TorchFramework', ] if config_value('tensorflow')['enabled']: from .tensorflow_framework import TensorflowFramework __all__.append('TensorflowFramework') # # create framework instances # # torch is optional torch = TorchFramework() if config_value('torch')['enabled'] else None # tensorflow is optional tensorflow = TensorflowFramework() if config_value( 'tensorflow')['enabled'] else None # # utility functions #
def infer_one_image(self, image, snapshot_epoch=None, layers=None, gpu=None):
    """
    Classify an image
    Returns (predictions, visualizations)
        predictions -- an array of [ (label, confidence), ...] for each label, sorted by confidence
        visualizations -- an array of (layer_name, activations, weights) for the specified layers
    Returns (None, None) if something goes wrong

    Arguments:
    image -- a np.array

    Keyword arguments:
    snapshot_epoch -- which snapshot to use
    layers -- which layer activation[s] and weight[s] to visualize
    """
    # Save the image to a temp file so the torch process can read it
    temp_image_handle, temp_image_path = tempfile.mkstemp(suffix='.png')
    os.close(temp_image_handle)
    image = PIL.Image.fromarray(image)
    try:
        image.save(temp_image_path, format='png')
    except KeyError:
        error_message = 'Unable to save file to "%s"' % temp_image_path
        self.logger.error(error_message)
        raise digits.inference.errors.InferenceError(error_message)

    # '<PATHS>' is the sentinel meaning "torch is on PATH"
    if config_value('torch_root') == '<PATHS>':
        torch_bin = 'th'
    else:
        torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

    file_to_load = self.get_snapshot(snapshot_epoch)

    args = [torch_bin,
            os.path.join(os.path.dirname(os.path.dirname(digits.__file__)), 'tools', 'torch', 'wrapper.lua'),
            'test.lua',
            '--image=%s' % temp_image_path,
            '--network=%s' % self.model_file.split(".")[0],
            '--networkDirectory=%s' % self.job_dir,
            '--snapshot=%s' % file_to_load,
            '--allPredictions=yes',
            ]
    if hasattr(self.dataset, 'labels_file'):
        args.append('--labels=%s' % self.dataset.path(self.dataset.labels_file))

    if self.use_mean != 'none':
        filename = self.create_mean_file()
        args.append('--mean=%s' % os.path.join(self.job_dir, constants.MEAN_FILE_IMAGE))

    if self.use_mean == 'pixel':
        args.append('--subtractMean=pixel')
    elif self.use_mean == 'image':
        args.append('--subtractMean=image')
    else:
        args.append('--subtractMean=none')
    if self.crop_size:
        args.append('--crop=yes')
        args.append('--croplen=%d' % self.crop_size)
    if layers=='all':
        args.append('--visualization=yes')
        args.append('--save=%s' % self.job_dir)

    # Convert them all to strings
    args = [str(x) for x in args]

    # TODO: need to include regular expression for MAC color codes
    regex = re.compile('\x1b\[[0-9;]*m', re.UNICODE)
    self.logger.info('%s classify one task started.' % self.get_framework_id())

    unrecognized_output = []
    predictions = []
    self.visualization_file = None

    env = os.environ.copy()

    if gpu is not None:
        args.append('--type=cuda')
        # make only the selected GPU visible
        env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
    else:
        args.append('--type=float')

    p = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         cwd=self.job_dir,
                         close_fds=True,
                         env=env,
                         )

    try:
        while p.poll() is None:
            for line in utils.nonblocking_readlines(p.stdout):
                if self.aborted.is_set():
                    p.terminate()
                    raise digits.inference.errors.InferenceError('%s classify one task got aborted. error code - %d' % (self.get_framework_id(), p.returncode))

                if line is not None:
                    # Remove color codes and whitespace
                    line = regex.sub('', line).strip()
                    if line:
                        if not self.process_test_output(line, predictions, 'one'):
                            self.logger.warning('%s classify one task unrecognized input: %s' % (self.get_framework_id(), line.strip()))
                            unrecognized_output.append(line)
                else:
                    time.sleep(0.05)
    except Exception as e:
        if p.poll() is None:
            p.terminate()
        error_message = ''
        if type(e) == digits.inference.errors.InferenceError:
            error_message = e.__str__()
        else:
            error_message = '%s classify one task failed with error code %d \n %s' % (self.get_framework_id(), p.returncode, str(e))
        self.logger.error(error_message)
        if unrecognized_output:
            unrecognized_output = '\n'.join(unrecognized_output)
            error_message = error_message + unrecognized_output
        raise digits.inference.errors.InferenceError(error_message)
    finally:
        self.after_test_run(temp_image_path)

    if p.returncode != 0:
        error_message = '%s classify one task failed with error code %d' % (self.get_framework_id(), p.returncode)
        self.logger.error(error_message)
        if unrecognized_output:
            unrecognized_output = '\n'.join(unrecognized_output)
            error_message = error_message + unrecognized_output
        raise digits.inference.errors.InferenceError(error_message)
    else:
        self.logger.info('%s classify one task completed.' % self.get_framework_id())

    predictions = {'output': np.array(predictions)}

    visualizations = []

    if layers=='all' and self.visualization_file:
        vis_db = h5py.File(self.visualization_file, 'r')
        # the HDF5 database is organized as follows:
        # <root>
        # |- layers
        #    |- 1
        #    |  |- name
        #    |  |- activations
        #    |  |- weights
        #    |- 2
        for layer_id, layer in vis_db['layers'].items():
            layer_desc = layer['name'][...].tostring()
            if 'Sequential' in layer_desc or 'Parallel' in layer_desc:
                # ignore containers
                continue
            idx = int(layer_id)
            # activations
            if 'activations' in layer:
                data = np.array(layer['activations'][...])
                # skip batch dimension
                if len(data.shape) > 1 and data.shape[0] == 1:
                    data = data[0]
                vis = utils.image.get_layer_vis_square(data)
                mean, std, hist = self.get_layer_statistics(data)
                visualizations.append(
                    {
                        'id': idx,
                        'name': layer_desc,
                        'vis_type': 'Activations',
                        'vis': vis,
                        'data_stats': {
                            'shape': data.shape,
                            'mean': mean,
                            'stddev': std,
                            'histogram': hist,
                        }
                    }
                )
            # weights
            if 'weights' in layer:
                data = np.array(layer['weights'][...])
                if 'Linear' not in layer_desc:
                    vis = utils.image.get_layer_vis_square(data)
                else:
                    # Linear (inner product) layers have too many weights
                    # to display
                    vis = None
                mean, std, hist = self.get_layer_statistics(data)
                parameter_count = reduce(operator.mul, data.shape, 1)
                if 'bias' in layer:
                    bias = np.array(layer['bias'][...])
                    parameter_count += reduce(operator.mul, bias.shape, 1)
                visualizations.append(
                    {
                        'id': idx,
                        'name': layer_desc,
                        'vis_type': 'Weights',
                        'vis': vis,
                        'param_count': parameter_count,
                        'data_stats': {
                            'shape': data.shape,
                            'mean': mean,
                            'stddev': std,
                            'histogram': hist,
                        }
                    }
                )
        # sort by layer ID
        visualizations = sorted(visualizations, key=lambda x: x['id'])

    return (predictions, visualizations)
def infer_many_images(self, images, snapshot_epoch=None, gpu=None):
    """
    Returns (labels, results):
    labels -- an array of strings
    results -- a 2D np array:
        [
            [image0_label0_confidence, image0_label1_confidence, ...],
            [image1_label0_confidence, image1_label1_confidence, ...],
            ...
        ]

    Arguments:
    images -- a list of np.arrays

    Keyword arguments:
    snapshot_epoch -- which snapshot to use
    """
    # create a temporary folder to store images and a temporary file
    # to store a list of paths to the images
    temp_dir_path = tempfile.mkdtemp()
    try:  # this try...finally clause is used to clean up the temp directory in any case
        temp_imglist_handle, temp_imglist_path = tempfile.mkstemp(dir=temp_dir_path, suffix='.txt')
        for image in images:
            temp_image_handle, temp_image_path = tempfile.mkstemp(
                dir=temp_dir_path, suffix='.png')
            image = PIL.Image.fromarray(image)
            try:
                image.save(temp_image_path, format='png')
            except KeyError:
                error_message = 'Unable to save file to "%s"' % temp_image_path
                self.logger.error(error_message)
                raise digits.inference.errors.InferenceError(error_message)
            # record this image's path in the list file consumed by test.lua
            os.write(temp_imglist_handle, "%s\n" % temp_image_path)
            os.close(temp_image_handle)
        os.close(temp_imglist_handle)

        # '<PATHS>' is the sentinel meaning "torch is on PATH"
        if config_value('torch_root') == '<PATHS>':
            torch_bin = 'th'
        else:
            torch_bin = os.path.join(config_value('torch_root'), 'bin', 'th')

        file_to_load = self.get_snapshot(snapshot_epoch)

        args = [torch_bin,
                os.path.join(os.path.dirname(os.path.dirname(digits.__file__)), 'tools', 'torch', 'wrapper.lua'),
                'test.lua',
                '--testMany=yes',
                '--allPredictions=yes',  # all predictions are grabbed and formatted as required by DIGITS
                '--image=%s' % str(temp_imglist_path),
                '--network=%s' % self.model_file.split(".")[0],
                '--networkDirectory=%s' % self.job_dir,
                '--snapshot=%s' % file_to_load,
                ]

        if hasattr(self.dataset, 'labels_file'):
            args.append('--labels=%s' % self.dataset.path(self.dataset.labels_file))

        if self.use_mean != 'none':
            filename = self.create_mean_file()
            args.append('--mean=%s' % os.path.join(self.job_dir, constants.MEAN_FILE_IMAGE))

        if self.use_mean == 'pixel':
            args.append('--subtractMean=pixel')
        elif self.use_mean == 'image':
            args.append('--subtractMean=image')
        else:
            args.append('--subtractMean=none')

        if self.crop_size:
            args.append('--crop=yes')
            args.append('--croplen=%d' % self.crop_size)

        # Convert them all to strings
        args = [str(x) for x in args]

        # TODO: need to include regular expression for MAC color codes
        regex = re.compile('\x1b\[[0-9;]*m', re.UNICODE)
        self.logger.info('%s classify many task started.' % self.name())

        env = os.environ.copy()
        if gpu is not None:
            args.append('--type=cuda')
            # make only the selected GPU visible
            env['CUDA_VISIBLE_DEVICES'] = "%d" % gpu
        else:
            args.append('--type=float')

        unrecognized_output = []
        predictions = []
        p = subprocess.Popen(args,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             cwd=self.job_dir,
                             close_fds=True,
                             env=env
                             )

        try:
            while p.poll() is None:
                for line in utils.nonblocking_readlines(p.stdout):
                    if self.aborted.is_set():
                        p.terminate()
                        raise digits.inference.errors.InferenceError('%s classify many task got aborted. error code - %d' % (self.get_framework_id(), p.returncode))

                    if line is not None:
                        # Remove whitespace and color codes. Color codes are
                        # appended to beginning and end of line by torch binary
                        # i.e., 'th'. Check the below link for more information
                        # https://groups.google.com/forum/#!searchin/torch7/color$20codes/torch7/8O_0lSgSzuA/Ih6wYg9fgcwJ
                        line = regex.sub('', line).strip()
                        if line:
                            if not self.process_test_output(line, predictions, 'many'):
                                self.logger.warning('%s classify many task unrecognized input: %s' % (self.get_framework_id(), line.strip()))
                                unrecognized_output.append(line)
                    else:
                        time.sleep(0.05)
        except Exception as e:
            if p.poll() is None:
                p.terminate()
            error_message = ''
            if type(e) == digits.inference.errors.InferenceError:
                error_message = e.__str__()
            else:
                error_message = '%s classify many task failed with error code %d \n %s' % (self.get_framework_id(), p.returncode, str(e))
            self.logger.error(error_message)
            if unrecognized_output:
                unrecognized_output = '\n'.join(unrecognized_output)
                error_message = error_message + unrecognized_output
            raise digits.inference.errors.InferenceError(error_message)

        if p.returncode != 0:
            error_message = '%s classify many task failed with error code %d' % (self.get_framework_id(), p.returncode)
            self.logger.error(error_message)
            if unrecognized_output:
                unrecognized_output = '\n'.join(unrecognized_output)
                error_message = error_message + unrecognized_output
            raise digits.inference.errors.InferenceError(error_message)
        else:
            self.logger.info('%s classify many task completed.' % self.get_framework_id())
    finally:
        shutil.rmtree(temp_dir_path)

    # task.infer_one() expects dictionary in return value
    return {'output': np.array(predictions)}