def _generate_train_luminoth_config(self, **kwargs):
    config = {
        "train": {
            "run_name": self.name,
            "job_dir": "datasets/object_recognition"
        },
        "dataset": {
            "type": "object_detection",
            "dir": "datasets"
        },
        "model": {
            "type": self.algorithm,
            "network": {
                "num_classes": len(self.classes)
            }
        }
    }

    if not os.path.exists("datasets/object_recognition"):
        os.mkdir("datasets/object_recognition")
    with open("datasets/luminoth.yml", "w") as f:
        f.write(yaml.dump(config))

    return get_config("datasets/luminoth.yml")
def train(config_files, job_dir, override_params):
    """
    Parse TF_CONFIG to cluster_spec and call run() function
    """
    # TF_CONFIG environment variable is available when running using gcloud
    # either locally or on cloud. It has all the information required to
    # create a ClusterSpec which is important for running distributed code.
    tf_config_val = os.environ.get('TF_CONFIG')

    if tf_config_val:
        tf_config = json.loads(tf_config_val)
    else:
        tf_config = {}

    cluster = tf_config.get('cluster')
    job_name = tf_config.get('task', {}).get('type')
    task_index = tf_config.get('task', {}).get('index')
    environment = tf_config.get('environment', 'local')

    # Get the user config and the model type from it.
    try:
        config = get_config(config_files, override_params=override_params)
    except KeyError:
        # Without the model type defined we can't use the default config
        # settings.
        raise KeyError('model.type should be set on the custom config.')

    if job_dir:
        override_params += ('train.job_dir={}'.format(job_dir), )

    # If cluster information is empty or TF_CONFIG is not available, run local
    if job_name is None or task_index is None:
        return run_local(config, environment=environment)
def main(input_image):
    # *************************************************************************
    with tf.gfile.Open(input_image, 'rb') as f:
        try:
            image = Image.open(f).convert('RGB')
        except (tf.errors.OutOfRangeError, OSError) as e:
            print('Exception: {}'.format(e))
            return

    # *************************************************************************
    config = get_config(CONFIG)
    config.model.rcnn.proposals.total_max_detections = MAX_DET
    config.model.rcnn.proposals.min_prob_threshold = MIN_PROB
    network = PredictorNetwork(config)
    objects = network.predict_image(image)
    print('************************* Num of Objects : ', len(objects))

    # *************************************************************************
    ref_image = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB)
    pref_list, images, ids = classify_cellscropping.load_json(
        ref_image, preds=objects)
    print('************************* Num of Cropped Objects : ', np.shape(ids))

    # *************************************************************************
    final_json = classify_main.predict(images, ids, CKPT)
    output_json_path = os.path.join(
        OUTPUT_JSON_DIR, '%s.json' % input_image.split('/')[-1][:-4])
    with open(output_json_path, 'w') as fp:
        json.dump(final_json, fp)
def web(config_files, checkpoint, override_params, host, port, debug):
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    if checkpoint:
        config = get_checkpoint_config(checkpoint)
    elif config_files:
        config = get_config(config_files)
    else:
        click.echo('You must specify either a checkpoint or a config file.')
        return

    if override_params:
        config = override_config_params(config, override_params)

    # Bounding boxes will be filtered by frontend (using slider), so we set a
    # low threshold.
    config.model.rcnn.proposals.min_prob_threshold = 0.01

    # Initialize model
    global NETWORK_START_THREAD
    NETWORK_START_THREAD = Thread(target=start_network, args=(config,))
    NETWORK_START_THREAD.start()

    app.run(host=host, port=port, debug=debug)
def web(config_files, checkpoint, override_params, host, port, debug):
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    if checkpoint:
        config = get_checkpoint_config(checkpoint)
    elif config_files:
        config = get_config(config_files)
    else:
        click.echo(
            'Neither checkpoint nor config specified, assuming `accurate`.')
        config = get_checkpoint_config('accurate')

    if override_params:
        config = override_config_params(config, override_params)

    # Bounding boxes will be filtered by frontend (using slider), so we set a
    # low threshold.
    if config.model.type == 'fasterrcnn':
        config.model.rcnn.proposals.min_prob_threshold = 0.01
    elif config.model.type == 'ssd':
        config.model.proposals.min_prob_threshold = 0.01
    else:
        raise ValueError("Model type '{}' not supported".format(
            config.model.type))

    # Initialize model
    global NETWORK_START_THREAD
    NETWORK_START_THREAD = Thread(target=start_network, args=(config, ))
    NETWORK_START_THREAD.start()

    app.run(host=host, port=port, debug=debug)
def _load_model(self):
    config_path = f"{self.model_path}/luminoth.predict.yml"
    if not os.path.exists(config_path):
        self._generate_predict_luminoth_config()
    config = get_config(config_path)
    return PredictorNetwork(config)
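_generate_predict_luminoth_config is not shown in this example; a minimal sketch, assuming it mirrors the training-config generator from the first example (self.name, self.algorithm, self.classes and self.model_path are the same assumed attributes), could be:

def _generate_predict_luminoth_config(self):
    # Hypothetical sketch: write a minimal predict-time config next to the
    # trained model so get_config() can load it from luminoth.predict.yml.
    config = {
        "train": {"run_name": self.name, "job_dir": self.model_path},
        "dataset": {"type": "object_detection", "dir": self.model_path},
        "model": {
            "type": self.algorithm,
            "network": {"num_classes": len(self.classes)},
        },
    }
    with open(f"{self.model_path}/luminoth.predict.yml", "w") as f:
        f.write(yaml.dump(config))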
def get_predictions(image_paths, config_files):
    """
    Get predictions for multiple images.

    When predicting many images we don't want to load the checkpoint each
    time. We load the checkpoint in the first iteration and then use the same
    session and graph for subsequent images.
    """
    config = get_config(config_files)

    class_labels = None
    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        if tf.gfile.Exists(classes_file):
            class_labels = json.load(tf.gfile.GFile(classes_file))

    session = None
    fetches = None
    image_tensor = None

    for image_path in image_paths:
        with tf.gfile.Open(image_path, 'rb') as im_file:
            try:
                image = Image.open(im_file).convert('RGB')
            except tf.errors.OutOfRangeError as e:
                yield {
                    'error': '{}'.format(e),
                    'image_path': image_path,
                }
                continue

        preds = get_prediction(
            image, config,
            session=session,
            fetches=fetches,
            image_tensor=image_tensor,
            class_labels=class_labels,
            return_tf_vars=True
        )

        if session is None:
            # After first loop
            session = preds['session']
            fetches = preds['fetches']
            image_tensor = preds['image_tensor']

        yield {
            'objects': preds['objects'],
            'objects_labels': preds['objects_labels'],
            'objects_labels_prob': preds['objects_labels_prob'],
            'inference_time': preds['inference_time'],
            'image_path': image_path,
        }
def train(config_files, job_dir, override_params):
    """
    Parse TF_CONFIG to cluster_spec and call run() function
    """
    # TF_CONFIG environment variable is available when running using gcloud
    # either locally or on cloud. It has all the information required to
    # create a ClusterSpec which is important for running distributed code.
    tf_config_val = os.environ.get("TF_CONFIG")

    if tf_config_val:
        tf_config = json.loads(tf_config_val)
    else:
        tf_config = {}

    cluster = tf_config.get("cluster")
    job_name = tf_config.get("task", {}).get("type")
    task_index = tf_config.get("task", {}).get("index")
    environment = tf_config.get("environment", "local")

    # Get the user config and the model type from it.
    try:
        config = get_config(config_files, override_params=override_params)
    except KeyError:
        # Without the model type defined we can't use the default config
        # settings.
        raise KeyError("model.type should be set on the custom config.")

    if job_dir:
        override_params += ("train.job_dir={}".format(job_dir),)

    # If cluster information is empty or TF_CONFIG is not available, run local
    if job_name is None or task_index is None:
        return run(config, environment=environment)

    cluster_spec = tf.train.ClusterSpec(cluster)
    server = tf.train.Server(cluster_spec, job_name=job_name,
                             task_index=task_index)

    # Wait for incoming connections forever
    # Worker ships the graph to the ps server
    # The ps server manages the parameters of the model.
    if job_name == "ps":
        server.join()
        return
    elif job_name in ["master", "worker"]:
        is_chief = job_name == "master"
        return run(
            config,
            target=server.target,
            cluster_spec=cluster_spec,
            is_chief=is_chief,
            job_name=job_name,
            task_index=task_index,
            environment=environment,
        )
def train(config_files, job_dir, override_params):
    """
    Parse TF_CONFIG to cluster_spec and call run() function
    """
    # TF_CONFIG environment variable is available when running using gcloud
    # either locally or on cloud. It has all the information required to
    # create a ClusterSpec which is important for running distributed code.
    tf_config_val = os.environ.get('TF_CONFIG')

    if tf_config_val:
        tf_config = json.loads(tf_config_val)
    else:
        tf_config = {}

    cluster = tf_config.get('cluster')
    job_name = tf_config.get('task', {}).get('type')
    task_index = tf_config.get('task', {}).get('index')
    environment = tf_config.get('environment', 'local')

    # Get the user config and the model type from it.
    try:
        config = get_config(config_files, override_params=override_params)
    except KeyError:
        # Without the model type defined we can't use the default config
        # settings.
        raise KeyError('model.type should be set on the custom config.')

    if job_dir:
        override_params += ('train.job_dir={}'.format(job_dir), )

    # If cluster information is empty or TF_CONFIG is not available, run local
    if job_name is None or task_index is None:
        return run(
            config, environment=environment
        )

    cluster_spec = tf.train.ClusterSpec(cluster)
    server = tf.train.Server(
        cluster_spec, job_name=job_name, task_index=task_index)

    # Wait for incoming connections forever
    # Worker ships the graph to the ps server
    # The ps server manages the parameters of the model.
    if job_name == 'ps':
        server.join()
        return
    elif job_name in ['master', 'worker']:
        is_chief = job_name == 'master'
        return run(
            config,
            target=server.target,
            cluster_spec=cluster_spec,
            is_chief=is_chief,
            job_name=job_name,
            task_index=task_index,
            environment=environment
        )
def _generate_train_config(self, **kwargs):
    if self.algorithm == "fasterrcnn":
        train_config, hyperparams = self._generate_train_fasterrcnn_config(
            **kwargs)
    elif self.algorithm == "ssd":
        train_config, hyperparams = self._generate_train_ssd_config(**kwargs)
    else:
        raise ValueError(
            "Algorithm '{}' not supported".format(self.algorithm))

    if not os.path.exists("data/luminoth"):
        os.mkdir("data/luminoth")
    with open("data/luminoth/luminoth.yml", "w") as f:
        f.write(yaml.dump(train_config))

    return get_config("data/luminoth/luminoth.yml"), hyperparams
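_generate_train_fasterrcnn_config and _generate_train_ssd_config are not defined in this example; a minimal sketch of the (train_config, hyperparams) pair they are assumed to return, with illustrative field names only, might be:

def _generate_train_fasterrcnn_config(self, learning_rate=0.0003, **kwargs):
    # Hypothetical sketch: build a Luminoth-style config dict plus a dict of
    # the hyperparameters that were actually used, so callers can log them.
    train_config = {
        "train": {"run_name": self.name, "job_dir": "data/luminoth"},
        "dataset": {"type": "object_detection", "dir": "data/luminoth"},
        "model": {
            "type": "fasterrcnn",
            "network": {"num_classes": len(self.classes)},
        },
    }
    hyperparams = {"learning_rate": learning_rate}
    return train_config, hyperparams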
def web(config_files, checkpoint, override_params, host, port, debug,
        min_prob, save_path):
    global SAVE_PATH_GLOBAL
    if save_path:
        SAVE_PATH_GLOBAL = save_path

    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    if checkpoint:
        config = get_checkpoint_config(checkpoint)
    elif config_files:
        config = get_config(config_files)
    else:
        raise ValueError(
            'Neither checkpoint nor config specified.'
        )

    if override_params:
        config = override_config_params(config, override_params)

    # Bounding boxes will be filtered by frontend (using slider), so we set a
    # low threshold.
    if config.model.type == 'fasterrcnn':
        config.model.rcnn.proposals.min_prob_threshold = min_prob
    elif config.model.type == 'ssd':
        config.model.proposals.min_prob_threshold = min_prob
    else:
        raise ValueError(
            "Model type '{}' not supported".format(config.model.type)
        )

    # Verify the save folder exists, or create it.
    try:
        os.stat(SAVE_PATH_GLOBAL)
    except OSError:
        os.mkdir(SAVE_PATH_GLOBAL)

    # Initialize model
    global NETWORK_START_THREAD
    NETWORK_START_THREAD = Thread(target=start_network, args=(config,))
    NETWORK_START_THREAD.start()

    if debug:
        app.config.from_object('config.DebugConfig')
    else:
        app.config.from_object('config.ProductionConfig')

    app.run(host=host, port=port, debug=debug)
def get_checkpoint_config(id_or_alias, prompt=True):
    """Returns the checkpoint config object in order to load the model.

    If `prompt` is ``True`` and the checkpoint is not present in the index,
    prompt the user to refresh the index. If the checkpoint is present in the
    index but is remote and not yet downloaded, prompt to download.
    """
    db = read_checkpoint_db()
    checkpoint = get_checkpoint(db, id_or_alias)

    if prompt and not checkpoint:
        # Checkpoint not found in database. Prompt for refreshing the index
        # and try again.
        click.confirm(
            'Checkpoint not found. Check remote repository?', abort=True
        )
        db = refresh_remote_index()
        checkpoint = get_checkpoint(db, id_or_alias)
        if not checkpoint:
            # Still not found, abort.
            click.echo(
                "Checkpoint isn't available in remote repository either."
            )
            raise ValueError('Checkpoint not found.')
    elif not checkpoint:
        # No checkpoint but didn't prompt.
        raise ValueError('Checkpoint not found.')

    if prompt and checkpoint['status'] == 'NOT_DOWNLOADED':
        # Checkpoint hasn't been downloaded yet. Prompt for downloading it
        # before continuing.
        click.confirm(
            'Checkpoint not present locally. Want to download it?', abort=True
        )
        download_remote_checkpoint(db, checkpoint)
    elif checkpoint['status'] == 'NOT_DOWNLOADED':
        # Not downloaded but didn't prompt.
        raise ValueError('Checkpoint not downloaded.')

    path = get_checkpoint_path(checkpoint['id'])
    config = get_config(os.path.join(path, 'config.yml'))

    # Config paths should point to the path where the checkpoint files are
    # stored.
    config.dataset.dir = path
    config.train.job_dir = get_checkpoints_directory()

    return config
def web(config_files, debug):
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    config = get_config(config_files)
    app.config['config'] = config

    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        if tf.gfile.Exists(classes_file):
            app.config['class_labels'] = json.load(
                tf.gfile.GFile(classes_file))

    app.run(debug=debug)
def main(input_image):
    tf.reset_default_graph()

    # **********************************************************************
    with tf.gfile.Open(input_image, 'rb') as f:
        try:
            image = Image.open(f).convert('RGB')
        except (tf.errors.OutOfRangeError, OSError) as e:
            print('Exception: {}'.format(e))
            return

    # **********************************************************************
    config = get_config(CONFIG)
    config.model.rcnn.proposals.total_max_detections = MAX_DET
    config.model.rcnn.proposals.min_prob_threshold = MIN_PROB
    network = PredictorNetwork(config)
    objects = network.predict_image(image)
    print('************************************** Num of Objects : ',
          len(objects))

    # **********************************************************************
    ref_image = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB)
    pref_list, images, ids = cell_croppings.load_json(ref_image, preds=objects)
    print('************************************** Num of Cropped Objects : ',
          np.shape(ids))

    # **********************************************************************
    if not os.path.exists(CROP_CELLS_DIR):
        os.makedirs(CROP_CELLS_DIR)

    for i in range(ids.shape[0]):
        # SRC_DIR / IMAGE_ID _ COORDS . LABEL
        dst_filename = '%s_%s_%s.png' % (
            os.path.basename(input_image)[:-4], ids[i], i)
        dst_img_name = os.path.join(CROP_CELLS_DIR, dst_filename)
        cv2.imwrite(dst_img_name, images[i, ...])
def web(config_files, host, port, debug):
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    config = get_config(config_files)

    # Bounding boxes will be filtered by frontend (using slider), so we set
    # a low threshold.
    config.model.rcnn.proposals.min_prob_threshold = 0.01

    # Initialize model
    global NETWORK_START_THREAD
    NETWORK_START_THREAD = Thread(target=start_network, args=(config,))
    NETWORK_START_THREAD.start()

    app.run(host=host, port=port, debug=debug)
def web(config_files, host, port, debug):
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    config = get_config(config_files)
    app.config['config'] = config

    # Bounding boxes will be filtered by frontend (using slider), so we set
    # a low threshold.
    config.model.rcnn.proposals.min_prob_threshold = 0.01

    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        if tf.gfile.Exists(classes_file):
            app.config['class_labels'] = json.load(
                tf.gfile.GFile(classes_file))

    app.run(host=host, port=port, debug=debug)
def __init__(self, config_files):
    config = get_config(config_files)

    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        if tf.gfile.Exists(classes_file):
            self.class_labels = json.load(tf.gfile.GFile(classes_file))
        else:
            self.class_labels = None

    # Don't use data augmentation in predictions
    config.dataset.data_augmentation = None

    dataset_class = get_dataset(config.dataset.type)
    model_class = get_model(config.model.type)
    dataset = dataset_class(config)
    model = model_class(config)

    graph = tf.Graph()
    self.session = tf.Session(graph=graph)

    with graph.as_default():
        self.image_placeholder = tf.placeholder(tf.float32, (None, None, 3))
        image_tf, _, process_meta = dataset.preprocess(
            self.image_placeholder)

        pred_dict = model(image_tf)

        # Restore checkpoint
        if config.train.job_dir:
            job_dir = config.train.job_dir
            if config.train.run_name:
                job_dir = os.path.join(job_dir, config.train.run_name)
            ckpt = tf.train.get_checkpoint_state(job_dir)
            if not ckpt or not ckpt.all_model_checkpoint_paths:
                raise ValueError(
                    'Could not find checkpoint in {}.'.format(job_dir))
            ckpt = ckpt.all_model_checkpoint_paths[-1]
            saver = tf.train.Saver(sharded=True, allow_empty=True)
            saver.restore(self.session, ckpt)
            tf.logging.info('Loaded checkpoint.')
        else:
            # A prediction without checkpoint is just used for testing
            tf.logging.warning(
                'Could not load checkpoint. Using initialized model.')
            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())
            self.session.run(init_op)

        if config.model.network.with_rcnn:
            cls_prediction = pred_dict['classification_prediction']
            objects_tf = cls_prediction['objects']
            objects_labels_tf = cls_prediction['labels']
            objects_labels_prob_tf = cls_prediction['probs']
        else:
            rpn_prediction = pred_dict['rpn_prediction']
            objects_tf = rpn_prediction['proposals']
            objects_labels_prob_tf = rpn_prediction['scores']
            # All labels without RCNN are zero
            objects_labels_tf = tf.zeros(
                tf.shape(objects_labels_prob_tf), dtype=tf.int32)

        self.fetches = {
            'objects': objects_tf,
            'labels': objects_labels_tf,
            'probs': objects_labels_prob_tf,
            'scale_factor': process_meta['scale_factor']
        }

        # If in debug mode, return the full prediction dictionary.
        if config.train.debug:
            self.fetches['_debug'] = pred_dict
def train( job_id, resume_job_id, bucket_name, region, config_files, dataset, scale_tier, master_type, worker_type, worker_count, parameter_server_type, parameter_server_count, ): account = ServiceAccount() account.validate_region(region) if bucket_name is None: bucket_name = "luminoth-{}".format(account.client_id) click.echo( 'Bucket name not specified. Using "{}".'.format(bucket_name)) # Creates bucket for logs and models if it doesn't exist bucket = account.get_bucket(bucket_name) if not job_id: job_id = "train_{}".format(datetime.now().strftime("%Y%m%d_%H%M%S")) # Path in bucket to store job's config, logs, etc. # If we are resuming a previous job, then we will use the same path # that job used, so Luminoth will load the checkpoint from there. base_path = "lumi_{}".format(resume_job_id if resume_job_id else job_id) package_path = build_package(bucket, base_path) job_dir = "gs://{}/{}".format(bucket_name, base_path) override_params = [ "train.job_dir={}".format(job_dir), ] if dataset: # Check if absolute or relative dataset path if not dataset.startswith("gs://"): dataset = "gs://{}".format(dataset) override_params.append("dataset.dir={}".format(dataset)) # Even if we are resuming job, we will use a new config. Thus, we will # overwrite the config in the old job's dir if it existed. config = get_config(config_files, override_params=override_params) # Update final config file to job bucket config_path = "{}/{}".format(base_path, DEFAULT_CONFIG_FILENAME) upload_data(bucket, config_path, dump_config(config)) args = ["--config", "{}/{}".format(job_dir, DEFAULT_CONFIG_FILENAME)] cloudml = account.cloud_service("ml") training_inputs = { "scaleTier": scale_tier, "packageUris": ["gs://{}/{}".format(bucket_name, package_path)], "pythonModule": "luminoth.train", "args": args, "region": region, "jobDir": job_dir, "runtimeVersion": RUNTIME_VERSION, "pythonVersion": PYTHON_VERSION, } if scale_tier == "CUSTOM": training_inputs["masterType"] = master_type if worker_count > 0: training_inputs["workerCount"] = worker_count training_inputs["workerType"] = worker_type if parameter_server_count > 0: training_inputs["parameterServerCount"] = parameter_server_count training_inputs["parameterServerType"] = parameter_server_type job_spec = {"jobId": job_id, "trainingInput": training_inputs} jobrequest = (cloudml.projects().jobs().create(body=job_spec, parent="projects/{}".format( account.project_id))) try: click.echo("Submitting training job.") res = jobrequest.execute() click.echo("Job submitted successfully.") click.echo("state = {}, createTime = {}".format( res.get("state"), res.get("createTime"))) if resume_job_id: click.echo( "\nNote: this job is resuming job {}.\n".format(resume_job_id)) click.echo("Job id: {}".format(job_id)) click.echo("Job directory: {}".format(job_dir)) save_run(config, environment="gcloud", extra_config=job_spec) except Exception as err: click.echo("There was an error creating the training job. " "Check the details: \n{}".format(err._get_reason()))
def detect_tile_cell(slide_path, tile_position, csv_dict, args, it_kwargs):
    start_t = time.time()
    print('--- Loading Image...')

    # get slide tile source
    ts = large_image.getTileSource(slide_path)

    # get requested tile
    tile_info = ts.getSingleTile(
        tile_position=tile_position,
        format=large_image.tilesource.TILE_FORMAT_NUMPY,
        **it_kwargs)

    # get tile image
    im_tile = tile_info['tile'][:, :, :3]

    t1 = time.time() - start_t
    csv_dict['Image Loading'].append(round(t1, 3))
    print('--- Finished Loading Image')

    cv2.imwrite('hey_im_tile.png', im_tile)

    # *******************************************************
    #
    # Perform cell detections
    #
    # ########################### DETECTION #################
    print('--- Performing cell detections...')

    config = get_config(CONFIG)

    if args.max_det is not None:
        config.model.rcnn.proposals.total_max_detections = args.max_det
    else:
        config.model.rcnn.proposals.total_max_detections = MAX_DET

    if args.min_prob is not None:
        config.model.rcnn.proposals.min_prob_threshold = args.min_prob
    else:
        config.model.rcnn.proposals.min_prob_threshold = MIN_PROB

    print('--- Currently Analysing Input Image Size : ', im_tile.shape)

    network = PredictorNetwork(config)
    objects = network.predict_image(im_tile)

    print('--- Finished Cell Detections')

    t2 = time.time() - start_t
    t22 = float(t2) - float(t1)
    csv_dict['Cell Detection'].append(round(t22, 3))

    print('***** Number of Detected Cells ****** : ', len(objects))

    #
    # Perform JSON loading
    #
    print('--- Performing Cell Crops loading...')

    im_tile_rgb = cv2.cvtColor(im_tile, cv2.COLOR_BGR2RGB)

    if args.inputImageFile is not None:
        _, images, ids = classify_cellscropping.\
            load_json(im_tile_rgb, preds=objects)
    else:
        _, images, ids = classify_cellscropping.\
            load_json(im_tile_rgb, preds=objects)

    print('--- Finished Cell Crops loading')

    t3 = time.time() - start_t
    t33 = float(t3) - float(t2)
    csv_dict['Cell Cropping'].append(round(t33, 3))
    csv_dict['Number of Cells'].append(len(ids))

    # ########################### CLASSIFICATION #######################
    print('--- Performing Cell Classification...')

    try:
        final_json = classify_main.predict(images, ids, CKPT)
    except ValueError:
        final_json = []
        print('!!!!! Cannot run classification: no cells were detected !!!!!')

    print('--- Finished Cell Classification')

    t4 = time.time() - start_t
    t44 = float(t4) - float(t3)
    csv_dict['Cell Classification'].append(round(t44, 3))

    # # Delete border nuclei
    # if args.ignore_border_nuclei is True:
    #     im_nuclei_seg_mask = htk_seg_label.delete_border(im_nuclei_seg_mask)

    # generate cell annotations
    cell_annot_list = cli_utils.create_tile_cell_annotations(
        final_json, tile_info, args.cell_annotation_format)

    t5 = time.time() - start_t
    t55 = float(t5) - float(t4)
    csv_dict['Annotation Writing'].append(round(t55, 3))

    return cell_annot_list, csv_dict
def eval(dataset_split, config_files, watch, from_global_step, override_params, files_per_class, iou_threshold, min_probability): """Evaluate models using dataset.""" # If the config file is empty, our config will be the base_config for the # default model. try: config = get_config(config_files, override_params=override_params) except KeyError: raise KeyError('model.type should be set on the custom config.') if not config.train.job_dir: raise KeyError('`job_dir` should be set.') if not config.train.run_name: raise KeyError('`run_name` should be set.') # `run_dir` is where the actual checkpoint and logs are located. run_dir = os.path.join(config.train.job_dir, config.train.run_name) # Only activate debug for if needed for debug visualization mode. if not config.train.debug: config.train.debug = config.eval.image_vis == 'debug' if config.train.debug or config.train.tf_debug: tf.logging.set_verbosity(tf.logging.DEBUG) else: tf.logging.set_verbosity(tf.logging.INFO) # Build the dataset tensors, overriding the default dataset split. config.dataset.split = dataset_split # Disable data augmentation. config.dataset.data_augmentation = [] # Only a single run over the dataset to calculate metrics. config.train.num_epochs = 1 if config.model.network.with_rcnn: config.model.rcnn.proposals.min_prob_threshold = min_probability else: config.model.rpn.proposals.min_prob_threshold = min_probability # Seed setup if config.train.seed: tf.set_random_seed(config.train.seed) # Set pretrained as not training config.model.base_network.trainable = False model_class = get_model(config.model.type) model = model_class(config) dataset_class = get_dataset(config.dataset.type) dataset = dataset_class(config) train_dataset = dataset() train_image = train_dataset['image'] train_objects = train_dataset['bboxes'] train_filename = train_dataset['filename'] # Build the graph of the model to evaluate, retrieving required # intermediate tensors. prediction_dict = model(train_image, train_objects) if config.model.network.with_rcnn: pred = prediction_dict['classification_prediction'] pred_objects = pred['objects'] pred_objects_classes = pred['labels'] pred_objects_scores = pred['probs'] else: # Force the num_classes to 1 config.model.network.num_classes = 1 pred = prediction_dict['rpn_prediction'] pred_objects = pred['proposals'] pred_objects_scores = pred['scores'] # When using only RPN all classes are 0. pred_objects_classes = tf.zeros( (tf.shape(pred_objects_scores)[0],), dtype=tf.int32 ) # Retrieve *all* the losses from the model and calculate their streaming # means, so we get the loss over the whole dataset. batch_losses = model.loss(prediction_dict, return_all=True) losses = {} for loss_name, loss_tensor in batch_losses.items(): loss_mean, _ = tf.metrics.mean( loss_tensor, name=loss_name, metrics_collections='metrics', updates_collections='metric_ops', ) full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name) losses[full_loss_name] = loss_mean metric_ops = tf.get_collection('metric_ops') init_op = tf.group( tf.global_variables_initializer(), tf.local_variables_initializer() ) # Using a global saver instead of the one for the model. saver = tf.train.Saver(sharded=True, allow_empty=True) # Aggregate the required ops to evaluate into a dict.. 
ops = { 'init_op': init_op, 'metric_ops': metric_ops, 'pred_objects': pred_objects, 'pred_objects_classes': pred_objects_classes, 'pred_objects_scores': pred_objects_scores, 'train_objects': train_objects, 'losses': losses, 'prediction_dict': prediction_dict, 'filename': train_filename, 'train_image': train_image } metrics_scope = '{}_metrics'.format(dataset_split) # Use global writer for all checkpoints. We don't want to write different # files for each checkpoint. writer = tf.summary.FileWriter(run_dir) files_to_visualize = {} last_global_step = from_global_step while True: # Get the checkpoint files to evaluate. try: checkpoints = get_checkpoints( run_dir, last_global_step, last_only=not watch ) except ValueError as e: if not watch: tf.logging.error('Missing checkpoint.') raise e tf.logging.warning( 'Missing checkpoint; Checking again in a moment') time.sleep(5) continue for checkpoint in checkpoints: # Always returned in order, so it's safe to assign directly. tf.logging.info( 'Evaluating global_step {} using checkpoint \'{}\''.format( checkpoint['global_step'], checkpoint['file'] ) ) try: start = time.time() evaluate_once( config, writer, saver, ops, checkpoint, metrics_scope=metrics_scope, image_vis=config.eval.image_vis, files_per_class=files_per_class, files_to_visualize=files_to_visualize, iou_threshold=iou_threshold, min_probability=min_probability ) last_global_step = checkpoint['global_step'] tf.logging.info('Evaluated in {:.2f}s'.format( time.time() - start )) except tf.errors.NotFoundError: # The checkpoint is not ready yet. It was written in the # checkpoints file, but it still hasn't been completely saved. tf.logging.info( 'Checkpoint {} is not ready yet. ' 'Checking again in a moment.'.format( checkpoint['file'] ) ) time.sleep(5) continue # If no watching was requested, finish the execution. if not watch: return # Sleep for a moment and check for new checkpoints. tf.logging.info('All checkpoints evaluated; sleeping for a moment') time.sleep(5)
def create(config_files, override_params, entries):
    # Parse the entries passed as options.
    entries = parse_entries(entries)
    if entries is None:
        return

    click.echo('Creating checkpoint for given configuration...')

    # Get and build the configuration file for the model.
    config = get_config(config_files, override_params=override_params)

    # Retrieve the files for the last checkpoint available.
    run_dir = os.path.join(config.train.job_dir, config.train.run_name)
    ckpt = tf.train.get_checkpoint_state(run_dir)
    if not ckpt or not ckpt.all_model_checkpoint_paths:
        click.echo("Couldn't find checkpoint in '{}'.".format(run_dir))
        return

    last_checkpoint = sorted([
        {'global_step': int(path.split('-')[-1]), 'file': path}
        for path in ckpt.all_model_checkpoint_paths
    ], key=lambda c: c['global_step'])[-1]['file']

    checkpoint_prefix = os.path.basename(last_checkpoint)
    checkpoint_paths = [
        os.path.join(run_dir, file)
        for file in os.listdir(run_dir)
        if file.startswith(checkpoint_prefix)
    ]

    # Find the `classes.json` file.
    classes_path = os.path.join(config.dataset.dir, 'classes.json')
    if not os.path.exists(classes_path):
        classes_path = None

    # Create a checkpoint_id to identify the checkpoint.
    checkpoint_id = str(uuid.uuid4()).replace('-', '')[:12]

    # Update the directory paths for the configuration file. Since it's going
    # to be packed into a single tar file, we set them to the current
    # directory.
    config.dataset.dir = '.'
    config.train.job_dir = '.'
    config.train.run_name = checkpoint_id

    # Create the directory that will contain the model.
    path = get_checkpoint_path(checkpoint_id)
    tf.gfile.MakeDirs(path)

    with open(os.path.join(path, 'config.yml'), 'w') as f:
        json.dump(config, f)

    # Add the checkpoint files.
    for checkpoint_path in checkpoint_paths:
        shutil.copy2(checkpoint_path, path)

    # Add `checkpoint` file to indicate where the checkpoint is located. We
    # need to create it manually instead of just copying as it may contain
    # absolute paths.
    with open(os.path.join(path, 'checkpoint'), 'w') as f:
        f.write("""
model_checkpoint_path: "{0}"
all_model_checkpoint_paths: "{0}"
""".format(checkpoint_prefix))

    # Add the `classes.json` file. Also get the number of classes, if
    # available.
    num_classes = None
    if classes_path:
        shutil.copy2(classes_path, path)
        with open(classes_path) as f:
            num_classes = len(json.load(f))

    # Store the new checkpoint into the checkpoint index.
    metadata = {
        'id': checkpoint_id,
        'name': entries.get('name', ''),
        'description': entries.get('description', ''),
        'alias': entries.get('alias', ''),
        'model': config.model.type,
        'dataset': {
            'name': entries.get('dataset.name', ''),
            'num_classes': (
                num_classes or entries.get('dataset.num_classes', None)
            ),
        },
        'luminoth_version': lumi_version,
        'created_at': datetime.utcnow().isoformat(),
        'status': 'LOCAL',
        'source': 'local',
        'url': None,  # Only for remotes.
    }

    db = read_checkpoint_db()
    db['checkpoints'].append(metadata)
    save_checkpoint_db(db)

    click.echo('Checkpoint {} created successfully.'.format(checkpoint_id))
def eval( dataset_split, config_files, watch, from_global_step, override_params, files_per_class, max_detections, ): """Evaluate models using dataset.""" # If the config file is empty, our config will be the base_config for the # default model. try: config = get_config(config_files, override_params=override_params) except KeyError: raise KeyError("model.type should be set on the custom config.") if not config.train.job_dir: raise KeyError("`job_dir` should be set.") if not config.train.run_name: raise KeyError("`run_name` should be set.") # `run_dir` is where the actual checkpoint and logs are located. run_dir = os.path.join(config.train.job_dir, config.train.run_name) # Only activate debug for if needed for debug visualization mode. if not config.train.debug: config.train.debug = config.eval.image_vis == "debug" if config.train.debug or config.train.tf_debug: tf.logging.set_verbosity(tf.logging.DEBUG) else: tf.logging.set_verbosity(tf.logging.INFO) # Build the dataset tensors, overriding the default dataset split. config.dataset.split = dataset_split # Disable data augmentation. config.dataset.data_augmentation = [] # Attempt to get class names, if available. classes_file = os.path.join(config.dataset.dir, "classes.json") if tf.gfile.Exists(classes_file): class_labels = json.load(tf.gfile.GFile(classes_file)) else: class_labels = None if config.model.type == "fasterrcnn": # Override max detections with specified value. if config.model.network.with_rcnn: config.model.rcnn.proposals.total_max_detections = max_detections else: config.model.rpn.proposals.post_nms_top_n = max_detections # Also overwrite `min_prob_threshold` in order to use all detections. config.model.rcnn.proposals.min_prob_threshold = 0.0 elif config.model.type == "ssd": config.model.proposals.total_max_detections = max_detections config.model.proposals.min_prob_threshold = 0.0 else: raise ValueError("Model type '{}' not supported".format( config.model.type)) # Only a single run over the dataset to calculate metrics. config.train.num_epochs = 1 # Seed setup. if config.train.seed: tf.set_random_seed(config.train.seed) # Set pretrained as not training. config.model.base_network.trainable = False model_class = get_model(config.model.type) model = model_class(config) dataset_class = get_dataset(config.dataset.type) dataset = dataset_class(config) train_dataset = dataset() train_image = train_dataset["image"] train_objects = train_dataset["bboxes"] train_filename = train_dataset["filename"] # Build the graph of the model to evaluate, retrieving required # intermediate tensors. prediction_dict = model(train_image, train_objects) if config.model.type == "ssd" or config.model.network.with_rcnn: pred = prediction_dict["classification_prediction"] pred_objects = pred["objects"] pred_objects_classes = pred["labels"] pred_objects_scores = pred["probs"] else: # Force the num_classes to 1. config.model.network.num_classes = 1 pred = prediction_dict["rpn_prediction"] pred_objects = pred["proposals"] pred_objects_scores = pred["scores"] # When using only RPN all classes are 0. pred_objects_classes = tf.zeros((tf.shape(pred_objects_scores)[0], ), dtype=tf.int32) # Retrieve *all* the losses from the model and calculate their streaming # means, so we get the loss over the whole dataset. 
batch_losses = model.loss(prediction_dict, return_all=True) losses = {} for loss_name, loss_tensor in batch_losses.items(): loss_mean, _ = tf.metrics.mean( loss_tensor, name=loss_name, metrics_collections="metrics", updates_collections="metric_ops", ) full_loss_name = "{}_losses/{}".format(dataset_split, loss_name) losses[full_loss_name] = loss_mean metric_ops = tf.get_collection("metric_ops") init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) # Using a global saver instead of the one for the model. saver = tf.train.Saver(sharded=True, allow_empty=True) # Aggregate the required ops to evaluate into a dict. ops = { "init_op": init_op, "metric_ops": metric_ops, "pred_objects": pred_objects, "pred_objects_classes": pred_objects_classes, "pred_objects_scores": pred_objects_scores, "train_objects": train_objects, "losses": losses, "prediction_dict": prediction_dict, "filename": train_filename, "train_image": train_image, } metrics_scope = "{}_metrics".format(dataset_split) # Use global writer for all checkpoints. We don't want to write different # files for each checkpoint. writer = tf.summary.FileWriter(run_dir) files_to_visualize = {} last_global_step = from_global_step while True: # Get the checkpoint files to evaluate. try: checkpoints = get_checkpoints(run_dir, last_global_step, last_only=not watch) except ValueError as e: if not watch: tf.logging.error("Missing checkpoint.") raise e tf.logging.warning( "Missing checkpoint; Checking again in a moment") time.sleep(5) continue for checkpoint in checkpoints: # Always returned in order, so it's safe to assign directly. tf.logging.info( "Evaluating global_step {} using checkpoint '{}'".format( checkpoint["global_step"], checkpoint["file"])) try: start = time.time() evaluate_once( config, writer, saver, ops, checkpoint, class_labels=class_labels, metrics_scope=metrics_scope, image_vis=config.eval.image_vis, files_per_class=files_per_class, files_to_visualize=files_to_visualize, ) last_global_step = checkpoint["global_step"] tf.logging.info("Evaluated in {:.2f}s".format(time.time() - start)) except tf.errors.NotFoundError: # The checkpoint is not ready yet. It was written in the # checkpoints file, but it still hasn't been completely saved. tf.logging.info("Checkpoint {} is not ready yet. " "Checking again in a moment.".format( checkpoint["file"])) time.sleep(5) continue # If no watching was requested, finish the execution. if not watch: return # Sleep for a moment and check for new checkpoints. tf.logging.info("All checkpoints evaluated; sleeping for a moment") time.sleep(5)
def detect_tile_nuclei(slide_path, tile_position, args, it_kwargs,
                       src_mu_lab=None, src_sigma_lab=None, debug=False):

    # =========================================================================
    # ======================= Tile Loading ====================================
    # =========================================================================
    print('\n>> Loading Tile ... \n')

    csv_dict = {}

    csv_dict['PreparationTime'] = []
    csv_dict['ColorDeconvTime'] = []
    csv_dict['TotalTileLoadingTime'] = []

    csv_dict['CKPTLoadingTime'] = []
    csv_dict['ModelInfernceTime'] = []
    csv_dict['DetectionTime'] = []

    csv_dict['ROIShape'] = []
    csv_dict['ObjectsDict'] = []
    csv_dict['NumObjects'] = []

    csv_dict['AnnotationWritingTime'] = []

    csv_dict['AnnotationDict'] = []
    csv_dict['AnalysisDict'] = []

    start_time = time.time()
    total_tileloading_start_time = time.time()

    ts = large_image.getTileSource(slide_path)
    tile_info = ts.getSingleTile(
        tile_position=tile_position,
        format=large_image.tilesource.TILE_FORMAT_NUMPY,
        **it_kwargs)
    im_tile = tile_info['tile'][:, :, :3]
    csv_dict['ROIShape'] = im_tile.shape[:2]

    prep_time = time.time() - start_time
    csv_dict['PreparationTime'] = round(prep_time, 3)

    # =========================================================================
    # ================= Img Normalization & Color Deconv ======================
    # =========================================================================
    print('\n>> Color Deconvolving ... \n')
    start_time = time.time()

    im_nmzd = htk_cnorm.reinhard(
        im_tile,
        REFERENCE_MU_LAB,
        REFERENCE_STD_LAB,
        src_mu=src_mu_lab,
        src_sigma=src_sigma_lab
    )

    # perform color deconvolution
    if args.deconv_method == 'ruifrok':
        w = cli_utils.get_stain_matrix(args)
        im_stains = htk_cdeconv.color_deconvolution(
            im_nmzd, w).Stains.astype(np.float)[:, :, :2]

    elif args.deconv_method == 'macenko':
        w_est = htk_cdeconv.rgb_separate_stains_macenko_pca(im_tile, 255)
        im_stains = htk_cdeconv.color_deconvolution(
            im_tile, w_est, 255).Stains.astype(np.float)
        ch1 = htk_cdeconv.find_stain_index(
            htk_cdeconv.stain_color_map[args.stain_1], w_est)
        ch2 = htk_cdeconv.find_stain_index(
            htk_cdeconv.stain_color_map[args.stain_2], w_est)
        im_stains = im_stains[:, :, [ch1, ch2]]

    else:
        raise ValueError('Invalid deconvolution method parameter.')

    # =========================================================================
    # ====================== Fuse the stain1 & stain2 pix =====================
    # =========================================================================

    # compute nuclear foreground mask
    im_fgnd_mask_stain_1 = im_stains[
        :, :, 0] < threshold_yen(im_stains[:, :, 0])
    im_fgnd_mask_stain_2 = im_stains[
        :, :, 1] < threshold_yen(im_stains[:, :, 1])
    im_fgnd_seg_mask = im_fgnd_mask_stain_1 | im_fgnd_mask_stain_2

    # segment nuclei
    im_nuc_det_input = np.squeeze(np.min(im_stains[:, :, :2], axis=2))
    print('---> Fusing 2 Stains')

    deconv_time = time.time() - start_time
    csv_dict['ColorDeconvTime'] = round(deconv_time, 3)

    # =========================================================================
    # ================= Nuclei Detection Deep Learning Block ==================
    # =========================================================================

    total_tileloading_time = time.time() - total_tileloading_start_time
    csv_dict['TotalTileLoadingTime'] = round(total_tileloading_time, 3)

    start_time = time.time()

    config = get_config(CONFIG)
    config.model.rcnn.proposals.total_max_detections = args.max_det
    config.model.rcnn.proposals.min_prob_threshold = args.min_prob

    im_nuc_det_input = np.stack((im_nuc_det_input,) * 3, axis=-1)
    # =========================================================================

    tf.reset_default_graph()

    dataset_class = get_dataset('object_detection')
    model_class = get_model('fasterrcnn')
    dataset = dataset_class(config)
    model = model_class(config)

    graph = tf.Graph()
    session = tf.Session(graph=graph)

    with graph.as_default():
        image_placeholder = tf.placeholder(
            tf.float32, (None, None, 3), name='Input_Placeholder'
        )
        pred_dict = model(image_placeholder)

        ckpt_loading_start_time = time.time()

        saver = tf.train.Saver(sharded=True, allow_empty=True)
        saver.restore(session, CKPT_DIR)
        tf.logging.info('Loaded checkpoint.')

        ckpt_loading_time = time.time() - ckpt_loading_start_time
        csv_dict['CKPTLoadingTime'] = round(ckpt_loading_time, 3)

        inference_start_time = time.time()

        cls_prediction = pred_dict['classification_prediction']
        objects_tf = cls_prediction['objects']
        objects_labels_tf = cls_prediction['labels']
        objects_labels_prob_tf = cls_prediction['probs']

        fetches = {
            'objects': objects_tf,
            'labels': objects_labels_tf,
            'probs': objects_labels_prob_tf,
        }

        fetched = session.run(fetches, feed_dict={
            image_placeholder: np.array(im_nuc_det_input)
        })

        inference_time = time.time() - inference_start_time
        csv_dict['ModelInfernceTime'] = round(inference_time, 3)

        objects = fetched['objects']
        labels = fetched['labels'].tolist()
        probs = fetched['probs'].tolist()

        # Cast to int to consistently return the same type in Python 2 and 3
        objects = [
            [int(round(coord)) for coord in obj]
            for obj in objects.tolist()
        ]

        predictions = sorted([
            {
                'bbox': obj,
                'label': label,
                'prob': round(prob, 4),
            } for obj, label, prob in zip(objects, labels, probs)
        ], key=lambda x: x['prob'], reverse=True)

    print('\n>> Finishing Detection ... \n')
    print('***** Number of Detected Cells ****** : ', len(predictions))

    detection_time = time.time() - start_time
    csv_dict['DetectionTime'] = round(detection_time, 3)
    csv_dict['NumObjects'] = len(predictions)
    csv_dict['ObjectsDict'] = predictions

    # =========================================================================
    # ======================= TODO: Implement border deletion =================
    # =========================================================================

    # =========================================================================
    # ======================= Write Annotations ===============================
    # =========================================================================
    start_time = time.time()

    objects_df = pd.DataFrame(objects)

    formatted_annot_list, \
        formatter_analysis_list = cli_utils.convert_preds_to_utilformat(
            objects_df,
            probs,
            args.ignore_border_nuclei,
            im_tile_size=args.analysis_tile_size)

    nuclei_annot_list = cli_utils.create_tile_nuclei_annotations(
        formatted_annot_list, tile_info, args.nuclei_annotation_format)

    csv_dict['AnnotationDict'] = nuclei_annot_list
    csv_dict['AnalysisDict'] = formatter_analysis_list

    num_nuclei = len(nuclei_annot_list)

    anot_time = time.time() - start_time
    csv_dict['AnnotationWritingTime'] = round(anot_time, 3)

    return csv_dict
def evaluate(dataset_split, config_files, job_dir, watch, from_global_step, override_params, files_per_class): """ Evaluate models using dataset. """ # If the config file is empty, our config will be the base_config for the # default model. try: config = get_config(config_files, override_params=override_params) except KeyError: raise KeyError('model.type should be set on the custom config.') config.train.job_dir = job_dir or config.train.job_dir # Only activate debug for if needed for debug visualization mode. if not config.train.debug: config.train.debug = config.eval.image_vis == 'debug' if config.train.debug or config.train.tf_debug: tf.logging.set_verbosity(tf.logging.DEBUG) else: tf.logging.set_verbosity(tf.logging.INFO) # Build the dataset tensors, overriding the default dataset split. config.dataset.split = dataset_split # Disable data augmentation. config.dataset.data_augmentation = [] # Only a single run over the dataset to calculate metrics. config.train.num_epochs = 1 # Seed setup if config.train.seed: tf.set_random_seed(config.train.seed) # Set pretrained as not training config.model.base_network.trainable = False model_class = get_model(config.model.type) model = model_class(config) dataset_class = get_dataset(config.dataset.type) dataset = dataset_class(config) train_dataset = dataset() train_image = train_dataset['image'] train_objects = train_dataset['bboxes'] train_filename = train_dataset['filename'] # Build the graph of the model to evaluate, retrieving required # intermediate tensors. prediction_dict = model( train_image, train_objects ) if config.model.network.with_rcnn: pred = prediction_dict['classification_prediction'] pred_objects = pred['objects'] pred_objects_classes = pred['labels'] pred_objects_scores = pred['probs'] else: # Force the num_classes to 1 config.model.network.num_classes = 1 pred = prediction_dict['rpn_prediction'] pred_objects = pred['proposals'] pred_objects_scores = pred['scores'] # When using only RPN all classes are 0. pred_objects_classes = tf.zeros( (tf.shape(pred_objects_scores)[0],), dtype=tf.int32 ) # Retrieve *all* the losses from the model and calculate their streaming # means, so we get the loss over the whole dataset. batch_losses = model.loss(prediction_dict, return_all=True) losses = {} for loss_name, loss_tensor in batch_losses.items(): loss_mean, _ = tf.metrics.mean( loss_tensor, name=loss_name, metrics_collections='metrics', updates_collections='metric_ops', ) full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name) losses[full_loss_name] = loss_mean metric_ops = tf.get_collection('metric_ops') init_op = tf.group( tf.global_variables_initializer(), tf.local_variables_initializer() ) # Using a global saver instead of the one for the model. saver = tf.train.Saver(sharded=True, allow_empty=True) # Aggregate the required ops to evaluate into a dict.. ops = { 'init_op': init_op, 'metric_ops': metric_ops, 'pred_objects': pred_objects, 'pred_objects_classes': pred_objects_classes, 'pred_objects_scores': pred_objects_scores, 'train_objects': train_objects, 'losses': losses, 'prediction_dict': prediction_dict, 'filename': train_filename, 'train_image': train_image } metrics_scope = '{}_metrics'.format(dataset_split) # Use global writer for all checkpoints. We don't want to write different # files for each checkpoint. writer = tf.summary.FileWriter(config.train.job_dir) files_to_visualize = {} last_global_step = from_global_step while True: # Get the checkpoint files to evaluate. 
try: checkpoints = get_checkpoints(config, last_global_step) except ValueError as e: if not watch: tf.logging.error('Missing checkpoint.') raise e tf.logging.warning( 'Missing checkpoint; Checking again in a minute') time.sleep(60) continue for checkpoint in checkpoints: # Always returned in order, so it's safe to assign directly. tf.logging.info( 'Evaluating global_step {} using checkpoint \'{}\''.format( checkpoint['global_step'], checkpoint['file'] ) ) try: start = time.time() evaluate_once( config, writer, saver, ops, checkpoint, metrics_scope=metrics_scope, image_vis=config.eval.image_vis, files_per_class=files_per_class, files_to_visualize=files_to_visualize ) last_global_step = checkpoint['global_step'] tf.logging.info('Evaluated in {:.2f}s'.format( time.time() - start )) except tf.errors.NotFoundError: # The checkpoint is not ready yet. It was written in the # checkpoints file, but it still hasn't been completely saved. tf.logging.info( 'Checkpoint {} is not ready yet. ' 'Checking again in a minute.'.format( checkpoint['file'] ) ) time.sleep(60) continue # If no watching was requested, finish the execution. if not watch: return # Sleep for a minute and check for new checkpoints. tf.logging.info('All checkpoints evaluated; sleeping for a minute') time.sleep(60)
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):
    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Creates bucket for logs and models if it doesn't exist
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)
    job_dir = 'gs://{}/{}/'.format(bucket_name, base_path)

    override_params = [
        'train.job_dir={}'.format(job_dir),
    ]

    if dataset:
        # Check if absolute or relative dataset path
        if not dataset.startswith('gs://'):
            dataset = 'gs://{}'.format(dataset)
        override_params.append('dataset.dir={}'.format(dataset))

    config = get_config(config_files, override_params=override_params)
    # We should validate config before submitting job

    # Update final config file to job bucket
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', os.path.join(job_dir, DEFAULT_CONFIG_FILENAME)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': [
            'gs://{}/{}'.format(bucket_name, package_path)
        ],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': job_dir,
        'runtimeVersion': RUNTIME_VERSION,
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        if worker_count > 0:
            training_inputs['workerCount'] = worker_count
            training_inputs['workerType'] = worker_type
        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {
        'jobId': job_id,
        'trainingInput': training_inputs
    }

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))
        save_run(config, environment='gcloud', extra_config=job_spec)
    except Exception as err:
        click.echo(
            'There was an error creating the training job. '
            'Check the details: \n{}'.format(err._get_reason())
        )
def create(config_files, override_params, entries):
    # Parse the entries passed as options.
    entries = parse_entries(entries)
    if entries is None:
        return

    click.echo("Creating checkpoint for given configuration...")

    # Get and build the configuration file for the model.
    config = get_config(config_files, override_params=override_params)

    # Retrieve the files for the last checkpoint available.
    run_dir = os.path.join(config.train.job_dir, config.train.run_name)
    ckpt = tf.train.get_checkpoint_state(run_dir)
    if not ckpt or not ckpt.all_model_checkpoint_paths:
        click.echo("Couldn't find checkpoint in '{}'.".format(run_dir))
        return

    last_checkpoint = sorted(
        [{
            "global_step": int(path.split("-")[-1]),
            "file": path
        } for path in ckpt.all_model_checkpoint_paths],
        key=lambda c: c["global_step"],
    )[-1]["file"]

    checkpoint_prefix = os.path.basename(last_checkpoint)
    checkpoint_paths = [
        os.path.join(run_dir, file) for file in os.listdir(run_dir)
        if file.startswith(checkpoint_prefix)
    ]

    # Find the `classes.json` file.
    classes_path = os.path.join(config.dataset.dir, "classes.json")
    if not os.path.exists(classes_path):
        classes_path = None

    # Create a checkpoint_id to identify the checkpoint.
    checkpoint_id = str(uuid.uuid4()).replace("-", "")[:12]

    # Update the directory paths for the configuration file. Since it's going
    # to be packed into a single tar file, we set them to the current
    # directory.
    config.dataset.dir = "."
    config.train.job_dir = "."
    config.train.run_name = checkpoint_id

    # Create the directory that will contain the model.
    path = get_checkpoint_path(checkpoint_id)
    tf.gfile.MakeDirs(path)

    with open(os.path.join(path, "config.yml"), "w") as f:
        json.dump(config, f)

    # Add the checkpoint files.
    for checkpoint_path in checkpoint_paths:
        shutil.copy2(checkpoint_path, path)

    # Add `checkpoint` file to indicate where the checkpoint is located. We
    # need to create it manually instead of just copying as it may contain
    # absolute paths.
    with open(os.path.join(path, "checkpoint"), "w") as f:
        f.write("""
model_checkpoint_path: "{0}"
all_model_checkpoint_paths: "{0}"
""".format(checkpoint_prefix))

    # Add the `classes.json` file. Also get the number of classes, if
    # available.
    num_classes = None
    if classes_path:
        shutil.copy2(classes_path, path)
        with open(classes_path) as f:
            num_classes = len(json.load(f))

    # Store the new checkpoint into the checkpoint index.
    metadata = {
        "id": checkpoint_id,
        "name": entries.get("name", ""),
        "description": entries.get("description", ""),
        "alias": entries.get("alias", ""),
        "model": config.model.type,
        "dataset": {
            "name": entries.get("dataset.name", ""),
            "num_classes":
            (num_classes or entries.get("dataset.num_classes", None)),
        },
        "luminoth_version": lumi_version,
        "created_at": datetime.utcnow().isoformat(),
        "status": "LOCAL",
        "source": "local",
        "url": None,  # Only for remotes.
    }

    db = read_checkpoint_db()
    db["checkpoints"].append(metadata)
    save_checkpoint_db(db)

    click.echo("Checkpoint {} created successfully.".format(checkpoint_id))
def predict(path_or_dir, config_files, checkpoint, override_params,
            output_path, save_media_to, min_prob, max_detections, only_class,
            ignore_class, debug):
    """Obtain a model's predictions.

    Receives either `config_files` or `checkpoint` in order to load the
    correct model. Afterwards, runs the model through the inputs specified by
    `path-or-dir`, returning predictions according to the format specified by
    `output`.

    Additional model behavior may be modified with `min-prob`, `only-class`
    and `ignore-class`.
    """
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.ERROR)

    if only_class and ignore_class:
        click.echo(
            "Only one of `only-class` or `ignore-class` may be specified.")
        return

    # Process the input and get the actual files to predict.
    files = resolve_files(path_or_dir)
    if not files:
        error = 'No files to predict found. Accepted formats are: {}.'.format(
            ', '.join(IMAGE_FORMATS + VIDEO_FORMATS))
        click.echo(error)
        return
    else:
        click.echo('Found {} files to predict.'.format(len(files)))

    # Build the `Formatter` based on the outputs, which automatically writes
    # the formatted output to all the requested output files.
    if output_path == '-':
        output = sys.stdout
    else:
        output = open(output_path, 'w')

    # Create `save_media_to` if specified and it doesn't exist.
    if save_media_to:
        tf.gfile.MakeDirs(save_media_to)

    # Resolve the config to use and initialize the model.
    if checkpoint:
        config = get_checkpoint_config(checkpoint)
    elif config_files:
        config = get_config(config_files)
    else:
        click.echo(
            'Neither checkpoint nor config specified, assuming `accurate`.')
        config = get_checkpoint_config('accurate')

    if override_params:
        config = override_config_params(config, override_params)

    # Filter bounding boxes according to `min_prob` and `max_detections`.
    if config.model.type == 'fasterrcnn':
        if config.model.network.with_rcnn:
            config.model.rcnn.proposals.total_max_detections = max_detections
        else:
            config.model.rpn.proposals.post_nms_top_n = max_detections
        config.model.rcnn.proposals.min_prob_threshold = min_prob
    elif config.model.type == 'ssd':
        config.model.proposals.total_max_detections = max_detections
        config.model.proposals.min_prob_threshold = min_prob
    else:
        raise ValueError("Model type '{}' not supported".format(
            config.model.type))

    # Instantiate the model indicated by the config.
    network = PredictorNetwork(config)

    # Iterate over files and run the model on each.
    for file in files:
        # Get the media output path, if media storage is requested.
        save_path = os.path.join(
            save_media_to, 'pred_{}'.format(os.path.basename(file))
        ) if save_media_to else None

        file_type = get_file_type(file)
        predictor = predict_image if file_type == 'image' else predict_video

        objects = predictor(
            network, file,
            only_classes=only_class,
            ignore_classes=ignore_class,
            save_path=save_path,
        )

        # TODO: Not writing jsons for video files for now.
        if objects is not None and file_type == 'image':
            output.write(json.dumps({
                'file': file,
                'objects': objects,
            }) + '\n')

    output.close()
def train(job_id, bucket_name, region, config_files, dataset, scale_tier,
          master_type, worker_type, worker_count, parameter_server_type,
          parameter_server_count):
    account = ServiceAccount()
    account.validate_region(region)

    if bucket_name is None:
        bucket_name = 'luminoth-{}'.format(account.client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    # Create the bucket for logs and models if it doesn't exist.
    bucket = account.get_bucket(bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define the path in the bucket to store the job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)
    job_dir = 'gs://{}/{}'.format(bucket_name, base_path)

    override_params = [
        'train.job_dir={}'.format(job_dir),
    ]

    if dataset:
        # Check whether the dataset path is absolute or relative.
        if not dataset.startswith('gs://'):
            dataset = 'gs://{}'.format(dataset)
        override_params.append('dataset.dir={}'.format(dataset))

    config = get_config(config_files, override_params=override_params)

    # Upload the final config file to the job bucket.
    config_path = '{}/{}'.format(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', '{}/{}'.format(job_dir, DEFAULT_CONFIG_FILENAME)]

    cloudml = account.cloud_service('ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': ['gs://{}/{}'.format(bucket_name, package_path)],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': job_dir,
        'runtimeVersion': RUNTIME_VERSION,
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        if worker_count > 0:
            training_inputs['workerCount'] = worker_count
            training_inputs['workerType'] = worker_type
        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {
        'jobId': job_id,
        'trainingInput': training_inputs,
    }

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(account.project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job submitted successfully.')
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))
        click.echo('\nJob id: {}'.format(job_id))

        save_run(config, environment='gcloud', extra_config=job_spec)
    except Exception as err:
        click.echo('There was an error creating the training job. '
                   'Check the details: \n{}'.format(err._get_reason()))
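# A hedged sketch (not part of the original code) of checking on a submitted
# job through the same discovery-based Cloud ML client used above. It assumes
# `account` exposes `cloud_service` and `project_id` exactly as in `train`;
# the helper name `get_job_state` is hypothetical.
def get_job_state(account, job_id):
    cloudml = account.cloud_service('ml')
    name = 'projects/{}/jobs/{}'.format(account.project_id, job_id)
    # The response includes fields such as 'state' and 'createTime'.
    res = cloudml.projects().jobs().get(name=name).execute()
    return res.get('state')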
def predict(path_or_dir, config_files, checkpoint, override_params,
            output_dir, save, min_prob, ignore_classes, debug):
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Get the file paths to predict.
    if tf.gfile.IsDirectory(path_or_dir):
        file_paths = [
            os.path.join(path_or_dir, f)
            for f in tf.gfile.ListDirectory(path_or_dir)
            if get_filetype(f) in ('image', 'video')
        ]
    else:
        if get_filetype(path_or_dir) in ('image', 'video'):
            file_paths = [path_or_dir]
        else:
            file_paths = []

    errors = 0
    successes = 0
    created_files_paths = []
    total_files = len(file_paths)
    if total_files == 0:
        no_files_message = ("No images or videos found. "
                            "Accepted formats -> Image: {} - Video: {}")
        tf.logging.error(
            no_files_message.format(IMAGE_FORMATS, VIDEO_FORMATS))
        exit()

    # Resolve the config to use and initialize the model.
    if checkpoint:
        config = get_checkpoint_config(checkpoint)
    elif config_files:
        config = get_config(config_files)
    else:
        click.echo('You must specify either a checkpoint or a config file.')
        exit()

    if override_params:
        config = override_config_params(config, override_params)

    network = PredictorNetwork(config)

    # Create `output_dir` if it doesn't exist.
    if output_dir:
        tf.gfile.MakeDirs(output_dir)

    tf.logging.info('Getting predictions for {} files'.format(total_files))

    # Iterate over file paths.
    for file_path in file_paths:
        save_path = 'pred_' + os.path.basename(file_path)
        if output_dir:
            save_path = os.path.join(output_dir, save_path)

        if get_filetype(file_path) == 'image':
            click.echo('Predicting {}...'.format(file_path))
            with tf.gfile.Open(file_path, 'rb') as f:
                try:
                    image = Image.open(f).convert('RGB')
                except (tf.errors.OutOfRangeError, OSError) as e:
                    tf.logging.warning('Error: {}'.format(e))
                    tf.logging.warning("Couldn't open: {}".format(file_path))
                    errors += 1
                    continue

            # Run the image through the network.
            prediction = network.predict_image(image)
            successes += 1

            # Filter results if required by the user.
            if ignore_classes:
                prediction = filter_classes(prediction, ignore_classes)

            # Save the prediction JSON file.
            with open(save_path + '.json', 'w') as outfile:
                json.dump(prediction, outfile)
            created_files_paths.append(save_path + '.json')

            # Save the predicted image.
            if save:
                with tf.gfile.Open(file_path, 'rb') as im_file:
                    image = Image.open(im_file)
                    draw_bboxes_on_image(image, prediction, min_prob)
                    image.save(save_path)
                created_files_paths.append(save_path)

        elif get_filetype(file_path) == 'video':
            # NOTE: We'll hardcode the video output to mp4 for the time being.
            save_path = os.path.splitext(save_path)[0] + '.mp4'
            try:
                writer = skvideo.io.FFmpegWriter(save_path)
            except AssertionError as e:
                tf.logging.error(e)
                tf.logging.error(
                    "Please install ffmpeg before making video predictions.")
                exit()

            num_of_frames = int(
                skvideo.io.ffprobe(file_path)['video']['@nb_frames'])
            video_progress_bar = click.progressbar(
                skvideo.io.vreader(file_path),
                length=num_of_frames,
                label='Predicting {}'.format(file_path))

            with video_progress_bar as bar:
                try:
                    for frame in bar:
                        # Run the frame through the network.
                        prediction = network.predict_image(frame)

                        # Filter results if required by the user.
                        if ignore_classes:
                            prediction = filter_classes(
                                prediction, ignore_classes)

                        image = Image.fromarray(frame)
                        draw_bboxes_on_image(image, prediction, min_prob)
                        writer.writeFrame(np.array(image))
                except RuntimeError as e:
                    # Print a newline so the error doesn't land next to the
                    # progress bar.
                    click.echo()
                    tf.logging.error('Error: {}'.format(e))
                    tf.logging.error(
                        'Corrupt video file: {}'.format(file_path))
                    tf.logging.error(
                        'Partially processed video file saved in {}'.format(
                            save_path))
                    errors += 1

            writer.close()
            created_files_paths.append(save_path)
        else:
            tf.logging.warning("{} isn't an image/video".format(file_path))

    # Generate logs.
    tf.logging.info("Created the following files: {}".format(
        ', '.join(created_files_paths)))

    if errors:
        tf.logging.warning('{} errors.'.format(errors))
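# `filter_classes` is called above but not defined in this snippet. A minimal
# sketch under the assumption that `network.predict_image` returns a list of
# dicts each carrying a 'label' key; the actual prediction structure may
# differ depending on the model version in use.
def filter_classes(prediction, ignore_classes):
    # Drop every predicted object whose label was asked to be ignored.
    return [
        obj for obj in prediction
        if obj.get('label') not in ignore_classes
    ]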