def find_prediction_file(self, prediction_path: pathlib.Path) -> None:
    """Locate the unique prediction file matching this ground-truth image.

    A candidate matches when its stem contains the image's 7-digit number
    and, if an attribute is set, the attribute id as well. Exactly one
    match is required; otherwise a ScoreException is raised.
    """
    image_number: str = self.image_id.split('_')[1]

    def is_match(candidate: pathlib.Path) -> bool:
        # The image number must always appear in the file stem.
        if image_number not in candidate.stem:
            return False
        # When an attribute id is set, it must appear in the stem too.
        if self.attribute_id and self.attribute_id not in candidate.stem:
            return False
        return True

    candidates = [
        candidate for candidate in prediction_path.iterdir()
        if is_match(candidate)
    ]

    if not candidates:
        raise ScoreException(
            f'No matching submission for: {self.truth_file.name}')
    if len(candidates) > 1:
        raise ScoreException(
            f'Multiple matching submissions for: {self.truth_file.name}')
    self.prediction_file = candidates[0]
def score(truth_path: pathlib.Path, prediction_path: pathlib.Path) -> ScoresType:
    """Score the single prediction CSV against the ground-truth CSV.

    Finds the 'ISIC*GroundTruth.csv' truth file, requires exactly one
    '.csv' prediction file, and delegates to compute_metrics.
    """
    # First file whose name matches the ground-truth naming pattern.
    truth_file = next(
        (candidate for candidate in truth_path.iterdir()
         if re.match(r'^ISIC.*GroundTruth\.csv$', candidate.name)),
        None)
    if truth_file is None:
        raise ScoreException('Internal error, truth file could not be found.')

    prediction_files = [
        candidate for candidate in prediction_path.iterdir()
        if candidate.suffix.lower() == '.csv'
    ]
    if len(prediction_files) > 1:
        raise ScoreException(
            'Multiple prediction files submitted. Exactly one CSV file should be submitted.'
        )
    elif len(prediction_files) < 1:
        raise ScoreException(
            'No prediction files submitted. Exactly one CSV file should be submitted.'
        )
    prediction_file = prediction_files[0]

    with truth_file.open('rb') as truth_file_stream, prediction_file.open(
            'rb') as prediction_file_stream:
        return compute_metrics(truth_file_stream, prediction_file_stream)
def ensure_manuscript(prediction_path: pathlib.Path):
    """Validate that exactly one PDF manuscript is in the submission.

    Raises ScoreException when zero or multiple '.pdf' files are found.
    """
    manuscript_file_count = sum(
        manuscript_file.suffix.lower() == '.pdf'
        for manuscript_file in prediction_path.iterdir())
    if manuscript_file_count > 1:
        # Fixed message grammar: "must included" -> "must be included"
        raise ScoreException(
            'Multiple PDFs submitted. Exactly one PDF file, containing the descriptive manuscript, '
            'must be included in the submission.')
    elif manuscript_file_count < 1:
        raise ScoreException(
            'No PDF submitted. Exactly one PDF file, containing the descriptive manuscript, '
            'must be included in the submission.')
def extract_zip(zip_path: pathlib.Path, output_path: pathlib.Path,
                flatten: bool = True) -> None:
    """Extract a zip file, optionally flattening it into a single directory.

    When flatten is True, every regular member is written directly into
    output_path under its base name; directory structure and Mac OS X
    '__MACOSX' metadata entries are discarded. Raises ScoreException if
    the archive cannot be read.
    """
    try:
        with zipfile.ZipFile(zip_path) as zf:
            if flatten:
                for member_info in zf.infolist():
                    member_name = member_info.filename
                    if member_name.startswith('__MACOSX'):
                        # Ignore Mac OS X metadata
                        continue
                    member_base_name = os.path.basename(member_name)
                    if not member_base_name:
                        # Skip directories
                        continue
                    member_output_path = output_path / member_base_name
                    with zf.open(member_info) as input_stream, \
                            member_output_path.open('wb') as output_stream:
                        shutil.copyfileobj(input_stream, output_stream)
            else:
                zf.extractall(output_path)
    # zipfile.BadZipFile is the current spelling; 'BadZipfile' is a
    # legacy alias kept only for backward compatibility.
    except zipfile.BadZipFile as e:
        # Chain the original error so the root cause stays in tracebacks.
        raise ScoreException(
            f'Could not read ZIP file "{zip_path.name}": {str(e)}.') from e
def load_prediction_image(self) -> None:
    """Load this pair's prediction image and verify its dimensions.

    The loaded array is stored on self.prediction_image; a ScoreException
    is raised if its height/width differ from the truth image's.
    """
    self.prediction_image = load_segmentation_image(self.prediction_file)
    expected_shape = self.truth_image.shape[0:2]
    actual_shape = self.prediction_image.shape[0:2]
    if actual_shape != expected_shape:
        raise ScoreException(
            f'Image {self.prediction_file.name} has dimensions '
            f'{actual_shape}; expected {expected_shape}.'
        )
def validate_rows(truth_probabilities: pd.DataFrame,
                  prediction_probabilities: pd.DataFrame):
    """
    Ensure prediction rows correspond to truth rows.

    Fail when predictionProbabilities is missing rows or has extra rows
    compared to truthProbabilities.
    """
    truth_index = truth_probabilities.index
    prediction_index = prediction_probabilities.index

    # Images the truth expects but the prediction lacks.
    missing_images = truth_index.difference(prediction_index)
    if not missing_images.empty:
        raise ScoreException(
            f'Missing images in CSV: {missing_images.tolist()}.')

    # Images the prediction supplies that the truth does not contain.
    extra_images = prediction_index.difference(truth_index)
    if not extra_images.empty:
        raise ScoreException(f'Extra images in CSV: {extra_images.tolist()}.')
def unzip_all(
    input_path: pathlib.Path, allow_manuscript_directory: bool = False
) -> Tuple[pathlib.Path, tempfile.TemporaryDirectory]:
    """
    Extract / copy all files in directory.

    Validates that the path contains exactly one file. Optionally allow an
    'Abstract' directory to exist which contains exactly one manuscript file.
    Return a path to the extracted content.
    """
    # Partition directory entries in a single pass.
    input_files = []
    input_dirs = []
    for entry in input_path.iterdir():
        if entry.is_file():
            input_files.append(entry)
        elif entry.is_dir():
            input_dirs.append(entry)

    if len(input_files) > 1:
        raise ScoreException(
            'Multiple files submitted. Exactly one ZIP file should be submitted.'
        )
    elif len(input_files) < 1:
        raise ScoreException(
            'No files submitted. Exactly one ZIP file should be submitted.')
    input_file = input_files[0]

    manuscript_file = None
    if not allow_manuscript_directory:
        # Expect only files
        if input_dirs:
            raise ScoreException('Internal error: unexpected directory found.')
    elif len(input_dirs) > 1:
        raise ScoreException('Internal error: multiple directories found.')
    elif len(input_dirs) == 1:
        manuscript_dir = input_dirs[0]
        if manuscript_dir.name != 'Abstract':
            raise ScoreException(
                f'Internal error: unexpected directory found: {manuscript_dir.name}.'
            )
        manuscript_candidates = list(manuscript_dir.iterdir())
        if not manuscript_candidates:
            raise ScoreException('Empty manuscript directory found.')
        elif len(manuscript_candidates) > 1:
            raise ScoreException(
                'Multiple files found in manuscript directory.')
        manuscript_file = manuscript_candidates[0]

    output_temp_dir = tempfile.TemporaryDirectory()
    output_path = pathlib.Path(output_temp_dir.name)

    # ZIP archives are expanded; any other single file is copied verbatim.
    if input_file.suffix.lower() == '.zip':
        extract_zip(input_file, output_path)
    else:
        shutil.copy(input_file, output_path)

    if manuscript_file is not None:
        shutil.copy(manuscript_file, output_path)

    return output_path, output_temp_dir
def load_segmentation_image(image_path: pathlib.Path) -> np.ndarray:
    """Load a segmentation image as a NumPy array, given a file path.

    The image must be single-channel (greyscale); 1-bit images are
    converted to 8-bit greyscale first. Raises ScoreException if the
    file cannot be decoded or has the wrong mode.
    """
    try:
        # NOTE: the previous 'image: Image' annotation was incorrect
        # ('Image' names the PIL module/class, not this instance).
        image = Image.open(str(image_path))
    except Exception as e:
        raise ScoreException(
            f'Could not decode image "{image_path.name}" because: "{str(e)}"')

    # Use the image as a context manager so the underlying file handle is
    # always closed (it previously leaked until garbage collection).
    with image:
        if image.mode == '1':
            # NumPy crashes if a 1-bit (black and white) image is directly
            # coerced to an array
            image = image.convert('L')

        if image.mode != 'L':
            raise ScoreException(
                f'Image {image_path.name} is not single-channel (greyscale).')

        array = np.array(image)

    return array
def assert_binary_image(image: np.ndarray, image_path: pathlib.Path):
    """Ensure a NumPy array image is binary, correcting if possible.

    Images whose only values are 0 and 255 pass through unchanged. A
    two-valued image with a different high value is rescaled in place to
    {0, 255}. Anything else raises ScoreException. Returns the image.
    """
    image_values = set(np.unique(image))
    if image_values <= {0, 255}:
        # Expected values
        pass
    elif len(image_values) <= 2:
        # Binary image with high value other than 255 can be corrected
        high_value = (image_values - {0}).pop()
        # Use in-place floor division: '/=' (true divide) raises a
        # TypeError on integer arrays because the float result cannot be
        # cast back into the integer output array. Values are exactly
        # {0, high_value}, so floor division yields exactly {0, 1}.
        image //= high_value
        image *= 255
        if set(np.unique(image)) > {0, 255}:
            raise ScoreException(
                f'Image {image_path.name} contains values other than 0 and 255.'
            )
    else:
        raise ScoreException(
            f'Image {image_path.name} contains values other than 0 and 255.')
    return image
def parse_image_id(self):
    """Parse the image id (and optional attribute id) from the truth file name.

    Sets self.image_id (e.g. 'ISIC_0001234'); sets self.attribute_id only
    when the file name contains an 'attribute_<name>' component. Raises
    ScoreException when no ISIC image id can be found.
    """
    stem = self.truth_file.stem

    image_id_match = re.search(r'ISIC_[0-9]{7}', stem)
    if image_id_match is None:
        raise ScoreException(
            f'Internal error: unknown ground truth file: {self.truth_file.name}.'
        )
    self.image_id = image_id_match.group(0)

    attribute_id_match = re.search(r'attribute_([a-z_]+)', stem)
    if attribute_id_match is not None:
        self.attribute_id = attribute_id_match.group(1)
def load_segmentation_image(image_path: pathlib.Path) -> np.ndarray:
    """Load a segmentation image as a NumPy array, given a file path.

    Only single-channel (greyscale) images are accepted; 1-bit images
    are converted to 8-bit greyscale. Raises ScoreException when the
    file cannot be decoded or has the wrong mode.
    """
    try:
        with Image.open(image_path) as pil_image:
            # Force the pixel data to be read now; NumPy sometimes fails
            # to get the "__array_interface__" of a lazily-loaded image.
            pil_image.load()

            if pil_image.mode == '1':
                # NumPy crashes if a 1-bit (black and white) image is
                # directly coerced to an array
                pil_image = pil_image.convert('L')

            if pil_image.mode != 'L':
                raise ScoreException(
                    f'Image {image_path.name} is not single-channel (greyscale).'
                )

            np_image = np.asarray(pil_image)
    except UnidentifiedImageError:
        raise ScoreException(f'Could not decode image "{image_path.name}"')
    return np_image
def score_all(
    truth_input_path: pathlib.Path,
    prediction_input_path: pathlib.Path,
    task_num: int,
    require_manuscript: bool,
):
    """Score a submission end to end and print Covalic-format JSON.

    Extracts both inputs, optionally validates that a manuscript PDF is
    present, dispatches to the task-specific scorer, and prints the
    resulting scores as JSON. Temporary extraction directories are now
    cleaned up even when scoring raises (previously they leaked on error).
    """
    # Unzip zip files contained in the input folders
    truth_path, truth_temp_dir = unzip_all(truth_input_path)
    try:
        prediction_path, prediction_temp_dir = unzip_all(
            prediction_input_path, allow_manuscript_directory=True)
        try:
            if require_manuscript:
                ensure_manuscript(prediction_path)

            if task_num == 1:
                score = task1.score
            elif task_num == 2:
                score = task2.score
            elif task_num == 3:
                score = task3.score
            else:
                raise ScoreException(
                    f'Internal error: unknown ground truth phase number: {task_num}.')

            scores: ScoresType = score(truth_path, prediction_path)

            # Output in Covalic format
            print(
                json.dumps([{
                    'dataset': dataset,
                    'metrics': [{
                        'name': metric_name,
                        'value': metric_value
                    } for metric_name, metric_value in metrics.items()],
                } for dataset, metrics in scores.items()]))
        finally:
            prediction_temp_dir.cleanup()
    finally:
        truth_temp_dir.cleanup()
def parse_csv(csv_file_stream: TextIO, categories: pd.Index) -> pd.DataFrame:
    """Parse a probabilities CSV into a validated DataFrame.

    The result is indexed by image id (any '.jpg' suffix stripped), has
    exactly the requested category columns in the requested order, and
    contains only floating-point values in [0.0, 1.0]. Raises
    ScoreException on any parse or validation failure.
    """
    try:
        probabilities = pd.read_csv(csv_file_stream, header=0, index_col=False)
    except pd.errors.ParserError as e:
        # TODO: Test this case
        raise ScoreException(f'Could not parse CSV: "{str(e)}"')

    if 'image' not in probabilities.columns:
        raise ScoreException('Missing column in CSV: "image".')

    # Pandas represents strings as 'O' (object)
    if probabilities['image'].dtype != np.dtype('O'):
        # Coercing to 'U' (unicode) ensures that even NaN values are converted;
        # however, the resulting type is still 'O'
        probabilities['image'] = probabilities['image'].astype(np.dtype('U'))

    # 'regex=True' must be explicit: pandas >= 2.0 defaults to regex=False,
    # which would match the pattern literally and reject the 'case' keyword.
    probabilities['image'] = probabilities['image'].str.replace(
        r'\.jpg$', '', case=False, regex=True)

    if not probabilities['image'].is_unique:
        duplicate_images = probabilities['image'][
            probabilities['image'].duplicated()].unique()
        raise ScoreException(
            f'Duplicate image rows detected in CSV: {duplicate_images.tolist()}.'
        )

    # The duplicate check is the same as performed by 'verify_integrity'
    probabilities.set_index('image', drop=True, inplace=True,
                            verify_integrity=False)

    missing_columns = categories.difference(probabilities.columns)
    if not missing_columns.empty:
        raise ScoreException(
            f'Missing columns in CSV: {missing_columns.tolist()}.')

    extra_columns = probabilities.columns.difference(categories)
    if not extra_columns.empty:
        raise ScoreException(
            f'Extra columns in CSV: {extra_columns.tolist()}.')

    # sort by the order in categories
    probabilities = probabilities.reindex(categories, axis='columns')

    missing_rows = probabilities[probabilities.isnull().any(
        axis='columns')].index
    if not missing_rows.empty:
        raise ScoreException(
            f'Missing value(s) in CSV for images: {missing_rows.tolist()}.')

    non_float_columns = probabilities.dtypes[probabilities.dtypes.apply(
        lambda x: x != np.float64)].index
    if not non_float_columns.empty:
        raise ScoreException(
            f'CSV contains non-floating-point value(s) in columns: {non_float_columns.tolist()}.'
        )
    # TODO: identify specific failed rows

    # Vectorized range check; the previous per-element 'applymap' is
    # deprecated in pandas >= 2.1 and slower. All values are float and
    # non-null at this point, so elementwise comparison is equivalent.
    out_of_range_rows = probabilities[
        ((probabilities < 0.0) | (probabilities > 1.0)).any(axis='columns')
    ].index
    if not out_of_range_rows.empty:
        raise ScoreException(
            f'Values in CSV are outside the interval [0.0, 1.0] for images: '
            f'{out_of_range_rows.tolist()}.')

    # TODO: fail on extra columns in data rows
    return probabilities