def unzip_files(tests: list, masts: list) -> None:
    """
    Extract paired master/test archives in sorted order

    Both lists are sorted in place and then walked in lockstep, so the n-th
    master archive is handled together with the n-th test archive.  Every
    archive is extracted into the directory that contains it.

    :param tests: List of paths to the test .tar archives
    :param masts: List of paths to the master .tar archives
    :return: None.  sys.exit(1) is called when an archive is empty or cannot
        be opened; a failure during extraction is logged and the loop
        moves on to the next pair.
    """
    def tar_mode(path: str) -> str:
        # Choose the read mode from the archive's own name so a gzipped
        # master paired with a plain test archive (or vice versa) still opens.
        return 'r:gz' if path.endswith('gz') else 'r'

    print('Warning: decompressing files. Make sure you have the necessary '
          'disk space to complete this operation...\n')

    # Pause so the warning above can be read before the work starts
    time.sleep(5)

    # Make sure the lists are sorted (in place, as callers may rely on it)
    masts.sort()
    tests.sort()

    # zip() stops at the shorter list; say so instead of skipping silently
    if len(masts) != len(tests):
        logger.warning(
            "Found %s master and %s test archives; unpaired archives will "
            "not be extracted.", len(masts), len(tests))

    for mast, test in zip(masts, tests):
        tar_mast = None

        try:
            mast_size = os.path.getsize(mast)
            test_size = os.path.getsize(test)

            logger.info("{0} is {1} MB...\n".format(
                mast, mast_size * 0.000001))
            logger.info("{0} is {1} MB...\n".format(
                test, test_size * 0.000001))

            # Check sizes BEFORE opening: tarfile.open() fails on an empty
            # file, which would hide the more specific message below.
            if mast_size == 0:
                logger.critical(
                    "Archive {0} is of zero size!".format(mast))
                sys.exit(1)

            if test_size == 0:
                logger.critical(
                    "Archive {0} is of zero size!".format(test))
                sys.exit(1)

            tar_mast = tarfile.open(mast, tar_mode(mast))
            tar_test = tarfile.open(test, tar_mode(test))

        except Exception as exc:
            # Do not leak the master handle if only the test archive failed
            if tar_mast is not None:
                tar_mast.close()

            logger.critical("Problem with archive file(s): %s and %s. %s",
                            mast, test, str(exc))
            sys.exit(1)

        # The with-block guarantees both handles are closed, whether or not
        # the extraction succeeds.
        with tar_mast, tar_test:
            try:
                # NOTE(review): extractall() trusts the member paths stored in
                # the archive; this assumes archives come from a trusted source.
                tar_mast.extractall(path=os.path.dirname(mast))
                tar_test.extractall(path=os.path.dirname(test))

            except Exception as exc:
                logger.critical(
                    "Problem extracting contents from archive files:"
                    "%s and %s. %s", mast, test, str(exc))

    return None
def find_files(target_dir: str, ext: str) -> list:
    """
    Walk a directory tree and collect every file whose name ends with ext

    The extension is turned into the glob pattern "*<ext>", so wildcard
    characters inside ext (for example ".tar*") are honoured.  A critical
    message is logged when nothing matches; the (empty) list is still
    returned.

    :param target_dir: The full path to the target directory
    :param ext: The file type to look for
    :return: Sorted list of the full paths that matched
    """
    pattern = "*{}".format(ext)

    matches = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(target_dir)
        for name in fnmatch.filter(names, pattern)
    ]

    if not matches:
        logger.critical("No files found in dir {0}".format(target_dir))

    matches.sort()
    return matches
def check_xml_schema(test, schema):
    """Ensure XML matches ESPA schema.

    The schema is parsed once and every XML file is validated against it.
    The outcome for each file is only logged (valid -> warning level,
    invalid -> critical level); nothing is returned or raised for an
    invalid document.

    :param test: <str or list/tuple of str> XML metadata file, or a
        collection of XML metadata files, to compare with schema.
    :param schema: <str> Path to XML schema file.
    :return: None
    """
    # read schema once; it is reused for every XML file below
    xmlschema = etree.XMLSchema(etree.parse(schema))

    # Callers hand over either one path or the list produced by a file
    # search; normalise to a sequence so both are handled the same way.
    xml_files = test if isinstance(test, (list, tuple)) else [test]

    for xml_path in xml_files:
        # read XML
        xmlfile = etree.parse(xml_path)

        # do validation
        if xmlschema.validate(xmlfile):
            logger.warning('XML file {0} is valid with XML schema {1}.'.format(
                xml_path, schema))
        else:
            logger.critical(
                'XML file {0} is NOT valid with XML schema {1}.'.format(
                    xml_path, schema))
def check_images(test, mast, dir_out, ext, include_nd=False):
    """Compare the test and master images, both for their raw contents and
    geographic parameters. The difference of each band pair is handed to
    call_stats together with dir_out for reporting.

    Args:
        test <list>: paths to the test images (paired by position with mast)
        mast <list>: paths to the master images
        dir_out <str>: path to output directory
        ext <str>: file extension
        include_nd <bool>: incl. nodata values in file cmp (default=False)

    Returns:
        False when there are no files to check; otherwise None.
    """
    def diff_bands(t_band, m_band, nodata):
        # Mask NoData only when a NoData value exists and the caller did not
        # ask for it to be included in the comparison.
        if nodata is None or include_nd:
            return do_diff(t_band, m_band)
        return do_diff(t_band, m_band, nodata=int(nodata))

    logger.warning("Checking {0} files...".format(ext))

    # clean up non-matching files
    test, mast = Cleanup.remove_nonmatching_files(test, mast)

    # make sure there are actually files to check
    if mast is None or test is None:
        logger.error("No {0} files to check in test and/or mast directories.".format(ext))
        return False

    # do other comparison checks, return stats + plots if diffs exist
    for i, j in zip(test, mast):
        logger.info("Checking Test {0} against Master {1}".format(i, j))

        # Open each raster
        ds_test = RasterIO.open_raster(i)
        ds_mast = RasterIO.open_raster(j)

        # Compare various raster parameters; every check is run (no
        # short-circuit) so each one gets the chance to report.
        status = [
            RasterCmp.compare_proj_ref(ds_test, ds_mast),
            RasterCmp.compare_geo_trans(ds_test, ds_mast),
            RasterCmp.extent_diff_cols(ds_test, ds_mast),
            RasterCmp.extent_diff_rows(ds_test, ds_mast),
        ]

        # If any above tests fail, go to next iteration
        if any(stat is False for stat in status):
            continue

        # Count number of sub-bands in the files
        d_range = Find.count(i, ds_test, j, ds_mast, ext)

        if d_range is None:
            logger.critical("Number of files different; data cannot be tested successfully.")
            continue

        # if sub-bands exist, read them one-by-one and do diffs + stats
        if d_range > 1:
            # NOTE(review): ii is zero-based here; confirm that
            # RasterIO.read_band_as_array expects a zero-based band index.
            for ii in range(0, d_range):
                if ext == ".img":
                    logger.info("Reading sub-band {0} from .img {1}...".format(ii, i))

                    # read_band_as_array yields (array, nodata) at its other
                    # call sites in this function; unpack it here as well so
                    # the NoData value below belongs to this band.
                    ds_tband, t_nd = RasterIO.read_band_as_array(ds_test, ii)
                    ds_mband, _m_nd = RasterIO.read_band_as_array(ds_mast, ii)

                else:
                    logger.info("Reading .hdf/.nc SDS {0} from file {1}...".format(ii, i))

                    sds_tband = RasterIO.open_raster(RasterIO.get_sds(ds_test)[ii][0])
                    sds_mband = RasterIO.open_raster(RasterIO.get_sds(ds_mast)[ii][0])

                    ds_tband, t_nd = RasterIO.read_band_as_array(sds_tband)
                    ds_mband, _m_nd = RasterIO.read_band_as_array(sds_mband)

                diff = diff_bands(ds_tband, ds_mband, t_nd)

                # call stats functions to write out results/plots/etc.
                call_stats(i, j, diff, i, dir_out, rast_num=ii)

        else:  # else it's a singleband raster
            logger.info("Reading {0}...".format(i))

            # read in bands as array
            ds_tband, t_nd = RasterIO.read_band_as_array(ds_test)
            ds_mband, _m_nd = RasterIO.read_band_as_array(ds_mast)

            # do diff
            diff = diff_bands(ds_tband, ds_mband, t_nd)

            # call stats functions to write out results/plots/etc.
            call_stats(i, j, diff, i, dir_out)
def qa_data(dir_mast: str, dir_test: str, dir_out: str, archive: bool = True,
            xml_schema: str = None, incl_nd: bool = False) -> None:
    """
    Function to check files and call appropriate QA module(s)

    The deepest sub-directories of the master and test trees are paired by
    sorted position; within each pair the files are grouped by extension and
    every group is routed to the matching QA check.

    :param dir_mast: Full path to the master directory
    :param dir_test: Full path to the test directory
    :param dir_out: Full path to the QA output directory
    :param archive: If True, will clean up existing files and extract from archives
    :param xml_schema: Full path to the XML schema file; XML schema
        validation is skipped when None (the default) or empty
    :param incl_nd: If True, include NoData in comparisons
    :return: None.  sys.exit(1) is called when the master and test trees do
        not contain the same number of deepest directories.
    """
    # extensions that are compared as text rather than as imagery
    text_exts = (".txt", ".xml", ".gtf", ".hdr", ".stats")

    # start timing code
    t0 = time.time()

    # create output dir if it doesn't exist
    os.makedirs(dir_out, exist_ok=True)

    if archive:
        # do initial cleanup of input directories
        Cleanup.cleanup_files(dir_mast)
        Cleanup.cleanup_files(dir_test)

        # re-create the output directory in case the cleanup removed it
        os.makedirs(dir_out, exist_ok=True)

        # read in .tar.gz files
        test_files = Find.find_files(dir_test, ".tar*")
        mast_files = Find.find_files(dir_mast, ".tar*")

        # Extract files from archive
        Extract.unzip_files(test_files, mast_files)

    # find only the deepest dirs (those without sub-directories)
    test_dirs = sorted([r for r, d, f in os.walk(dir_test) if not d])
    mast_dirs = sorted([r for r, d, f in os.walk(dir_mast) if not d])

    if len(test_dirs) != len(mast_dirs):
        logger.critical(
            "Directory structure of Master differs from Test., MASTER: %s, TEST: %s",
            mast_dirs, test_dirs)
        sys.exit(1)

    for test_dir, mast_dir in zip(test_dirs, mast_dirs):
        # Find extracted files
        all_test = sorted(Find.find_files(test_dir, ".*"))
        all_mast = sorted(Find.find_files(mast_dir, ".*"))

        # Find unique file extensions
        exts = Find.get_ext(all_test, all_mast)

        for ext in exts:
            logger.info("Finding {0} files...".format(ext))
            test_f = Find.find_files(test_dir, ext)
            mast_f = Find.find_files(mast_dir, ext)

            logger.info("Performing QA on {0} files located in {1}".format(
                ext, dir_test))
            logger.info("Test files: {0}".format(test_f))
            logger.info("Mast files: {0}".format(mast_f))

            # remove any _hdf.img files found with .img files
            if ext == ".img":
                test_f = Cleanup.rm_files(test_f, "_hdf.img")
                mast_f = Cleanup.rm_files(mast_f, "_hdf.img")

            ext_lower = ext.lower()

            # if a text-based file
            if ext_lower in text_exts:
                MetadataQA.check_text_files(test_f, mast_f, ext)

                # if text-based file is xml: the schema check parses one
                # document at a time, so validate file by file instead of
                # passing the whole list (test files first, then master).
                if ext_lower == ".xml" and xml_schema:
                    for xml_file in test_f:
                        MetadataQA.check_xml_schema(xml_file, xml_schema)
                    for xml_file in mast_f:
                        MetadataQA.check_xml_schema(xml_file, xml_schema)

            # if non-geo image
            elif ext_lower == ".jpg":
                MetadataQA.check_jpeg_files(test_f, mast_f, dir_out)

            # if no extension
            elif len(ext) == 0:
                continue

            # else, it's probably a geo-based image
            else:
                GeoImage.check_images(test_f, mast_f, dir_out, ext,
                                      include_nd=incl_nd)

    if archive:
        # Clean up files
        Cleanup.cleanup_files(dir_mast)
        Cleanup.cleanup_files(dir_test)

    # end timing
    t1 = time.time()
    m, s = divmod(t1 - t0, 60)
    h, m = divmod(m, 60)
    logger.warning("Total runtime: {0}h, {1}m, {2}s.".format(
        h, round(m, 3), round(s, 3)))

    logger.warning("Done.")

    return None
def count(fn_test, test, fn_mast, mast, ext):
    """Count number of bands inside file to decide how to iterate through file.

    ".img" rasters are counted by their RasterCount, ".hdf"/".nc" rasters by
    the number of sub-datasets; any other extension is treated as having a
    single band.

    Args:
        fn_test <str>: file name of test raster.
        test <osgeo.gdal.Dataset>: test raster
        fn_mast <str>: file name of master raster.
        mast <osgeo.gdal.Dataset>: master raster
        ext <str>: file extension of raster

    Returns:
        <int>: the band/sub-dataset count shared by both rasters, or None
            when the test and master counts differ.
    """
    def count_bands(r_name, raster):
        """Count number of bands inside raster

        Args:
            r_name <str>: file name of raster
            raster <osgeo.gdal.Dataset>: raster
        """
        d_r = raster.RasterCount
        logger.info("Number of bands in {0}: {1}".format(r_name, d_r))

        return d_r

    def count_sds(r_name, raster):
        """Count number of SDS inside raster.

        Args:
            r_name <str>: file name of raster
            raster <osgeo.gdal.Dataset>: raster
        """
        d_r = len(raster.GetSubDatasets())
        logger.info("Number of SDS in {0}: {1}".format(r_name, d_r))

        return d_r

    # pick the counting strategy that matches the container format
    if ext == ".img":
        counter = count_bands
    elif ext in (".hdf", ".nc"):
        counter = count_sds
    else:
        counter = None

    if counter is None:
        # formats without sub-bands/SDS handling are read as a single band
        d_range_test = 1
        d_range_mast = 1
    else:
        d_range_test = counter(fn_test, test)
        d_range_mast = counter(fn_mast, mast)

    # Report using the same "> 1" rule that the image comparison applies when
    # it decides between the multi-band and single-band code paths.
    for r_name, n_bands in ((fn_test, d_range_test), (fn_mast, d_range_mast)):
        if n_bands > 1:
            logger.info("File {0} is a multiband raster.".format(r_name))
        else:
            logger.info("File {0} is a singleband raster.".format(r_name))

    if d_range_test != d_range_mast:
        logger.critical("Number of sub-bands inside raster do not match. "
                        "Test: {0} | Master: {1}.".format(
                            d_range_test, d_range_mast))
        return None

    return d_range_test