def unzip_files(tests: list, masts: list) -> None:
        """
        Extract files from archives in sorted order

        Both lists are sorted in place and then paired positionally, so the
        n-th master archive is processed together with the n-th test archive.
        Each archive is extracted into the directory that contains it.

        The process exits with status 1 when an archive is empty or cannot be
        opened. A failure during extraction is logged and the loop moves on
        to the next pair.

        :param tests: List of paths to the test .tar archives
        :param masts: List of paths to the master .tar archives
        :return: None
        """
        print('Warning: decompressing files. Make sure you have the necessary '
              'disk space to complete this operation...\n')

        # Give the user a moment to abort before disk space is consumed
        time.sleep(5)

        # Make sure the lists are sorted so that master/test pairs line up
        masts.sort()

        tests.sort()

        if len(masts) != len(tests):
            # zip() below stops at the shorter list; make that visible
            logger.warning(
                "Number of master (%s) and test (%s) archives differ; "
                "unpaired archives will not be extracted.",
                len(masts), len(tests))

        for mast, test in zip(masts, tests):
            tar_mast = None

            try:
                # Check sizes before opening: tarfile.open() raises on an
                # empty file, which would hide the zero-size message.
                for archive in (mast, test):
                    size = os.path.getsize(archive)

                    logger.info("{0} is {1} MB...\n".format(
                        archive,
                        size * 0.000001))

                    if size == 0:
                        logger.critical(
                            "Archive {0} is of zero size!".format(archive))

                        sys.exit(1)

                # 'r:*' detects the compression of each archive on its own,
                # so a .tar master may be paired with a .tar.gz test.
                tar_mast = tarfile.open(mast, 'r:*')
                tar_test = tarfile.open(test, 'r:*')

            except Exception as exc:
                # Do not leak the first handle if the second open failed
                if tar_mast is not None:
                    tar_mast.close()

                logger.critical("Problem with archive file(s): %s and %s. %s",
                                mast, test, str(exc))
                sys.exit(1)

            # The with-block closes both archives even if extraction fails
            with tar_mast, tar_test:
                try:
                    # NOTE(review): extractall() trusts member paths inside
                    # the archive. If archives can come from untrusted
                    # sources, consider the `filter` argument available in
                    # newer Python versions.
                    tar_mast.extractall(path=os.path.dirname(mast))

                    tar_test.extractall(path=os.path.dirname(test))

                except Exception as exc:
                    logger.critical(
                        "Problem extracting contents from archive files:"
                        "%s and %s. %s", mast, test, str(exc))

        return None
    def find_files(target_dir: str, ext: str) -> list:
        """
        Walk a directory tree and collect files whose names end with ext

        The extension is used as an fnmatch pattern suffix ("*" + ext), so it
        may itself contain wildcards. A critical message is logged when
        nothing matches.

        :param target_dir: The full path to the target directory
        :param ext: The file type to look for
        :return: Sorted list of full paths to the matching files
        """
        pattern = "*{}".format(ext)
        matches = []

        for folder, _subdirs, names in os.walk(target_dir):
            matches.extend(
                os.path.join(folder, name)
                for name in fnmatch.filter(names, pattern))

        if not matches:
            logger.critical("No files found in dir {0}".format(target_dir))

        matches.sort()

        return matches
# Example #3
0
    def check_xml_schema(test, schema):
        """Validate an XML file against an XML schema and log the outcome.

        :param test: <str> XML metadata file to compare with schema.
        :param schema: <str> Path to XML schema file.
        :return: None
        """
        # Build the validator from the schema document first
        validator = etree.XMLSchema(etree.parse(schema))

        # Then load the document that is to be checked
        document = etree.parse(test)

        if not validator.validate(document):
            logger.critical(
                'XML file {0} is NOT valid with XML schema {1}.'.format(
                    test, schema))
            return

        logger.warning('XML file {0} is valid with XML schema {1}.'.format(
            test, schema))
    def check_images(test, mast, dir_out, ext, include_nd=False):
        """Compare the test and master images, both for their raw contents and
        geographic parameters. If differences exist, produce diff plot + CSV
        stats file.

        Pairs whose projection, geotransform or extent comparison returns
        False are skipped, as are pairs whose sub-band counts differ.

        Args:
            test: path(s) to test image(s); iterated in lockstep with mast
            mast: path(s) to master image(s)
            dir_out <str>: path to output directory
            ext <str>: file extension
            include_nd <bool>: incl. nodata values in file cmp (default=False)

        Returns:
            False when there are no files to check after cleanup, otherwise
            None.
        """
        logger.warning("Checking {0} files...".format(ext))

        # clean up non-matching files
        test, mast = Cleanup.remove_nonmatching_files(test, mast)

        # make sure there are actually files to check
        if mast is None or test is None:
            logger.error("No {0} files to check in test and/or mast directories.".format(ext))

            return False

        def diff_and_report(fn_test, fn_mast, t_arr, m_arr, nodata, **stats_kwargs):
            """Diff two band arrays and hand the result to call_stats."""
            if nodata is None or include_nd:
                # do image differencing without masking NoData
                diff = do_diff(t_arr, m_arr)

            else:
                # do image differencing with NoData masked
                diff = do_diff(t_arr, m_arr, nodata=int(nodata))

            # call stats functions to write out results/plots/etc.
            call_stats(fn_test, fn_mast, diff, fn_test, dir_out, **stats_kwargs)

        # do other comparison checks, return stats + plots if diffs exist
        for i, j in zip(test, mast):

            logger.info("Checking Test {0} against Master {1}".format(i, j))

            # Open each raster
            ds_test = RasterIO.open_raster(i)

            ds_mast = RasterIO.open_raster(j)

            # Compare various raster parameters
            status = [
                RasterCmp.compare_proj_ref(ds_test, ds_mast),
                RasterCmp.compare_geo_trans(ds_test, ds_mast),
                RasterCmp.extent_diff_cols(ds_test, ds_mast),
                RasterCmp.extent_diff_rows(ds_test, ds_mast),
            ]

            # If any above tests fail, go to next iteration
            if any(stat is False for stat in status):
                continue

            # Count number of sub-bands in the files
            d_range = Find.count(i, ds_test, j, ds_mast, ext)

            if d_range is None:
                logger.critical("Number of files different; data cannot be tested successfully.")

                continue

            # if sub-bands exist, read them one-by-one and do diffs + stats
            if d_range > 1:
                for ii in range(0, d_range):
                    if ext == ".img":
                        logger.info("Reading sub-band {0} from .img {1}...".format(ii, i))

                        # read_band_as_array is unpacked as (array, nodata)
                        # everywhere else in this function; unpack it here
                        # too so t_nd is defined for this band.
                        # NOTE(review): ii starts at 0 - confirm that
                        # read_band_as_array expects a 0-based band index.
                        ds_tband, t_nd = RasterIO.read_band_as_array(ds_test, ii)

                        ds_mband, _ = RasterIO.read_band_as_array(ds_mast, ii)

                    else:
                        logger.info("Reading .hdf/.nc SDS {0} from file {1}...".format(ii, i))

                        sds_tband = RasterIO.open_raster(RasterIO.get_sds(ds_test)[ii][0])

                        sds_mband = RasterIO.open_raster(RasterIO.get_sds(ds_mast)[ii][0])

                        ds_tband, t_nd = RasterIO.read_band_as_array(sds_tband)

                        ds_mband, _ = RasterIO.read_band_as_array(sds_mband)

                    diff_and_report(i, j, ds_tband, ds_mband, t_nd, rast_num=ii)

            else:  # else it's a singleband raster
                logger.info("Reading {0}...".format(i))

                # read in bands as array
                ds_tband, t_nd = RasterIO.read_band_as_array(ds_test)

                ds_mband, _ = RasterIO.read_band_as_array(ds_mast)

                diff_and_report(i, j, ds_tband, ds_mband, t_nd)
def qa_data(dir_mast: str,
            dir_test: str,
            dir_out: str,
            archive: bool = True,
            xml_schema: str = None,
            incl_nd: bool = False) -> None:
    """
    Function to check files and call appropriate QA module(s)

    The deepest sub-directories of the master and test trees are paired in
    sorted order. For every file extension found in a pair, the matching
    files are dispatched to the text, JPEG or geo-image checks. The process
    exits with status 1 when the two trees have a different number of
    deepest directories.

    :param dir_mast: Full path to the master directory
    :param dir_test: Full path to the test directory
    :param dir_out: Full path to the QA output directory
    :param archive: If True, will clean up existing files and extract from archives
    :param xml_schema: Full path to the XML schema file, default is None
    :param incl_nd: If True, include NoData in comparisons
    :return: None
    """
    # extensions that are compared as plain text
    text_exts = {".txt", ".xml", ".gtf", ".hdr", ".stats"}

    # start timing code
    t0 = time.time()

    # create output dir if it doesn't exist
    os.makedirs(dir_out, exist_ok=True)

    if archive:
        # do initial cleanup of input directories
        Cleanup.cleanup_files(dir_mast)

        Cleanup.cleanup_files(dir_test)

        # read in .tar.gz files
        test_files = Find.find_files(dir_test, ".tar*")

        mast_files = Find.find_files(dir_mast, ".tar*")

        # Extract files from archive
        Extract.unzip_files(test_files, mast_files)

    # find only the deepest dirs (those without sub-directories)
    test_dirs = sorted([r for r, d, f in os.walk(dir_test) if not d])

    mast_dirs = sorted([r for r, d, f in os.walk(dir_mast) if not d])

    if len(test_dirs) != len(mast_dirs):
        logger.critical(
            "Directory structure of Master differs from Test., MASTER: %s, TEST: %s",
            mast_dirs, test_dirs)

        sys.exit(1)

    for test_dir, mast_dir in zip(test_dirs, mast_dirs):
        # Find extracted files
        all_test = sorted(Find.find_files(test_dir, ".*"))

        all_mast = sorted(Find.find_files(mast_dir, ".*"))

        # Find unique file extensions
        exts = Find.get_ext(all_test, all_mast)

        for ext in exts:
            logger.info("Finding {0} files...".format(ext))

            test_f = Find.find_files(test_dir, ext)

            mast_f = Find.find_files(mast_dir, ext)

            logger.info("Performing QA on {0} files located in {1}".format(
                ext, dir_test))

            logger.info("Test files: {0}".format(test_f))

            logger.info("Mast files: {0}".format(mast_f))

            # remove any _hdf.img files found with .img files
            if ext == ".img":
                test_f = Cleanup.rm_files(test_f, "_hdf.img")

                mast_f = Cleanup.rm_files(mast_f, "_hdf.img")

            ext_lower = ext.lower()

            # if a text-based file
            if ext_lower in text_exts:
                MetadataQA.check_text_files(test_f, mast_f, ext)

                # if text-based file is xml
                if ext_lower == ".xml" and xml_schema:
                    # check_xml_schema validates one file per call, while
                    # find_files returns a list - validate each file in turn
                    # (test files first, then master files).
                    for xml_group in (test_f, mast_f):
                        for xml_file in xml_group:
                            MetadataQA.check_xml_schema(xml_file, xml_schema)

            # if non-geo image
            elif ext_lower == ".jpg":
                MetadataQA.check_jpeg_files(test_f, mast_f, dir_out)

            # if no extension
            elif len(ext) == 0:
                continue

            # else, it's probably a geo-based image
            else:
                GeoImage.check_images(test_f,
                                      mast_f,
                                      dir_out,
                                      ext,
                                      include_nd=incl_nd)

    if archive:
        # Clean up files
        Cleanup.cleanup_files(dir_mast)

        Cleanup.cleanup_files(dir_test)

    # end timing
    t1 = time.time()

    # split elapsed seconds into hours / minutes / seconds
    m, s = divmod(t1 - t0, 60)

    h, m = divmod(m, 60)

    logger.warning("Total runtime: {0}h, {1}m, {2}s.".format(
        h, round(m, 3), round(s, 3)))

    logger.warning("Done.")

    return None
    def count(fn_test, test, fn_mast, mast, ext):
        """Count number of bands inside file to decide how to iterate through
        file.

        For ".img" the raster's band count is used, for ".hdf"/".nc" the
        number of sub-datasets; any other extension is treated as having a
        single band.

        Args:
            fn_test <str>: file name of test raster.
            test <osgeo.gdal.Dataset>: test raster
            fn_mast <str>: file name of master raster.
            mast <osgeo.gdal.Dataset>: master raster
            ext <str>: file extension of raster

        Returns:
            The count shared by both rasters, or None when the test and
            master counts differ.
        """
        def count_bands(r_name, raster):
            """Count number of bands inside raster

            Args:
                r_name <str>: file name of raster
                raster <osgeo.gdal.Dataset>: raster
            """
            d_r = raster.RasterCount

            logger.info("Number of bands in {0}: {1}".format(r_name, d_r))

            return d_r

        def count_sds(r_name, raster):
            """Count number of SDS inside raster.

            Args:
                r_name <str>: file name of raster
                raster <osgeo.gdal.Dataset>: raster
            """
            d_r = len(raster.GetSubDatasets())

            logger.info("Number of SDS in {0}: {1}".format(r_name, d_r))

            return d_r

        # pick the counting strategy that fits the file type
        if ext == ".img":
            counter = count_bands

        elif ext in (".hdf", ".nc"):
            counter = count_sds

        else:
            counter = None

        if counter is None:
            # unknown types are handled as singleband rasters
            d_range_test = 1
            d_range_mast = 1

        else:
            d_range_test = counter(fn_test, test)
            d_range_mast = counter(fn_mast, mast)

        for r_name, d_r in ((fn_test, d_range_test), (fn_mast, d_range_mast)):
            if d_r == 1:
                logger.info("File {0} is a singleband raster.".format(r_name))
            else:
                logger.info("File {0} is a multiband raster.".format(r_name))

        if d_range_test != d_range_mast:
            logger.critical("Number of sub-bands inside raster do not match. "
                            "Test: {0} | Master: {1}.".format(
                                d_range_test, d_range_mast))
            return None

        return d_range_test