# Assumed shared context for the snippets below (Rally-style module layout and a
# module-level logger); the original excerpts omit their imports.
import logging
import os
import sys
import urllib.error

from esrally import exceptions
from esrally.utils import console, convert, git, io, net, versions

logger = logging.getLogger("rally.track")


def decompress_corpus(archive_path, documents_path, uncompressed_size):
    if uncompressed_size:
        console.info("Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " %
                     (archive_path, documents_path, convert.bytes_to_gb(uncompressed_size)),
                     end='', flush=True, logger=logger)
    else:
        console.info("Decompressing track data from [%s] to [%s] ... " % (archive_path, documents_path),
                     end='', flush=True, logger=logger)
    io.decompress(archive_path, io.dirname(archive_path))
    console.println("[OK]")
    if not os.path.isfile(documents_path):
        raise exceptions.DataError("Decompressing [%s] did not create [%s]. Please check with the track author if the compressed "
                                   "archive has been created correctly." % (archive_path, documents_path))

    extracted_bytes = os.path.getsize(documents_path)
    if uncompressed_size is not None and extracted_bytes != uncompressed_size:
        raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                   (documents_path, extracted_bytes, uncompressed_size))

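# A minimal usage sketch for decompress_corpus; the paths and size below are
# hypothetical, not taken from the original. Passing uncompressed_size=None
# skips the size verification:
#
#   decompress_corpus("/data/geonames/documents.json.bz2",
#                     "/data/geonames/documents.json",
#                     uncompressed_size=3_547_614_383)
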
def decompress(data_set_path, expected_size_in_bytes):
    # we assume that track data are always compressed and try to decompress them before running the benchmark
    basename, extension = io.splitext(data_set_path)
    decompressed = False

    if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
        decompressed = True
        if expected_size_in_bytes:
            console.info("Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " %
                         (data_set_path, basename, convert.bytes_to_gb(expected_size_in_bytes)),
                         end='', flush=True, logger=logger)
        else:
            console.info("Decompressing track data from [%s] to [%s] ... " % (data_set_path, basename),
                         end='', flush=True, logger=logger)
        io.decompress(data_set_path, io.dirname(data_set_path))
        console.println("[OK]")
        extracted_bytes = os.path.getsize(basename)
        if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                       (basename, extracted_bytes, expected_size_in_bytes))
    return basename, decompressed

def _update(self, distribution_version):
    try:
        if self.remote and not self.offline:
            branch = versions.best_match(git.branches(self.tracks_dir, remote=self.remote), distribution_version)
            if branch:
                logger.info("Rebasing on '%s' in '%s' for distribution version '%s'." %
                            (branch, self.tracks_dir, distribution_version))
                git.rebase(self.tracks_dir, branch=branch)
                return
            else:
                msg = "Could not find track data remotely for distribution version %s. " \
                      "Trying to find track data locally." % distribution_version
                logger.warning(msg)
        branch = versions.best_match(git.branches(self.tracks_dir, remote=False), distribution_version)
        if branch:
            logger.info("Checking out '%s' in '%s' for distribution version '%s'." %
                        (branch, self.tracks_dir, distribution_version))
            git.checkout(self.tracks_dir, branch=branch)
        else:
            raise exceptions.SystemSetupError("Cannot find track data for distribution version %s" % distribution_version)
    except exceptions.SupplyError as e:
        raise exceptions.DataError("Cannot update track data in '%s': %s" % (self.tracks_dir, e))

def create_file_offset_table(document_file_path, expected_number_of_lines):
    # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
    lines_read = io.prepare_file_offset_table(document_file_path)
    if lines_read and lines_read != expected_number_of_lines:
        io.remove_file_offset_table(document_file_path)
        raise exceptions.DataError("Data in [%s] are invalid. Expected [%d] lines but got [%d]." %
                                   (document_file_path, expected_number_of_lines, lines_read))

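# Conceptual sketch only: a file offset table records the byte offset of every
# N-th line so that readers can seek directly to a line range instead of
# scanning the whole document file. This illustrates the assumed idea behind
# io.prepare_file_offset_table; Rally's actual implementation and on-disk
# format may differ.
def build_offset_table_sketch(path, interval=1000):
    offsets = {}  # line number -> byte offset of that line
    lines = 0
    offset = 0
    with open(path, "rb") as f:
        for line in f:
            if lines % interval == 0:
                offsets[lines] = offset
            offset += len(line)
            lines += 1
    return lines, offsets
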
def _update(self, distribution_version):
    try:
        if self.remote and not self.offline:
            branch = versions.best_match(git.branches(self.tracks_dir, remote=self.remote), distribution_version)
            if branch:
                # Allow uncommitted changes iff we do not have to change the branch
                logger.info("Checking out [%s] in [%s] for distribution version [%s]." %
                            (branch, self.tracks_dir, distribution_version))
                git.checkout(self.tracks_dir, branch=branch)
                logger.info("Rebasing on [%s] in [%s] for distribution version [%s]." %
                            (branch, self.tracks_dir, distribution_version))
                try:
                    git.rebase(self.tracks_dir, branch=branch)
                except exceptions.SupplyError:
                    logger.exception("Cannot rebase due to local changes in [%s]" % self.tracks_dir)
                    console.warn("Local changes in [%s] prevent track update from remote. Please commit your changes." %
                                 self.tracks_dir)
                return
            else:
                msg = "Could not find track data remotely for distribution version [%s]. " \
                      "Trying to find track data locally." % distribution_version
                logger.warning(msg)
        branch = versions.best_match(git.branches(self.tracks_dir, remote=False), distribution_version)
        if branch:
            logger.info("Checking out [%s] in [%s] for distribution version [%s]." %
                        (branch, self.tracks_dir, distribution_version))
            git.checkout(self.tracks_dir, branch=branch)
        else:
            raise exceptions.SystemSetupError("Cannot find track data for distribution version %s" % distribution_version)
    except exceptions.SupplyError:
        tb = sys.exc_info()[2]
        raise exceptions.DataError("Cannot update track data in [%s]." % self.tracks_dir).with_traceback(tb)

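# Hedged sketch of the branch-matching idea behind versions.best_match as used
# above: prefer the most specific branch whose name is a version prefix of the
# target distribution version, falling back to "master". These are assumed
# semantics for illustration; the real implementation may differ in detail.
def best_match_sketch(branches, distribution_version):
    components = distribution_version.split(".")
    for i in range(len(components), 0, -1):
        candidate = ".".join(components[:i])
        if candidate in branches:
            return candidate
    return "master" if "master" in branches else None
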
def download_corpus(root_url, target_path, size_in_bytes, track_name, offline, test_mode):
    file_name = os.path.basename(target_path)
    if not root_url:
        raise exceptions.DataError("%s is missing and cannot be downloaded because no source URL is provided in the track." % target_path)
    if offline:
        raise exceptions.SystemSetupError("Cannot find %s. Please disable offline mode and retry." % target_path)

    data_url = "%s/%s" % (root_url, file_name)
    try:
        io.ensure_dir(os.path.dirname(target_path))
        if size_in_bytes:
            size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
            logger.info("Downloading data from [%s] (%s MB) to [%s]." % (data_url, size_in_mb, target_path))
        else:
            logger.info("Downloading data from [%s] to [%s]." % (data_url, target_path))

        # we want to have a bit more accurate download progress as these files are typically very large
        progress = net.Progress("[INFO] Downloading data for track %s" % track_name, accuracy=1)
        net.download(data_url, target_path, size_in_bytes, progress_indicator=progress)
        progress.finish()
        logger.info("Downloaded data from [%s] to [%s]." % (data_url, target_path))
    except urllib.error.HTTPError as e:
        if e.code == 404 and test_mode:
            raise exceptions.DataError("Track [%s] does not support test mode. Please ask the track author to add it or "
                                       "disable test mode and retry." % track_name)
        else:
            msg = "Could not download [%s] to [%s]" % (data_url, target_path)
            if e.reason:
                msg += " (HTTP status: %s, reason: %s)" % (str(e.code), e.reason)
            else:
                msg += " (HTTP status: %s)" % str(e.code)
            raise exceptions.DataError(msg)
    except urllib.error.URLError:
        logger.exception("Could not download [%s] to [%s]." % (data_url, target_path))
        raise exceptions.DataError("Could not download [%s] to [%s]." % (data_url, target_path))

    if not os.path.isfile(target_path):
        raise exceptions.SystemSetupError("Cannot download from %s to %s. Please verify that data are available at %s and "
                                          "check your internet connection." % (data_url, target_path, data_url))

    actual_size = os.path.getsize(target_path)
    if size_in_bytes is not None and actual_size != size_in_bytes:
        raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." %
                                   (target_path, actual_size, size_in_bytes))

def download(cfg, url, local_path, size_in_bytes):
    offline = cfg.opts("system", "offline.mode")
    file_exists = os.path.isfile(local_path)

    # ensure we only skip the download if the file size also matches our expectation
    if file_exists and (size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes):
        logger.info("[%s] already exists locally. Skipping download." % local_path)
        return False

    if not offline:
        try:
            io.ensure_dir(os.path.dirname(local_path))
            if size_in_bytes:
                size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                logger.info("Downloading data from [%s] (%s MB) to [%s]." % (url, size_in_mb, local_path))
            else:
                logger.info("Downloading data from [%s] to [%s]." % (url, local_path))
            # we want to have a bit more accurate download progress as these files are typically very large;
            # "track" comes from the enclosing scope (this function is nested inside prepare_track, see below)
            progress = net.Progress("[INFO] Downloading data for track %s" % track.name, accuracy=1)
            net.download(url, local_path, size_in_bytes, progress_indicator=progress)
            progress.finish()
            logger.info("Downloaded data from [%s] to [%s]." % (url, local_path))
        except urllib.error.URLError:
            logger.exception("Could not download [%s] to [%s]." % (url, local_path))

    # file must exist at this point -> verify
    if not os.path.isfile(local_path):
        if offline:
            raise exceptions.SystemSetupError("Cannot find %s. Please disable offline mode and retry." % local_path)
        else:
            raise exceptions.SystemSetupError("Cannot download from %s to %s. Please verify that data are available at %s and "
                                              "check your internet connection." % (url, local_path, url))

    actual_size = os.path.getsize(local_path)
    if size_in_bytes is not None and actual_size != size_in_bytes:
        raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." %
                                   (local_path, actual_size, size_in_bytes))
    return True

def download(cfg, url, local_path, size_in_bytes):
    offline = cfg.opts("system", "offline.mode")
    file_exists = os.path.isfile(local_path)

    # ensure we only skip the download if the file size also matches our expectation
    if file_exists and (size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes):
        logger.info("[%s] already exists locally. Skipping download." % local_path)
        return False

    if not offline:
        try:
            io.ensure_dir(os.path.dirname(local_path))
            if size_in_bytes:
                size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                # ensure output appears immediately
                console.info("Downloading data from [%s] (%s MB) to [%s] ... " % (url, size_in_mb, local_path),
                             end='', flush=True, logger=logger)
            else:
                console.info("Downloading data from [%s] to [%s] ... " % (url, local_path),
                             end='', flush=True, logger=logger)
            net.download(url, local_path, size_in_bytes)
            console.println("[OK]")
        except urllib.error.URLError:
            logger.exception("Could not download [%s] to [%s]." % (url, local_path))

    # file must exist at this point -> verify
    if not os.path.isfile(local_path):
        if offline:
            raise exceptions.SystemSetupError("Cannot find %s. Please disable offline mode and retry." % local_path)
        else:
            raise exceptions.SystemSetupError("Cannot download from %s to %s. Please verify that data are available at %s and "
                                              "check your internet connection." % (url, local_path, url))

    actual_size = os.path.getsize(local_path)
    if size_in_bytes is not None and actual_size != size_in_bytes:
        raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." %
                                   (local_path, actual_size, size_in_bytes))
    return True

def decompress(data_set_path, expected_size_in_bytes):
    # we assume that track data are always compressed and try to decompress them before running the benchmark
    basename, extension = io.splitext(data_set_path)
    if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
        logger.info("Decompressing track data from [%s] to [%s]." % (data_set_path, basename))
        print("Decompressing %s (resulting size: %.2f GB) ... " %
              (data_set_path, convert.bytes_to_gb(expected_size_in_bytes)), end='', flush=True)
        io.decompress(data_set_path, io.dirname(data_set_path))
        print("Done")
        extracted_bytes = os.path.getsize(basename)
        if extracted_bytes != expected_size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                       (basename, extracted_bytes, expected_size_in_bytes))

def prepare_track(track, cfg):
    """
    Ensures that all track data are available for running the benchmark.

    :param track: A track that is about to be run.
    :param cfg: The config object.
    """
    def download(cfg, url, local_path, size_in_bytes):
        offline = cfg.opts("system", "offline.mode")
        file_exists = os.path.isfile(local_path)

        # ensure we only skip the download if the file size also matches our expectation
        if file_exists and (size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes):
            logger.info("[%s] already exists locally. Skipping download." % local_path)
            return False

        if not offline:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                    logger.info("Downloading data from [%s] (%s MB) to [%s]." % (url, size_in_mb, local_path))
                else:
                    logger.info("Downloading data from [%s] to [%s]." % (url, local_path))
                # we want to have a bit more accurate download progress as these files are typically very large
                progress = net.Progress("[INFO] Downloading data for track %s" % track.name, accuracy=1)
                net.download(url, local_path, size_in_bytes, progress_indicator=progress)
                progress.finish()
                logger.info("Downloaded data from [%s] to [%s]." % (url, local_path))
            except urllib.error.URLError:
                logger.exception("Could not download [%s] to [%s]." % (url, local_path))

        # file must exist at this point -> verify
        if not os.path.isfile(local_path):
            if offline:
                raise exceptions.SystemSetupError("Cannot find %s. Please disable offline mode and retry." % local_path)
            else:
                raise exceptions.SystemSetupError("Cannot download from %s to %s. Please verify that data are available at %s and "
                                                  "check your internet connection." % (url, local_path, url))

        actual_size = os.path.getsize(local_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." %
                                       (local_path, actual_size, size_in_bytes))
        return True

    def decompress(data_set_path, expected_size_in_bytes):
        # we assume that track data are always compressed and try to decompress them before running the benchmark
        basename, extension = io.splitext(data_set_path)
        decompressed = False

        if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
            decompressed = True
            # "type" is the loop variable from the enclosing for loop below (note that it shadows the builtin)
            if type.uncompressed_size_in_bytes:
                console.info("Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " %
                             (data_set_path, basename, convert.bytes_to_gb(type.uncompressed_size_in_bytes)),
                             end='', flush=True, logger=logger)
            else:
                console.info("Decompressing track data from [%s] to [%s] ... " % (data_set_path, basename),
                             end='', flush=True, logger=logger)
            io.decompress(data_set_path, io.dirname(data_set_path))
            console.println("[OK]")
            extracted_bytes = os.path.getsize(basename)
            if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
                raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                           (basename, extracted_bytes, expected_size_in_bytes))
        return basename, decompressed

    if not track.source_root_url:
        logger.info("Track [%s] does not specify a source root URL. Assuming data are available locally." % track.name)

    data_root = cfg.opts("benchmarks", "local.dataset.cache")
    for index in track.indices:
        for type in index.types:
            if type.document_archive:
                absolute_archive_path = os.path.join(data_root, type.document_archive)
                if track.source_root_url:
                    data_url = "%s/%s" % (track.source_root_url, os.path.basename(absolute_archive_path))
                    download(cfg, data_url, absolute_archive_path, type.compressed_size_in_bytes)
                if not os.path.exists(absolute_archive_path):
                    if cfg.opts("track", "test.mode.enabled"):
                        logger.error("[%s] does not exist so assuming that track [%s] does not support test mode." %
                                     (absolute_archive_path, track))
                        raise exceptions.DataError("Track [%s] does not support test mode. Please ask the track author to add it or "
                                                   "disable test mode and retry." % track)
                    else:
                        logger.error("[%s] does not exist." % absolute_archive_path)
                        raise exceptions.DataError("Track data file [%s] is missing." % absolute_archive_path)
                decompressed_file_path, was_decompressed = decompress(absolute_archive_path, type.uncompressed_size_in_bytes)
                # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
                lines_read = io.prepare_file_offset_table(decompressed_file_path)
                if lines_read and lines_read != type.number_of_lines:
                    io.remove_file_offset_table(decompressed_file_path)
                    raise exceptions.DataError("Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]." %
                                               (decompressed_file_path, track, type.number_of_lines, lines_read))
            else:
                logger.info("Type [%s] in index [%s] does not define a document archive. No data are indexed from a file for this type." %
                            (type.name, index.name))
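
# Taken together, the preparation flow that prepare_track runs per document
# archive is (illustrative call sequence, not a public API):
#
#   1. download(cfg, "%s/%s" % (track.source_root_url, archive_name), archive_path, compressed_size)
#   2. document_file, _ = decompress(archive_path, uncompressed_size)
#   3. io.prepare_file_offset_table(document_file)  # validated against the expected line count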