Example #1
0
 def create_file_offset_table(document_file_path, expected_number_of_lines):
     # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
     lines_read = io.prepare_file_offset_table(document_file_path)
     if lines_read and lines_read != expected_number_of_lines:
         io.remove_file_offset_table(document_file_path)
         raise exceptions.DataError("Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]."
                                    % (document_file_path, track, expected_number_of_lines, lines_read))
Example #2
0
def prepare_track(track, cfg):
    """
    Ensures that all track data are available for running the benchmark.

    For every type of every index in the track this downloads (unless offline
    or already cached), decompresses and validates the document archive, and
    builds the file offset table used by the benchmark driver.

    :param track: A track that is about to be run.
    :param cfg: The config object.
    :raises exceptions.SystemSetupError: If data can neither be found locally nor downloaded.
    :raises exceptions.DataError: If data files are missing, corrupt or have an unexpected line count.
    """
    def download(cfg, url, local_path, size_in_bytes):
        # Downloads ``url`` to ``local_path`` unless a file of the expected size is
        # already present. Returns True if a download was attempted, False if the
        # cached file was reused.
        offline = cfg.opts("system", "offline.mode")
        file_exists = os.path.isfile(local_path)

        # ensure we only skip the download if the file size also matches our expectation
        if file_exists and (size_in_bytes is None
                            or os.path.getsize(local_path) == size_in_bytes):
            logger.info("[%s] already exists locally. Skipping download." %
                        local_path)
            return False

        if not offline:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    size_in_mb = round(convert.bytes_to_mb(size_in_bytes))
                    # ensure output appears immediately
                    logger.info("Downloading data from [%s] (%s MB) to [%s]." %
                                (url, size_in_mb, local_path))
                else:
                    logger.info("Downloading data from [%s] to [%s]." %
                                (url, local_path))

                # we want to have a bit more accurate download progress as these files are typically very large
                progress = net.Progress(
                    "[INFO] Downloading data for track %s" % track.name,
                    accuracy=1)
                net.download(url,
                             local_path,
                             size_in_bytes,
                             progress_indicator=progress)
                progress.finish()
                logger.info("Downloaded data from [%s] to [%s]." %
                            (url, local_path))
            except urllib.error.URLError:
                # deliberately not fatal here: the existence check below raises
                # the user-facing error if the file is still missing
                logger.exception("Could not download [%s] to [%s]." %
                                 (url, local_path))

        # file must exist at this point -> verify
        if not os.path.isfile(local_path):
            if offline:
                raise exceptions.SystemSetupError(
                    "Cannot find %s. Please disable offline mode and retry again."
                    % local_path)
            else:
                raise exceptions.SystemSetupError(
                    "Cannot download from %s to %s. Please verify that data are available at %s and "
                    "check your internet connection." % (url, local_path, url))

        actual_size = os.path.getsize(local_path)
        if size_in_bytes is not None and actual_size != size_in_bytes:
            raise exceptions.DataError(
                "[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected."
                % (local_path, actual_size, size_in_bytes))

        return True

    def decompress(data_set_path, expected_size_in_bytes):
        # Decompresses ``data_set_path`` next to the archive unless a matching
        # uncompressed file already exists. Returns (uncompressed_path, was_decompressed).
        # NOTE(review): this closure reads the loop variable ``type`` from the
        # enclosing loop below (which shadows the builtin); consider passing it
        # in explicitly.
        # we assume that track data are always compressed and try to decompress them before running the benchmark
        basename, extension = io.splitext(data_set_path)
        decompressed = False
        if not os.path.isfile(basename) or os.path.getsize(
                basename) != expected_size_in_bytes:
            decompressed = True
            if type.uncompressed_size_in_bytes:
                console.info(
                    "Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... "
                    % (data_set_path, basename,
                       convert.bytes_to_gb(type.uncompressed_size_in_bytes)),
                    end='',
                    flush=True,
                    logger=logger)
            else:
                console.info(
                    "Decompressing track data from [%s] to [%s] ... " %
                    (data_set_path, basename),
                    end='',
                    flush=True,
                    logger=logger)

            io.decompress(data_set_path, io.dirname(data_set_path))
            console.println("[OK]")
            extracted_bytes = os.path.getsize(basename)
            if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
                raise exceptions.DataError(
                    "[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected."
                    % (basename, extracted_bytes, expected_size_in_bytes))
        return basename, decompressed

    if not track.source_root_url:
        logger.info(
            "Track [%s] does not specify a source root URL. Assuming data are available locally."
            % track.name)

    data_root = cfg.opts("benchmarks", "local.dataset.cache")
    for index in track.indices:
        # NOTE(review): ``type`` shadows the builtin and is also read by the
        # ``decompress`` closure above
        for type in index.types:
            if type.document_archive:
                absolute_archive_path = os.path.join(data_root,
                                                     type.document_archive)
                if track.source_root_url:
                    data_url = "%s/%s" % (
                        track.source_root_url,
                        os.path.basename(absolute_archive_path))
                    download(cfg, data_url, absolute_archive_path,
                             type.compressed_size_in_bytes)
                # the archive must exist now - cached, downloaded, or provided locally
                if not os.path.exists(absolute_archive_path):
                    if cfg.opts("track", "test.mode.enabled"):
                        logger.error(
                            "[%s] does not exist so assuming that track [%s] does not support test mode."
                            % (absolute_archive_path, track))
                        raise exceptions.DataError(
                            "Track [%s] does not support test mode. Please ask the track author to add it or "
                            "disable test mode and retry." % track)
                    else:
                        logger.error("[%s] does not exist." %
                                     absolute_archive_path)
                        raise exceptions.DataError(
                            "Track data file [%s] is missing." %
                            absolute_archive_path)
                decompressed_file_path, was_decompressed = decompress(
                    absolute_archive_path, type.uncompressed_size_in_bytes)
                # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
                lines_read = io.prepare_file_offset_table(
                    decompressed_file_path)
                if lines_read and lines_read != type.number_of_lines:
                    # remove the stale offset table so it is rebuilt once the data file is fixed
                    io.remove_file_offset_table(decompressed_file_path)
                    raise exceptions.DataError(
                        "Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]."
                        % (decompressed_file_path, track, type.number_of_lines,
                           lines_read))
            else:
                logger.info(
                    "Type [%s] in index [%s] does not define a document archive. No data are indexed from a file for this type."
                    % (type.name, index.name))
Example #3
0
def prepare_track(track, cfg):
    """
    Ensures that all track data are available for running the benchmark.

    :param track: A track that is about to be run.
    :param cfg: The config object.
    """

    def download(cfg, url, local_path, size_in_bytes):
        # Fetches ``url`` into ``local_path`` unless a cached file of the expected
        # size is already present. Returns True if a download was attempted,
        # False if the cached file was reused.
        offline_mode = cfg.opts("system", "offline.mode")

        if os.path.isfile(local_path):
            # reuse the cached copy only if its size also matches our expectation
            if size_in_bytes is None or os.path.getsize(local_path) == size_in_bytes:
                logger.info("[%s] already exists locally. Skipping download." % local_path)
                return False

        if not offline_mode:
            try:
                io.ensure_dir(os.path.dirname(local_path))
                if size_in_bytes:
                    # ensure output appears immediately
                    logger.info("Downloading data from [%s] (%s MB) to [%s]." %
                                (url, round(convert.bytes_to_mb(size_in_bytes)), local_path))
                else:
                    logger.info("Downloading data from [%s] to [%s]." % (url, local_path))

                # use a fine-grained progress indicator as these files are typically very large
                reporter = net.Progress("[INFO] Downloading data for track %s" % track.name, accuracy=1)
                net.download(url, local_path, size_in_bytes, progress_indicator=reporter)
                reporter.finish()
                logger.info("Downloaded data from [%s] to [%s]." % (url, local_path))
            except urllib.error.URLError:
                # not fatal here; the existence check below raises the user-facing error
                logger.exception("Could not download [%s] to [%s]." % (url, local_path))

        # regardless of how we got here, the file must exist now -> verify
        if not os.path.isfile(local_path):
            if offline_mode:
                raise exceptions.SystemSetupError(
                    "Cannot find %s. Please disable offline mode and retry again." % local_path)
            raise exceptions.SystemSetupError(
                "Cannot download from %s to %s. Please verify that data are available at %s and "
                "check your internet connection." % (url, local_path, url))

        downloaded_bytes = os.path.getsize(local_path)
        if size_in_bytes is not None and downloaded_bytes != size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Downloaded [%d] bytes but [%d] bytes are expected." %
                                       (local_path, downloaded_bytes, size_in_bytes))

        return True

    def decompress(data_set_path, expected_size_in_bytes):
        # Track data are assumed to be compressed; unpack them next to the archive
        # when needed. Returns (uncompressed_path, was_decompressed).
        uncompressed_path, _extension = io.splitext(data_set_path)
        if os.path.isfile(uncompressed_path) and os.path.getsize(uncompressed_path) == expected_size_in_bytes:
            # a matching uncompressed file is already there - nothing to do
            return uncompressed_path, False

        # ``doc_type`` is the loop variable of the enclosing loop below
        if doc_type.uncompressed_size_in_bytes:
            console.info("Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " %
                         (data_set_path, uncompressed_path, convert.bytes_to_gb(doc_type.uncompressed_size_in_bytes)),
                         end='', flush=True, logger=logger)
        else:
            console.info("Decompressing track data from [%s] to [%s] ... " % (data_set_path, uncompressed_path), end='',
                         flush=True, logger=logger)

        io.decompress(data_set_path, io.dirname(data_set_path))
        console.println("[OK]")
        size_after_extraction = os.path.getsize(uncompressed_path)
        if expected_size_in_bytes is not None and size_after_extraction != expected_size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                       (uncompressed_path, size_after_extraction, expected_size_in_bytes))
        return uncompressed_path, True

    if not track.source_root_url:
        logger.info("Track [%s] does not specify a source root URL. Assuming data are available locally." % track.name)

    data_root = cfg.opts("benchmarks", "local.dataset.cache")
    for index in track.indices:
        for doc_type in index.types:
            if not doc_type.document_archive:
                logger.info("Type [%s] in index [%s] does not define a document archive. No data are indexed from a file for this type." %
                            (doc_type.name, index.name))
                continue

            archive_path = os.path.join(data_root, doc_type.document_archive)
            if track.source_root_url:
                data_url = "%s/%s" % (track.source_root_url, os.path.basename(archive_path))
                download(cfg, data_url, archive_path, doc_type.compressed_size_in_bytes)
            # the archive must exist now - cached, downloaded, or provided locally
            if not os.path.exists(archive_path):
                if cfg.opts("track", "test.mode.enabled"):
                    logger.error("[%s] does not exist so assuming that track [%s] does not support test mode." %
                                 (archive_path, track))
                    raise exceptions.DataError("Track [%s] does not support test mode. Please ask the track author to add it or "
                                               "disable test mode and retry." % track)
                logger.error("[%s] does not exist." % archive_path)
                raise exceptions.DataError("Track data file [%s] is missing." % archive_path)
            uncompressed_path, _ = decompress(archive_path, doc_type.uncompressed_size_in_bytes)
            # just rebuild the file every time for the time being. Later on, we might check the data file fingerprint to avoid it
            lines_read = io.prepare_file_offset_table(uncompressed_path)
            if lines_read and lines_read != doc_type.number_of_lines:
                io.remove_file_offset_table(uncompressed_path)
                raise exceptions.DataError("Data in [%s] for track [%s] are invalid. Expected [%d] lines but got [%d]."
                                           % (uncompressed_path, track, doc_type.number_of_lines, lines_read))