def post_process_for_test_mode(t):
    logger.info("Preparing track [%s] for test mode." % str(t))
    for index in t.indices:
        for type in index.types:
            if type.has_valid_document_data():
                logger.info("Reducing corpus size to 1000 documents for [%s/%s]" % (index, type))
                type.number_of_documents = 1000

                path, ext = io.splitext(type.document_archive)
                path_2, ext_2 = io.splitext(path)

                type.document_archive = "%s-1k%s%s" % (path_2, ext_2, ext)
                type.document_file = "%s-1k%s" % (path_2, ext_2)
                # we don't want to check sizes
                type.compressed_size_in_bytes = None
                type.uncompressed_size_in_bytes = None

    for challenge in t.challenges:
        for task in challenge.schedule:
            if task.warmup_iterations > 1:
                logger.info("Resetting warmup iterations to 1 for [%s]" % str(task))
                task.warmup_iterations = 1
            if task.iterations > 1:
                logger.info("Resetting measurement iterations to 1 for [%s]" % str(task))
                task.iterations = 1
            if task.warmup_time_period is not None:
                logger.info("Resetting warmup time period for [%s] to 1 second." % str(task))
                task.warmup_time_period = 1
    return t

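# Illustrative only: a minimal sketch of the "-1k" file-name rewriting above, using the standard
# library's os.path.splitext as a stand-in for Rally's io.splitext and an assumed archive name
# "documents.json.bz2" (both are assumptions, not taken from the source).
import os.path

def test_mode_names(document_archive):
    # strip the compression extension first, then the data-format extension
    path, ext = os.path.splitext(document_archive)
    path_2, ext_2 = os.path.splitext(path)
    # re-assemble with the "-1k" marker inserted before the extensions
    return "%s-1k%s%s" % (path_2, ext_2, ext), "%s-1k%s" % (path_2, ext_2)

# test_mode_names("documents.json.bz2") == ("documents-1k.json.bz2", "documents-1k.json")
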
def _create_type(self, type_spec, mapping_dir):
    docs = self._r(type_spec, "documents", mandatory=False)
    if docs:
        if io.is_archive(docs):
            document_archive = docs
            document_file = io.splitext(docs)[0]
        else:
            document_archive = None
            document_file = docs
        number_of_documents = self._r(type_spec, "document-count")
        compressed_bytes = self._r(type_spec, "compressed-bytes", mandatory=False)
        uncompressed_bytes = self._r(type_spec, "uncompressed-bytes", mandatory=False)
    else:
        document_archive = None
        document_file = None
        number_of_documents = 0
        compressed_bytes = 0
        uncompressed_bytes = 0

    mapping_file = os.path.join(mapping_dir, self._r(type_spec, "mapping"))
    with self.source(mapping_file, "rt") as f:
        mapping = json.load(f)

    return track.Type(name=self._r(type_spec, "name"),
                      mapping=mapping,
                      document_file=document_file,
                      document_archive=document_archive,
                      includes_action_and_meta_data=self._r(type_spec, "includes-action-and-meta-data",
                                                            mandatory=False, default_value=False),
                      number_of_documents=number_of_documents,
                      compressed_size_in_bytes=compressed_bytes,
                      uncompressed_size_in_bytes=uncompressed_bytes)

def _create_type(self, type_spec, mapping_dir, data_dir):
    compressed_docs = self._r(type_spec, "documents", mandatory=False)
    if compressed_docs:
        document_archive = "%s/%s" % (data_dir, compressed_docs)
        document_file = "%s/%s" % (data_dir, io.splitext(compressed_docs)[0])
    else:
        document_archive = None
        document_file = None

    return track.Type(name=self._r(type_spec, "name"),
                      mapping_file="%s/%s" % (mapping_dir, self._r(type_spec, "mapping")),
                      document_file=document_file,
                      document_archive=document_archive,
                      number_of_documents=self._r(type_spec, "document-count", mandatory=False, default_value=0),
                      compressed_size_in_bytes=self._r(type_spec, "compressed-bytes", mandatory=False),
                      uncompressed_size_in_bytes=self._r(type_spec, "uncompressed-bytes", mandatory=False))

def decompress(data_set_path, expected_size_in_bytes):
    # we assume that track data are always compressed and try to decompress them before running the benchmark
    basename, extension = io.splitext(data_set_path)
    decompressed = False
    if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
        decompressed = True
        # expected_size_in_bytes is the uncompressed size, so it also drives the progress message
        if expected_size_in_bytes:
            console.info("Decompressing track data from [%s] to [%s] (resulting size: %.2f GB) ... " %
                         (data_set_path, basename, convert.bytes_to_gb(expected_size_in_bytes)),
                         end='', flush=True, logger=logger)
        else:
            console.info("Decompressing track data from [%s] to [%s] ... " % (data_set_path, basename),
                         end='', flush=True, logger=logger)

        io.decompress(data_set_path, io.dirname(data_set_path))
        console.println("[OK]")
        extracted_bytes = os.path.getsize(basename)
        if expected_size_in_bytes is not None and extracted_bytes != expected_size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                       (basename, extracted_bytes, expected_size_in_bytes))
    return basename, decompressed

def _unzip(self, data_set_path):
    # we assume that track data are always compressed and try to unzip them before running the benchmark
    basename, extension = io.splitext(data_set_path)
    if not os.path.isfile(basename):
        logger.info("Unzipping track data from [%s] to [%s]." % (data_set_path, basename))
        io.unzip(data_set_path, io.dirname(data_set_path))
    return basename

def mapping_file_name(self, type):
    distribution_version = self._config.opts("source", "distribution.version", mandatory=False)
    if distribution_version and len(distribution_version.strip()) > 0:
        path, extension = io.splitext(type.mapping_file_name)
        return "%s-%s%s" % (path, distribution_version, extension)
    else:
        return type.mapping_file_name

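# Illustrative only: with an assumed mapping file name "mappings.json" and distribution version
# "5.0.0" (both made-up values), the version-suffixing above yields "mappings-5.0.0.json".
# os.path.splitext stands in for Rally's io.splitext here.
import os.path

path, extension = os.path.splitext("mappings.json")
assert "%s-%s%s" % (path, "5.0.0", extension) == "mappings-5.0.0.json"
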
def _create_type(self, type_spec, mapping_dir):
    compressed_docs = self._r(type_spec, "documents", mandatory=False)
    if compressed_docs:
        relative_data_dir = self.name.lower()
        document_archive = os.path.join(relative_data_dir, compressed_docs)
        document_file = os.path.join(relative_data_dir, io.splitext(compressed_docs)[0])
        number_of_documents = self._r(type_spec, "document-count")
        compressed_bytes = self._r(type_spec, "compressed-bytes", mandatory=False)
        uncompressed_bytes = self._r(type_spec, "uncompressed-bytes", mandatory=False)
    else:
        document_archive = None
        document_file = None
        number_of_documents = 0
        compressed_bytes = 0
        uncompressed_bytes = 0

    mapping_file = os.path.join(mapping_dir, self._r(type_spec, "mapping"))
    with self.source(mapping_file, "rt") as f:
        mapping = json.load(f)

    return track.Type(name=self._r(type_spec, "name"),
                      mapping=mapping,
                      document_file=document_file,
                      document_archive=document_archive,
                      includes_action_and_meta_data=self._r(type_spec, "includes-action-and-meta-data",
                                                            mandatory=False, default_value=False),
                      number_of_documents=number_of_documents,
                      compressed_size_in_bytes=compressed_bytes,
                      uncompressed_size_in_bytes=uncompressed_bytes)

def post_process_for_test_mode(t):
    logger.info("Preparing track [%s] for test mode." % str(t))
    for index in t.indices:
        for type in index.types:
            if type.has_valid_document_data():
                logger.info("Reducing corpus size to 1000 documents for [%s/%s]" % (index, type))
                type.number_of_documents = 1000

                path, ext = io.splitext(type.document_archive)
                path_2, ext_2 = io.splitext(path)

                type.document_archive = "%s-1k%s%s" % (path_2, ext_2, ext)
                type.document_file = "%s-1k%s" % (path_2, ext_2)
                # we don't want to check sizes
                type.compressed_size_in_bytes = None
                type.uncompressed_size_in_bytes = None

    for challenge in t.challenges:
        for task in challenge.schedule:
            # we need to iterate over leaf tasks and avoid iterating over possible intermediate 'parallel' elements
            for leaf_task in task:
                # iteration-based schedules are divided among all clients and we should provide
                # at least one iteration for each client.
                if leaf_task.warmup_iterations > leaf_task.clients:
                    count = leaf_task.clients
                    logger.info("Resetting warmup iterations to %d for [%s]" % (count, str(leaf_task)))
                    leaf_task.warmup_iterations = count
                if leaf_task.iterations > leaf_task.clients:
                    count = leaf_task.clients
                    logger.info("Resetting measurement iterations to %d for [%s]" % (count, str(leaf_task)))
                    leaf_task.iterations = count
                if leaf_task.warmup_time_period is not None and leaf_task.warmup_time_period > 0:
                    leaf_task.warmup_time_period = 0
                    logger.info("Resetting warmup time period for [%s] to [%d] seconds." %
                                (str(leaf_task), leaf_task.warmup_time_period))
                if leaf_task.time_period is not None and leaf_task.time_period > 10:
                    leaf_task.time_period = 10
                    logger.info("Resetting measurement time period for [%s] to [%d] seconds." %
                                (str(leaf_task), leaf_task.time_period))
    return t

def decompress(data_set_path, expected_size_in_bytes):
    # we assume that track data are always compressed and try to decompress them before running the benchmark
    basename, extension = io.splitext(data_set_path)
    if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
        logger.info("Unzipping track data from [%s] to [%s]." % (data_set_path, basename))
        io.decompress(data_set_path, io.dirname(data_set_path))
        extracted_bytes = os.path.getsize(basename)
        if extracted_bytes != expected_size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                       (basename, extracted_bytes, expected_size_in_bytes))

def decompress(data_set_path, expected_size_in_bytes):
    # we assume that track data are always compressed and try to decompress them before running the benchmark
    basename, extension = io.splitext(data_set_path)
    if not os.path.isfile(basename) or os.path.getsize(basename) != expected_size_in_bytes:
        logger.info("Unzipping track data from [%s] to [%s]." % (data_set_path, basename))
        # expected_size_in_bytes is the uncompressed size of the archive at data_set_path
        print("Decompressing %s (resulting size: %.2f GB) ... " %
              (data_set_path, convert.bytes_to_gb(expected_size_in_bytes)), end='', flush=True)
        io.decompress(data_set_path, io.dirname(data_set_path))
        print("Done")
        extracted_bytes = os.path.getsize(basename)
        if extracted_bytes != expected_size_in_bytes:
            raise exceptions.DataError("[%s] is corrupt. Extracted [%d] bytes but [%d] bytes are expected." %
                                       (basename, extracted_bytes, expected_size_in_bytes))

def _configured_plugins(self, variables=None):
    configured_plugins = []
    # each directory is a plugin, each .ini is a config (just go one level deep)
    for entry in os.listdir(self.plugins_root_path):
        plugin_path = os.path.join(self.plugins_root_path, entry)
        if os.path.isdir(plugin_path):
            for child_entry in os.listdir(plugin_path):
                if os.path.isfile(os.path.join(plugin_path, child_entry)) and io.has_extension(child_entry, ".ini"):
                    f, _ = io.splitext(child_entry)
                    plugin_name = self._file_to_plugin_name(entry)
                    config = io.basename(f)
                    configured_plugins.append(PluginDescriptor(name=plugin_name, config=config, variables=variables))
    return configured_plugins

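# For orientation, a hedged sketch (not from the source) of the on-disk layout the discovery loop
# above expects: one sub-directory per plugin under plugins_root_path, one .ini file per config.
# The directory, file and plugin names below are made-up examples; the real plugin name is whatever
# self._file_to_plugin_name derives from the directory name.
#
#   <plugins_root_path>/repository-s3/config.ini  -> PluginDescriptor(name="repository-s3", config="config")
#   <plugins_root_path>/repository-s3/secure.ini  -> PluginDescriptor(name="repository-s3", config="secure")
#   <plugins_root_path>/analysis-icu/config.ini   -> PluginDescriptor(name="analysis-icu", config="config")
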
def _configured_plugins(self):
    configured_plugins = []
    # each directory is a plugin, each .ini is a config (just go one level deep)
    for entry in os.listdir(self.plugins_root_path):
        plugin_path = os.path.join(self.plugins_root_path, entry)
        if os.path.isdir(plugin_path):
            for child_entry in os.listdir(plugin_path):
                if os.path.isfile(os.path.join(plugin_path, child_entry)) and io.has_extension(child_entry, ".ini"):
                    f, _ = io.splitext(child_entry)
                    plugin_name = self._file_to_plugin_name(entry)
                    config = io.basename(f)
                    configured_plugins.append(PluginDescriptor(name=plugin_name, config=config))
    return configured_plugins

def __init__(self, track_path):
    if not os.path.exists(track_path):
        raise exceptions.SystemSetupError("Track path %s does not exist" % track_path)

    if os.path.isdir(track_path):
        self.track_name = io.basename(track_path)
        self._track_dir = track_path
        self._track_file = os.path.join(track_path, "track.json")
        if not os.path.exists(self._track_file):
            raise exceptions.SystemSetupError("Could not find track.json in %s" % track_path)
    elif os.path.isfile(track_path):
        if io.has_extension(track_path, ".json"):
            self._track_dir = io.dirname(track_path)
            self._track_file = track_path
            self.track_name = io.splitext(io.basename(track_path))[0]
        else:
            raise exceptions.SystemSetupError("%s has to be a JSON file" % track_path)
    else:
        raise exceptions.SystemSetupError("%s is neither a file nor a directory" % track_path)

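# A small illustrative sketch (made-up paths; os.path used in place of Rally's io helpers; error
# handling and JSON-extension validation omitted) of the two inputs the constructor above accepts:
# a directory that must contain track.json, or a path to a .json track file.
import os.path

def resolve_track(track_path):
    if os.path.isdir(track_path):
        # e.g. "/tracks/geonames" -> name "geonames", file "/tracks/geonames/track.json"
        return os.path.basename(track_path), os.path.join(track_path, "track.json")
    else:
        # e.g. "/tracks/geonames/track.json" -> name "track", file as given
        return os.path.splitext(os.path.basename(track_path))[0], track_path
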
def plain_text(file):
    _, ext = io.splitext(file)
    return ext in [".ini", ".txt", ".json", ".yml", ".yaml", ".options", ".properties"]

def __is_car(path):
    _, extension = io.splitext(path)
    return extension == ".ini"

def __car_name(path):
    p, _ = io.splitext(path)
    return io.basename(p)