def test_configuration_manager_read(self):
    configuration_manager = ConfigurationManager()
    value = configuration_manager.get_value("core", "datasets_dir")
    self.assertNotEqual(value, None)
    value = configuration_manager.get_value("invalid", "")
    self.assertEqual(value, None)
    value = configuration_manager.get_value("core", "invalid")
    self.assertEqual(value, None)
def test_should_search_in_upload_directory(self, create_mock):
    self.upload_path = Path(ConfigurationManager().get_value(
        'storage', 'path_prefix'))
    file_path = 'video'
    table_metainfo = 'info'
    batch_mem_size = 3000
    file_options = {}
    file_options['file_format'] = FileFormatType.VIDEO
    plan = type(
        "LoadDataPlan", (), {
            'table_metainfo': table_metainfo,
            'file_path': file_path,
            'batch_mem_size': batch_mem_size,
            'file_options': file_options
        })

    load_executor = LoadDataExecutor(plan)
    with patch.object(Path, 'exists') as mock_exists:
        mock_exists.side_effect = [False, True]
        batch = next(load_executor.exec())
        create_mock.assert_called_once_with(
            table_metainfo, self.upload_path / file_path)
        self.assertEqual(
            batch,
            Batch(pd.DataFrame([{
                'Video successfully added at location: ': file_path
            }])))
def __init__(self): """Initializes the engine and session for database operations Retrieves the database uri for connection from ConfigurationManager. """ uri = ConfigurationManager().get_value("core", "catalog_database_uri") # set echo=True to log SQL self.engine = create_engine(uri) # statements self.session = scoped_session(sessionmaker(bind=self.engine))
def generate_file_path(name: str = '') -> Path:
    """Generates an arbitrary file path (md5 hash) based on a random salt
    and the given name.

    Arguments:
        name (str): Input file name.

    Returns:
        Path: pathlib.Path object
    """
    dataset_location = ConfigurationManager().get_value("core",
                                                        "datasets_dir")
    if dataset_location is None:
        logger.error('Missing datasets_dir key in eva.yml')
        raise KeyError('Missing datasets_dir key in eva.yml')

    dataset_location = Path(dataset_location)
    dataset_location.mkdir(parents=True, exist_ok=True)

    salt = uuid.uuid4().hex
    file_name = hashlib.md5(salt.encode() + name.encode()).hexdigest()
    path = dataset_location / file_name
    return path.resolve()
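# Usage sketch for generate_file_path (the file name below is
# hypothetical, not from the original source): the random salt makes
# repeated calls with the same name map to different md5-named files
# under datasets_dir.
p1 = generate_file_path('my_video.mp4')
p2 = generate_file_path('my_video.mp4')
assert p1 != p2                 # different salts, different digests
assert p1.parent == p2.parent   # both resolve under datasets_dir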
def eva():
    """
    Start the eva system
    """
    # Get the hostname and port information from the configuration file
    config = ConfigurationManager()
    hostname = config.get_value('server', 'host')
    port = config.get_value('server', 'port')
    socket_timeout = config.get_value('server', 'socket_timeout')

    loop = asyncio.new_event_loop()
    stop_server_future = loop.create_future()

    # Launch server
    try:
        asyncio.run(
            start_server(host=hostname,
                         port=port,
                         loop=loop,
                         socket_timeout=socket_timeout,
                         stop_server_future=stop_server_future))
    except Exception as e:
        logger.critical(e)
def apply(self, before: LogicalLoadData, context: OptimizerContext):
    # Configure the batch_mem_size.
    # We assume the optimizer decides the batch_mem_size.
    # TODO: Experiment with heuristics.
    batch_mem_size = 30000000  # 30 MB
    config_batch_mem_size = ConfigurationManager().get_value(
        "executor", "batch_mem_size")
    if config_batch_mem_size:
        batch_mem_size = config_batch_mem_size
    after = LoadDataPlan(before.table_metainfo, before.path,
                         batch_mem_size, before.column_list,
                         before.file_options)
    return after
def apply(self, before: LogicalGet, context: OptimizerContext):
    # Configure the batch_mem_size. It decides the number of rows
    # read in a batch from the storage engine.
    # TODO: Experiment with heuristics.
    batch_mem_size = 30000000  # 30 MB
    config_batch_mem_size = ConfigurationManager().get_value(
        "executor", "batch_mem_size")
    if config_batch_mem_size:
        batch_mem_size = config_batch_mem_size
    after = SeqScanPlan(before.predicate, before.target_list, before.alias)
    after.append_child(
        StoragePlan(before.dataset_metadata, batch_mem_size=batch_mem_size))
    return after
def classify(self, frames: Tensor) -> pd.DataFrame:
    """
    Given the gpu_batch_size, we split the input tensor into chunks,
    call _get_predictions on each chunk, and merge the results.

    Arguments:
        frames (Tensor): tensor on which transformation is performed

    Returns:
        pd.DataFrame: outcome after prediction
    """
    gpu_batch_size = ConfigurationManager()\
        .get_value('executor', 'gpu_batch_size')
    if gpu_batch_size:
        chunks = torch.split(frames, gpu_batch_size)
        outcome = pd.DataFrame()
        for tensor in chunks:
            outcome = outcome.append(self._get_predictions(tensor),
                                     ignore_index=True)
        return outcome
    else:
        return self._get_predictions(frames)
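# Standalone sketch of the chunking above (shapes and gpu_batch_size
# are illustrative assumptions): torch.split chunks along dim 0, so a
# batch of 10 frames with gpu_batch_size=4 yields chunks of 4, 4, and 2.
import torch

frames = torch.zeros(10, 3, 224, 224)
chunks = torch.split(frames, 4)
print([c.shape[0] for c in chunks])  # [4, 4, 2]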
def __init__(self, *args, cur_shard=None, shard_count=None,
             predicate=None, **kwargs):
    """
    Reads data from the petastorm parquet stores. Note this won't work
    for an arbitrary parquet store, only one materialized using
    petastorm. In order to generalize, we might have to replace
    `make_reader` with `make_batch_reader`.
    https://petastorm.readthedocs.io/en/latest/api.html#module-petastorm.reader

    Attributes:
        cur_shard (int, optional): Shard number to load from if sharded
        shard_count (int, optional): Specify total number of shards if
            applicable
        predicate (PredicateBase, optional): instance of predicate object
            to filter rows to be returned by reader
        cache_type (str): the cache type, if desired. Options are
            [None, 'null', 'local-disk'] to either have a null/noop cache
            or a cache implemented using diskcache.
        cache_location (str): the location or path of the cache.
        cache_size_limit (int): the size limit of the cache in bytes
        cache_row_size_estimate (int): the estimated size of a row
    """
    self.cur_shard = cur_shard
    self.shard_count = shard_count
    self.predicate = predicate

    petastorm_config = ConfigurationManager().get_value(
        'storage', 'petastorm')
    # cache is not allowed when a predicate is supplied
    if self.predicate or petastorm_config is None:
        petastorm_config = {}

    self.cache_type = petastorm_config.get('cache_type', None)
    self.cache_location = petastorm_config.get('cache_location', None)
    self.cache_size_limit = petastorm_config.get('cache_size_limit', None)
    self.cache_row_size_estimate = petastorm_config.get(
        'cache_row_size_estimate', None)

    super().__init__(*args, **kwargs)

    # Normalize non-positive shard settings to None
    if self.cur_shard is not None and self.cur_shard <= 0:
        self.cur_shard = None
    if self.shard_count is not None and self.shard_count <= 0:
        self.shard_count = None
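# Hypothetical shape of the value returned by
# ConfigurationManager().get_value('storage', 'petastorm'), inferred
# from the keys read above; the concrete values are illustrative only.
petastorm_config = {
    'cache_type': 'local-disk',            # None, 'null', or 'local-disk'
    'cache_location': '/tmp/petastorm_cache',
    'cache_size_limit': 4 * 1024 ** 3,     # bytes
    'cache_row_size_estimate': 512,        # bytes per row
}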
def test_should_call_petastorm_make_reader_with_negative_shards(
        self, mock):
    petastorm_reader = PetastormReader(
        file_url=os.path.join(PATH_PREFIX, 'dummy.avi'),
        batch_mem_size=3000,
        cur_shard=-1,
        shard_count=-2)
    list(petastorm_reader._read())
    petastorm_config = ConfigurationManager().get_value(
        'storage', 'petastorm')
    mock.assert_called_once_with(
        os.path.join(PATH_PREFIX, 'dummy.avi'),
        shard_count=None,
        cur_shard=None,
        predicate=None,
        cache_location=petastorm_config.get('cache_location', None),
        cache_row_size_estimate=petastorm_config.get(
            'cache_row_size_estimate', None),
        cache_size_limit=petastorm_config.get('cache_size_limit', None),
        cache_type=petastorm_config.get('cache_type', None))
def load_inbuilt_udfs():
    mode = ConfigurationManager().get_value('core', 'mode')
    init_builtin_udfs(mode=mode)
import os
import shutil

import cv2
import numpy as np
import pandas as pd

from eva.models.storage.batch import Batch
from eva.models.catalog.frame_info import FrameInfo
from eva.models.catalog.properties import ColorSpace
from eva.server.command_handler import execute_query_fetch_all
from eva.udfs.abstract_udfs import AbstractClassifierUDF
from eva.udfs.udf_bootstrap_queries import init_builtin_udfs
from eva.configuration.configuration_manager import ConfigurationManager

NUM_FRAMES = 10
FRAME_SIZE = 2 * 2 * 3
CONFIG = ConfigurationManager()
PATH_PREFIX = CONFIG.get_value('storage', 'path_prefix')


def create_dataframe(num_frames=1) -> pd.DataFrame:
    frames = []
    for i in range(1, num_frames + 1):
        frames.append({"id": i, "data": (i * np.ones((1, 1)))})
    return pd.DataFrame(frames)


def create_dataframe_same(times=1):
    base_df = create_dataframe()
    for i in range(1, times):
        base_df = base_df.append(create_dataframe(), ignore_index=True)
    return base_df
def __init__(self):
    self.metadata = 'metadata'
    self.curr_version = ConfigurationManager().get_value(
        'storage', 'video_engine_version')
def main():
    mode = ConfigurationManager().get_value('core', 'mode')
    init_builtin_udfs(mode=mode)
    eva()
def setUp(self):
    self.video_engine = VideoStorageEngine
    self.table = self.create_sample_table()
    self.curr_version = ConfigurationManager().get_value(
        'storage', 'video_engine_version')
# coding=utf-8
# Copyright 2018-2020 EVA
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from eva.configuration.configuration_manager import ConfigurationManager
from eva.utils.generic_utils import str_to_class

StorageEngine = str_to_class(ConfigurationManager().get_value(
    "storage", "engine"))()

VideoStorageEngine = str_to_class(ConfigurationManager().get_value(
    "storage", "video_engine"))()
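# For context: dynamic class loading of this kind is commonly built on
# importlib. A minimal sketch of what a str_to_class helper might look
# like (an assumption; EVA's actual implementation may differ):
import importlib


def str_to_class_sketch(path: str):
    """Resolve a dotted 'module.ClassName' string to the class object."""
    module_name, class_name = path.rsplit('.', 1)
    return getattr(importlib.import_module(module_name), class_name)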
class Context:
    """
    Stores the context information of the executor: if using Spark, the
    name of the application and the current Spark executors; if using
    Horovod, the current rank, etc.
    """
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(Context, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        self._config_manager = ConfigurationManager()
        self._gpus = self._populate_gpu_ids()

    @property
    def gpus(self):
        return self._gpus

    def _possible_addresses(self) -> Set:
        host = socket.gethostname()
        result_address = {host}
        true_host, aliases, address = socket.gethostbyaddr(host)
        result_address.add(true_host)
        result_address.update(set(aliases).union(set(address)))
        return result_address

    def _populate_gpu_from_config(self) -> List:
        gpu_conf = self._config_manager.get_value('executor', 'gpus')
        gpu_conf = gpu_conf if gpu_conf else {}
        this_address = self._possible_addresses()
        intersection_addresses = this_address.intersection(gpu_conf.keys())
        if len(intersection_addresses) != 0:
            return [
                str(gpu)
                for gpu in gpu_conf.get(intersection_addresses.pop())
            ]
        return []

    def _populate_gpu_from_env(self) -> List:
        gpus = map(lambda x: x.strip(),
                   os.environ.get('GPU_DEVICES', '').strip().split(','))
        return list(filter(lambda x: x, gpus))

    def _populate_gpu_ids(self) -> List:
        if not is_gpu_available():
            return []
        gpus = self._populate_gpu_from_config()
        if len(gpus) == 0:
            gpus = self._populate_gpu_from_env()
        return gpus

    def _select_random_gpu(self) -> str:
        """
        A random GPU selection strategy

        Returns:
            (str): GPU device ID
        """
        return random.choice(self.gpus)

    def gpu_device(self) -> str:
        """
        Selects a GPU on which the task can be executed

        Returns:
            (str): GPU device ID
        """
        if self.gpus:
            # TODO: Should allow choosing GPU based on Spark and Horovod
            return self._select_random_gpu()
        return NO_GPU
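# Usage sketch for Context (illustrative; names taken from the class
# above): __new__ makes it a singleton, and gpu_device() falls back to
# NO_GPU when no GPUs are visible.
ctx1 = Context()
ctx2 = Context()
assert ctx1 is ctx2
device = ctx1.gpu_device()  # a GPU ID string such as '0', or NO_GPU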
def __init__(self, node: LoadDataPlan):
    super().__init__(node)
    config = ConfigurationManager()
    self.path_prefix = config.get_value('storage', 'path_prefix')
def __init__(self, node: LoadDataPlan):
    super().__init__(node)
    self.upload_path = Path(ConfigurationManager().get_value(
        "storage", "path_prefix"))
class Session(object):
    """
    Wrapper around Spark Session
    """
    _instance = None
    _session = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(Session, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        self._config = ConfigurationManager()
        name = self._config.get_value('core', 'application')
        self.init_spark_session(name)

    def init_spark_session(self, application_name, spark_master=None):
        """Setup a spark session.

        :param spark_master: A master parameter used by the spark session
            builder. Use the default value (None) to use the spark cluster
            configured in the system environment. Use 'local[*]' to run on
            a local box.

        :return: spark_session: A spark session
        """
        eva_spark_conf = SparkConf()
        pyspark_config = self._config.get_value('pyspark', 'property')
        for key, value in pyspark_config.items():
            eva_spark_conf.set(key, value)

        session_builder = SparkSession \
            .builder \
            .appName(application_name) \
            .config(conf=eva_spark_conf)

        if spark_master:
            session_builder.master(spark_master)

        # Gets an existing SparkSession or, if there is no existing one,
        # creates a new one based on the options set in this builder.
        self._session = session_builder.getOrCreate()

        # Configure logging
        spark_context = self._session.sparkContext
        spark_context.setLogLevel('OFF')

    def get_session(self):
        return self._session

    def get_context(self):
        return self._session.sparkContext

    def stop(self):
        self._session.stop()

    def __del__(self):
        self._session.stop()
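# Usage sketch for Session (illustrative): the singleton hands out a
# shared SparkSession configured from the 'pyspark' section of eva.yml.
session = Session()
spark = session.get_session()
df = spark.createDataFrame([(1, 'a')], ['id', 'label'])
assert df.count() == 1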