Code example #1
    def test_configuration_manager_read(self):

        configuration_manager = ConfigurationManager()

        value = configuration_manager.get_value("core", "datasets_dir")
        self.assertNotEqual(value, None)

        value = configuration_manager.get_value("invalid", "")
        self.assertEqual(value, None)

        value = configuration_manager.get_value("core", "invalid")
        self.assertEqual(value, None)
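
The test above pins down the read behavior that the remaining examples rely on: get_value(category, key) returns the configured value, and an unknown category or key yields None rather than an exception. The sketch below illustrates that contract with a hypothetical SimpleConfigurationManager backed by PyYAML and an eva.yml file; it is an assumption for illustration, not the actual EVA implementation.

import yaml


class SimpleConfigurationManager:
    """Hypothetical stand-in for ConfigurationManager (illustration only)."""

    def __init__(self, config_path='eva.yml'):
        # Load the YAML configuration once at construction time.
        with open(config_path) as f:
            self._cfg = yaml.safe_load(f) or {}

    def get_value(self, category, key):
        # An unknown category or key returns None instead of raising,
        # matching the assertions in the test above.
        section = self._cfg.get(category)
        if not isinstance(section, dict):
            return None
        return section.get(key)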
Code example #2
    def test_should_search_in_upload_directory(self, create_mock):
        self.upload_path = Path(ConfigurationManager().get_value(
            'storage', 'path_prefix'))
        file_path = 'video'
        table_metainfo = 'info'
        batch_mem_size = 3000
        file_options = {}
        file_options['file_format'] = FileFormatType.VIDEO
        plan = type(
            "LoadDataPlan", (), {
                'table_metainfo': table_metainfo,
                'file_path': file_path,
                'batch_mem_size': batch_mem_size,
                'file_options': file_options
            })

        load_executor = LoadDataExecutor(plan)
        with patch.object(Path, 'exists') as mock_exists:
            mock_exists.side_effect = [False, True]
            batch = next(load_executor.exec())
            create_mock.assert_called_once_with(table_metainfo,
                                                self.upload_path / file_path)
            self.assertEqual(
                batch,
                Batch(
                    pd.DataFrame([{
                        'Video successfully added at location: ':
                        file_path
                    }])))
Code example #3
File: sql_config.py Project: georgia-tech-db/eva
    def __init__(self):
        """Initializes the engine and session for database operations

        Retrieves the database uri for connection from ConfigurationManager.
        """
        uri = ConfigurationManager().get_value("core", "catalog_database_uri")
        # set echo=True to log SQL statements
        self.engine = create_engine(uri)
        self.session = scoped_session(sessionmaker(bind=self.engine))
Code example #4
File: generic_utils.py Project: georgia-tech-db/eva
def generate_file_path(name: str = '') -> Path:
    """Generates a arbitrary file_path(md5 hash) based on the a random salt
    and name

    Arguments:
        name (str): Input file_name.

    Returns:
        Path: pathlib.Path object

    """
    dataset_location = ConfigurationManager().get_value("core", "datasets_dir")
    if dataset_location is None:
        logger.error('Missing location key in eva.yml')
        raise KeyError('Missing datasets_dir key in eva.yml')

    dataset_location = Path(dataset_location)
    dataset_location.mkdir(parents=True, exist_ok=True)
    salt = uuid.uuid4().hex
    file_name = hashlib.md5(salt.encode() + name.encode()).hexdigest()
    path = dataset_location / file_name
    return path.resolve()
Code example #5
def eva():
    """
        Start the eva system
    """
    # Get the hostname and port information from the configuration file
    config = ConfigurationManager()
    hostname = config.get_value('server', 'host')
    port = config.get_value('server', 'port')
    socket_timeout = config.get_value('server', 'socket_timeout')
    loop = asyncio.new_event_loop()
    stop_server_future = loop.create_future()

    # Launch server
    try:
        asyncio.run(
            start_server(host=hostname,
                         port=port,
                         loop=loop,
                         socket_timeout=socket_timeout,
                         stop_server_future=stop_server_future))

    except Exception as e:
        logger.critical(e)
Code example #6
    def apply(self, before: LogicalLoadData, context: OptimizerContext):
        # Configure the batch_mem_size.
        # We assume the optimizer decides the batch_mem_size.
        # TODO: Experiment with heuristics.

        batch_mem_size = 30000000  # 30mb
        config_batch_mem_size = ConfigurationManager().get_value(
            "executor", "batch_mem_size")
        if config_batch_mem_size:
            batch_mem_size = config_batch_mem_size
        after = LoadDataPlan(before.table_metainfo, before.path,
                             batch_mem_size, before.column_list,
                             before.file_options)
        return after
Code example #7
    def apply(self, before: LogicalGet, context: OptimizerContext):
        # Configure the batch_mem_size. It decides the number of rows
        # read in a batch from the storage engine.
        # TODO: Experiment with heuristics.

        batch_mem_size = 30000000  # 30mb
        config_batch_mem_size = ConfigurationManager().get_value(
            "executor", "batch_mem_size")
        if config_batch_mem_size:
            batch_mem_size = config_batch_mem_size
        after = SeqScanPlan(before.predicate, before.target_list, before.alias)
        after.append_child(
            StoragePlan(before.dataset_metadata,
                        batch_mem_size=batch_mem_size))
        return after
Code example #8
    def classify(self, frames: Tensor) -> pd.DataFrame:
        """
        Given the gpu_batch_size, we split the input tensor into chunks,
        call _get_predictions on each chunk, and merge the results.
        Arguments:
            frames (Tensor): tensor on which transformation is performed
        Returns:
            pd.DataFrame: outcome after prediction
        """
        gpu_batch_size = ConfigurationManager()\
            .get_value('executor', 'gpu_batch_size')

        if gpu_batch_size:
            chunks = torch.split(frames, gpu_batch_size)
            outcome = pd.DataFrame()
            for tensor in chunks:
                outcome = outcome.append(self._get_predictions(tensor),
                                         ignore_index=True)
            return outcome
        else:
            return self._get_predictions(frames)
Code example #9
    def __init__(self,
                 *args,
                 cur_shard=None,
                 shard_count=None,
                 predicate=None,
                 **kwargs):
        """
        Reads data from the petastorm parquet stores. Note this won't
        work for any arbitrary parquet store apart from one materialized
        using petastorm. In order to generalize, we might have to replace
        `make_reader` with `make_batch_reader`.
        https://petastorm.readthedocs.io/en/latest/api.html#module-petastorm.reader

        Attributes:
            cur_shard (int, optional): Shard number to load from if sharded
            shard_count (int, optional): Specify total number of shards if
                                      applicable
            predicate (PredicateBase, optional): instance of predicate object
                to filter rows to be returned by reader
            cache_type (str): the cache type, if desired. Options are
                [None, 'null', 'local-disk'] to either have a null/noop
                cache or a cache implemented using diskcache.
            cache_location (int): the location or path of the cache.
            cache_size_limit (int): the size limit of the cache in bytes
            cache_row_size_estimate (int): the estimated size of a row
        """
        self.cur_shard = cur_shard
        self.shard_count = shard_count
        self.predicate = predicate
        petastorm_config = ConfigurationManager().get_value(
            'storage', 'petastorm')
        # cache not allowed with predicates
        if self.predicate or petastorm_config is None:
            petastorm_config = {}
        self.cache_type = petastorm_config.get('cache_type', None)
        self.cache_location = petastorm_config.get('cache_location', None)
        self.cache_size_limit = petastorm_config.get('cache_size_limit', None)
        self.cache_row_size_estimate = petastorm_config.get(
            'cache_row_size_estimate', None)
        super().__init__(*args, **kwargs)
        if self.cur_shard is not None and self.cur_shard <= 0:
            self.cur_shard = None

        if self.shard_count is not None and self.shard_count <= 0:
            self.shard_count = None
Code example #10
    def test_should_call_petastorm_make_reader_with_negative_shards(
            self, mock):
        petastorm_reader = PetastormReader(
            file_url=os.path.join(PATH_PREFIX, 'dummy.avi'),
            batch_mem_size=3000,
            cur_shard=-1,
            shard_count=-2)
        list(petastorm_reader._read())
        petastorm_config = ConfigurationManager().get_value(
            'storage', 'petastorm')
        mock.assert_called_once_with(
            os.path.join(PATH_PREFIX, 'dummy.avi'),
            shard_count=None,
            cur_shard=None,
            predicate=None,
            cache_location=petastorm_config.get('cache_location', None),
            cache_row_size_estimate=petastorm_config.get(
                'cache_row_size_estimate', None),
            cache_size_limit=petastorm_config.get('cache_size_limit', None),
            cache_type=petastorm_config.get('cache_type', None))
Code example #11
def load_inbuilt_udfs():
    mode = ConfigurationManager().get_value('core', 'mode')
    init_builtin_udfs(mode=mode)
Code example #12
import cv2
import os
import shutil

import numpy as np
import pandas as pd

from eva.models.storage.batch import Batch
from eva.models.catalog.frame_info import FrameInfo
from eva.models.catalog.properties import ColorSpace
from eva.server.command_handler import execute_query_fetch_all
from eva.udfs.abstract_udfs import AbstractClassifierUDF
from eva.udfs.udf_bootstrap_queries import init_builtin_udfs
from eva.configuration.configuration_manager import ConfigurationManager


NUM_FRAMES = 10
FRAME_SIZE = 2 * 2 * 3
CONFIG = ConfigurationManager()
PATH_PREFIX = CONFIG.get_value('storage', 'path_prefix')


def create_dataframe(num_frames=1) -> pd.DataFrame:
    frames = []
    for i in range(1, num_frames + 1):
        frames.append({"id": i, "data": (i * np.ones((1, 1)))})
    return pd.DataFrame(frames)


def create_dataframe_same(times=1):
    base_df = create_dataframe()
    for i in range(1, times):
        base_df = base_df.append(create_dataframe(), ignore_index=True)
    return base_df
Code example #13
    def __init__(self):
        self.metadata = 'metadata'
        self.curr_version = ConfigurationManager().get_value(
            'storage', 'video_engine_version')
Code example #14
def main():
    mode = ConfigurationManager().get_value('core', 'mode')
    init_builtin_udfs(mode=mode)
    eva()
Code example #15
    def setUp(self):
        self.video_engine = VideoStorageEngine
        self.table = self.create_sample_table()
        self.curr_version = ConfigurationManager().get_value(
            'storage', 'video_engine_version')
Code example #16
    def __init__(self):
        self._config_manager = ConfigurationManager()
        self._gpus = self._populate_gpu_ids()
Code example #17
# coding=utf-8
# Copyright 2018-2020 EVA
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from eva.configuration.configuration_manager import ConfigurationManager
from eva.utils.generic_utils import str_to_class

StorageEngine = str_to_class(ConfigurationManager().get_value(
    "storage", "engine"))()
VideoStorageEngine = str_to_class(ConfigurationManager().get_value(
    "storage", "video_engine"))()
Code example #18
class Context:
    """
    Stores the context information of the executor, i.e.,
    if using spark, name of the application, current spark executors,
    if using horovod: current rank etc.
    """

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(Context, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        self._config_manager = ConfigurationManager()
        self._gpus = self._populate_gpu_ids()

    @property
    def gpus(self):
        return self._gpus

    def _possible_addresses(self) -> Set:
        host = socket.gethostname()
        result_address = {host}
        true_host, aliases, address = socket.gethostbyaddr(host)
        result_address.add(true_host)
        result_address.update(set(aliases).union(set(address)))
        return result_address

    def _populate_gpu_from_config(self) -> List:
        gpu_conf = self._config_manager.get_value('executor', 'gpus')
        gpu_conf = gpu_conf if gpu_conf else {}
        this_address = self._possible_addresses()
        intersection_addresses = this_address.intersection(gpu_conf.keys())
        if len(intersection_addresses) != 0:
            return [
                str(gpu) for gpu in gpu_conf.get(intersection_addresses.pop())
            ]
        return []

    def _populate_gpu_from_env(self) -> List:
        gpus = map(lambda x: x.strip(),
                   os.environ.get('GPU_DEVICES', '').strip().split(','))
        return list(filter(lambda x: x, gpus))

    def _populate_gpu_ids(self) -> List:
        if not is_gpu_available():
            return []
        gpus = self._populate_gpu_from_config()
        if len(gpus) == 0:
            gpus = self._populate_gpu_from_env()
        return gpus

    def _select_random_gpu(self) -> str:
        """
        A random GPU selection strategy
        Returns:
            (str): GPU device ID
        """
        return random.choice(self.gpus)

    def gpu_device(self) -> str:
        """
        Selects a GPU on which the task can be executed
        Returns:
             (str): GPU device ID
        """
        if self.gpus:
            # TODO: Should allow choosing GPU based on Spark and Horovod
            return self._select_random_gpu()
        return NO_GPU
Code example #19
    def __init__(self, node: LoadDataPlan):
        super().__init__(node)
        config = ConfigurationManager()
        self.path_prefix = config.get_value('storage', 'path_prefix')
Code example #20
    def __init__(self, node: LoadDataPlan):
        super().__init__(node)
        self.upload_path = Path(ConfigurationManager().get_value(
            "storage", "path_prefix"))
Code example #21
class Session(object):
    """
    Wrapper around Spark Session
    """

    _instance = None
    _session = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(Session, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        self._config = ConfigurationManager()
        name = self._config.get_value('core', 'application')
        self.init_spark_session(name)

    def init_spark_session(self, application_name, spark_master=None):
        """Setup a spark session.

        :param spark_master: A master parameter used by spark session builder.
          Use default value (None) to use system
          environment configured spark cluster.
          Use 'local[*]' to run on a local box.

        :return: spark_session: A spark session
        """

        eva_spark_conf = SparkConf()
        pyspark_config = self._config.get_value('pyspark', 'property')
        for key, value in pyspark_config.items():
            eva_spark_conf.set(key, value)

        session_builder = SparkSession \
            .builder \
            .appName(application_name) \
            .config(conf=eva_spark_conf)

        if spark_master:
            session_builder.master(spark_master)

        # Gets an existing SparkSession or,
        # if there is no existing one, creates a new one based
        # on the options set in this builder.
        self._session = session_builder.getOrCreate()

        # Configure logging
        spark_context = self._session.sparkContext
        spark_context.setLogLevel('OFF')

    def get_session(self):
        return self._session

    def get_context(self):
        return self._session.sparkContext

    def stop(self):
        self._session.stop()

    def __del__(self):
        self._session.stop()
Code example #22
    def __init__(self):
        self._config = ConfigurationManager()
        name = self._config.get_value('core', 'application')
        self.init_spark_session(name)