Beispiel #1
0
    def __init__(self, object_path, object_store=None, annotations=None):
        """Create the annotation set and bind it to a persistent resource.

        Without an initial dictionary, annotations are read from the object at
        the given path (if that object exists). With an initial dictionary,
        the object must not exist yet; the given annotations are added to the
        set and written to the object store once at the end.

        Parameters
        ----------
        object_path: string
            Path to resource
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store to materialize annotations. A DefaultObjectStore is
            created if no store is given.
        annotations: dict, optional
            Dictionary with initial set of annotations. A list value adds one
            annotation per list element under the same key.

        Raises
        ------
        ValueError
            If initial annotations are given but the resource already exists.
        """
        store = DefaultObjectStore() if object_store is None else object_store
        if annotations is None:
            # No initial values: start from whatever has been persisted
            persisted = dict()
            if store.exists(object_path):
                for entry in store.read_object(object_path):
                    persisted[entry['key']] = entry['value']
            super(PersistentAnnotationSet, self).__init__(
                elements=persisted,
                writer=PersistentAnnotationStore(
                    object_path=object_path,
                    object_store=store
                )
            )
            return
        # A persistent set can only be initialized from a dictionary once
        if store.exists(object_path):
            raise ValueError('cannot initialize existing annotation set')
        super(PersistentAnnotationSet, self).__init__(
            writer=PersistentAnnotationStore(
                object_path=object_path,
                object_store=store
            )
        )
        for key in annotations:
            given = annotations[key]
            # Treat scalar values like a list with a single element
            values = given if isinstance(given, list) else [given]
            for val in values:
                # Defer persistence; everything is written in one go below
                self.add(key, val, persist=False)
        self.writer.store(self.elements)
    def __init__(
        self,
        viztrails: ViztrailRepository,
        container_file: str,
        config: AppConfig,
        datastores: DatastoreFactory,
        filestores: FilestoreFactory,
    ):
        """Set up the project cache and create a project handle for every
        viztrail in the given repository. Handles are kept in a dictionary
        that is keyed by the viztrail identifier.

        Parameters
        ----------
        viztrails: vizier.vizual.repository.ViztrailRepository
            Repository for viztrails
        container_file: string
            Path to the container information file
        config: vizier.config.app.AppConfig
            Application object
        datastores: DatastoreFactory
            Factory that is asked for the datastore of each project
        filestores: FilestoreFactory
            Factory that is asked for the filestore of each project

        Raises
        ------
        KeyError
            If a viztrail in the repository has no entry in the container
            information file.
        """
        self.viztrails = viztrails
        self.container_file = container_file
        self.config = config
        container_config = config.engine.backend.container
        self.container_image = container_config.image
        # Port numbers for the project containers
        self.ports = container_config.ports
        # The Docker client is configured from the default socket or the
        # environment. This may need to be adjusted for production
        # deployments.
        self.client = docker.from_env()
        # Index the container records by the identifier of their project
        self.store = DefaultObjectStore()
        containers = dict()
        if self.store.exists(self.container_file):
            records = cast(List[Dict[str, Any]],
                           self.store.read_object(self.container_file))
            containers = {record['projectId']: record for record in records}
        # Build one project handle per existing viztrail
        self.projects = dict()
        self.datastores = datastores
        self.filestores = filestores
        for viztrail in self.viztrails.list_viztrails():
            project_id = viztrail.identifier
            info = containers[project_id]
            self.projects[project_id] = ContainerProjectHandle(
                viztrail=viztrail,
                container_api=info['url'],
                port=info['port'],
                container_id=info['containerId'],
                datastore=self.datastores.get_datastore(project_id),
                filestore=self.filestores.get_filestore(project_id))
    def __init__(self,
                 base_path: str,
                 object_store: Optional[ObjectStore] = None):
        """Initialize the repository for a given base directory and load all
        viztrails that are listed in the viztrails index.

        The base directory and the viztrails index object are created if they
        do not exist.

        Parameters
        ---------
        base_path: string
            Path to the base directory for viztrail resources
        object_store: vizier.core.io.base.ObjectStore, optional
            Store for objects that represent viztrail resources. A
            DefaultObjectStore is created if no store is given.

        Raises
        ------
        ValueError
            If the base path is None.
        """
        if base_path is None:
            raise ValueError('missing path for base directory')
        self.base_path = base_path
        # Make sure that the base directory exists
        if not os.path.isdir(self.base_path):
            os.makedirs(self.base_path)
        self.object_store: ObjectStore = (
            object_store if object_store is not None else DefaultObjectStore()
        )
        store = self.object_store
        # Path of the viztrails index; start with an empty index if there is
        # none yet
        self.viztrails_index = store.join(self.base_path, OBJ_VIZTRAILINDEX)
        if not store.exists(self.viztrails_index):
            store.create_object(parent_folder=self.base_path,
                                identifier=OBJ_VIZTRAILINDEX,
                                content=list())
        # Load a handle for every identifier that is listed in the index
        self.viztrails: Dict[str, OSViztrailHandle] = dict()
        index = cast(Dict[str, Any], store.read_object(self.viztrails_index))
        for vt_id in index:
            handle = OSViztrailHandle.load_viztrail(
                base_path=store.join(self.base_path, vt_id),
                object_store=store)
            # The identifier came from the repository index, so the viztrail
            # is expected to exist.
            assert handle is not None
            self.viztrails[handle.identifier] = handle
Beispiel #4
0
 def __init__(self,
         identifier: str,
         is_default: bool,
         base_path: str,
         modules_folder: str,
         provenance: BranchProvenance,
         properties: ObjectAnnotationSet,
         workflows: Optional[List[WorkflowDescriptor]] = None,
         head: Optional[WorkflowHandle] = None,
         object_store: Optional[ObjectStore] = None,
         cache_size: int = DEFAULT_CACHE_SIZE
 ):
     """Initialize the branch handle.

     Parameters
     ----------
     identifier: string
         Unique branch identifier
     is_default: bool
         Flag that is stored as the is_default attribute of the handle
     base_path: string
         Path that is stored as the base path of the branch
     modules_folder: string
         Path that is stored as the modules folder of the branch
     provenance: BranchProvenance
         Branch provenance, passed on to the super class
     properties: ObjectAnnotationSet
         Branch properties, passed on to the super class
     workflows: list(WorkflowDescriptor), optional
         Workflow descriptors for the branch. Defaults to None so that each
         handle gets its own list; a list() default in the signature would
         be created only once and could be shared by all handles.
     head: WorkflowHandle, optional
         Workflow handle that is stored as the branch head
     object_store: ObjectStore, optional
         Object store for branch resources
     cache_size: int, optional
         Value stored as the cache size. DEFAULT_CACHE_SIZE is used if the
         given value is None.
     """
     super(OSBranchHandle, self).__init__(
         identifier=identifier,
         properties=properties,
         provenance=provenance
     )
     self.is_default = is_default
     self.base_path = base_path
     self.modules_folder = modules_folder
     self.object_store = init_value(object_store, DefaultObjectStore())
     # The fallback list is created per call, never shared between handles
     self.workflows = init_value(workflows, list())
     self.head = head
     self.cache_size = cache_size if cache_size is not None else DEFAULT_CACHE_SIZE
     self.cache: List[WorkflowHandle] = list()
    def __init__(self,
                 object_path: str,
                 object_store: Optional[ObjectStore] = None,
                 properties: Optional[Dict[str, Any]] = None):
        """Create the property set and bind it to a persistent resource.

        Without an initial dictionary, properties are read from the object at
        the given path (if that object exists). With an initial dictionary,
        the object must not exist yet; the given properties are added to the
        set and written to the object store once. In both cases the list
        content of the object is finally initialized with one
        {'key': ..., 'value': ...} dictionary per element.

        Parameters
        ----------
        object_path: string
            Path to resource
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store to materialize properties. A DefaultObjectStore is
            created if no store is given.
        properties: dict, optional
            Dictionary with initial set of properties. A list value adds one
            entry per list element under the same key.

        Raises
        ------
        ValueError
            If initial properties are given but the resource already exists.
        """
        super().__init__()
        # Ensure that the object store is not None
        store = object_store if object_store is not None else DefaultObjectStore()
        if properties is None:
            # No initial values: start from whatever has been persisted
            loaded = dict()
            if store.exists(object_path):
                stored = cast(List[Dict[str, Any]],
                              store.read_object(object_path))
                loaded = {anno['key']: anno['value'] for anno in stored}
            super(PersistentAnnotationSet, self).__init__(
                elements=loaded,
                writer=PersistentAnnotationStore(object_path=object_path,
                                                 object_store=store))
        else:
            # A persistent set can only be initialized from a dictionary once
            if store.exists(object_path):
                raise ValueError('cannot initialize existing annotation set')
            super(PersistentAnnotationSet, self).__init__(
                writer=PersistentAnnotationStore(object_path=object_path,
                                                 object_store=store))
            for key in properties:
                given = properties[key]
                # Remove any existing entry for the key before adding values
                self.delete(key, persist=False)
                values = given if isinstance(given, list) else [given]
                for val in values:
                    # Defer persistence; everything is written once below
                    self.add(key, val, persist=False)
            cast(AnnotationStore, self.writer).store(self.elements)
        entries = [{'key': k, 'value': v} for k, v in self.elements.items()]
        list.__init__(self, entries)
Beispiel #6
0
    def __init__(self, object_path, object_store=None):
        """Remember the path of the resource and the object store that is
        used to access it.

        Parameters
        ----------
        object_path: string
            Path to the resource
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store to materialize annotations. A DefaultObjectStore
            instance is passed to init_value as the fallback.
        """
        fallback_store = DefaultObjectStore()
        self.object_path = object_path
        self.object_store = init_value(object_store, fallback_store)
    def __init__(self,
                 identifier: str,
                 properties: PersistentAnnotationSet,
                 base_path: str,
                 branches: List[BranchHandle],
                 default_branch: Optional[BranchHandle],
                 object_store: Optional[ObjectStore] = None,
                 created_at: Optional[datetime] = None,
                 branch_index: Optional[str] = None,
                 branch_folder: Optional[str] = None,
                 modules_folder: Optional[str] = None):
        """Initialize the viztrail descriptor.

        The defaults for object_store and created_at are resolved inside the
        constructor. Default values in a signature are evaluated only once,
        when the function is defined; a get_current_time() default would
        therefore give every viztrail the same (definition-time) timestamp.

        Parameters
        ----------
        identifier : string
            Unique viztrail identifier
        properties: dict(string, any)
            Dictionary of user-defined properties
        base_path: string
            Identifier for folder containing viztrail resources
        branches: list(vizier.viztrail.branch.BranchHandle)
            List of branches in the viztrail
        default_branch: vizier.viztrail.branch.BranchHandle
            Default branch for the viztrail
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources. A
            DefaultObjectStore is created if no store is given.
        created_at : datetime.datetime, optional
            Timestamp of project creation (UTC). The result of
            get_current_time() at construction time is used if omitted.
        branch_index: string, optional
            Path to branch index list
        branch_folder: string, optional
            Path to branches folder
        modules_folder: string, optional
            Path to modules folder
        """
        # Resolve per-call defaults (see docstring)
        if object_store is None:
            object_store = DefaultObjectStore()
        if created_at is None:
            created_at = get_current_time()
        super(OSViztrailHandle, self).__init__(identifier=identifier,
                                               properties=properties,
                                               branches=branches,
                                               default_branch=default_branch,
                                               created_at=created_at)
        # Initialize the object store and identifier for all subfolders.
        self.base_path = base_path
        self.object_store = object_store
        self.branch_folder = init_value(
            branch_folder, self.object_store.join(base_path, FOLDER_BRANCHES))
        # The default branch index is located inside the branch folder
        self.branch_index = init_value(
            branch_index,
            self.object_store.join(self.branch_folder, OBJ_BRANCHINDEX))
        self.modules_folder = init_value(
            modules_folder, self.object_store.join(base_path, FOLDER_MODULES))
Beispiel #8
0
    def __init__(self,
                 identifier,
                 command,
                 external_form,
                 module_path,
                 state=None,
                 timestamp=None,
                 datasets=None,
                 outputs=None,
                 provenance=None,
                 object_store=None):
        """Initialize the module handle.

        Parameters
        ----------
        identifier : string
            Unique module identifier
        command : vizier.viztrail.command.ModuleCommand
            Specification of the module (i.e., package, name, and arguments)
        external_form: string
            Printable representation of module command
        module_path: string
            Path to module resource in object store
        state: int, optional
            Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS).
            MODULE_PENDING is used if no state is given.
        timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp, optional
            Module timestamp
        datasets : dict(vizier.datastore.dataset.DatasetDescriptor), optional
            Dictionary of resulting datasets. Dataset descriptors are keyed by
            the user-specified dataset name.
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Module output streams STDOUT and STDERR
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and written
            by previous execution of the module.
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources. A
            DefaultObjectStore is created if no store is given.
        """
        # Modules without an explicit state start out as pending
        if state is None:
            state = mstate.MODULE_PENDING
        super(OSModuleHandle, self).__init__(
            identifier=identifier,
            command=command,
            external_form=external_form,
            state=state,
            timestamp=timestamp,
            datasets=datasets,
            outputs=outputs,
            provenance=provenance)
        self.module_path = module_path
        if object_store is None:
            object_store = DefaultObjectStore()
        self.object_store = object_store
Beispiel #9
0
    def __init__(self,
                 identifier,
                 is_default,
                 base_path,
                 modules_folder,
                 provenance,
                 properties,
                 workflows=None,
                 head=None,
                 object_store=None,
                 cache_size=None):
        """Initialize the branch handle.

        Parameters
        ----------
        identifier: string
            Unique branch identifier
        is_default: bool
            True if this is the default branch for its viztrail
        base_path: string
            Path to branch resources folder
        modules_folder: string
            Path to module resources folder
        provenance: vizier.viztrail.branch.BranchProvenance
            Branch provenance information
        properties: vizier.core.annotation.base.ObjectAnnotationSet
            Branch property set
        workflows: list(vizier.viztrail.workflow.WorkflowDescriptor), optional
            List of descriptors for workflows in branch history
        head: vizier.viztrail.workflow.WorkflowHandle, optional
            Workflow at the head of the branch
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources
        cache_size: int, optional
            Value stored as the cache size. DEFAULT_CACHE_SIZE is used if no
            value is given.
        """
        super(OSBranchHandle, self).__init__(
            identifier=identifier,
            properties=properties,
            provenance=provenance
        )
        self.is_default = is_default
        self.base_path = base_path
        self.modules_folder = modules_folder
        self.object_store = init_value(object_store, DefaultObjectStore())
        self.workflows = init_value(workflows, list())
        self.head = head
        if cache_size is None:
            self.cache_size = DEFAULT_CACHE_SIZE
        else:
            self.cache_size = cache_size
        # The cache starts out empty
        self.cache = list()
 def test_create_cache(self):
     """Test that the project cache is populated from the viztrails in a
     repository and the matching entries in the container information file.
     """
     repo = OSViztrailRepository(base_path=VIZTRAILS_DIR)
     vt1 = repo.create_viztrail(properties={PROPERTY_NAME: 'My Project'})
     vt2 = repo.create_viztrail(properties={PROPERTY_NAME: 'A Project'})
     # Write one container record per viztrail
     filename = os.path.join(SERVER_DIR, 'container.json')
     container_records = [
         {
             'projectId': vt1.identifier,
             'url': 'API1',
             'port': 80,
             'containerId': 'ID1'
         },
         {
             'projectId': vt2.identifier,
             'url': 'API2',
             'port': 81,
             'containerId': 'ID2'
         },
     ]
     DefaultObjectStore().write_object(object_path=filename,
                                       content=container_records)
     # Initialize the project cache from a freshly loaded repository
     repo = OSViztrailRepository(base_path=VIZTRAILS_DIR)
     filestores_dir = os.path.join(SERVER_DIR, DEFAULT_FILESTORES_DIR)
     datastores_dir = os.path.join(SERVER_DIR, DEFAULT_DATASTORES_DIR)
     projects = ContainerProjectCache(
         viztrails=repo,
         container_file=filename,
         config=AppConfig(),
         datastores=MimirDatastoreFactory(datastores_dir),
         filestores=FileSystemFilestoreFactory(filestores_dir))
     self.assertEqual(len(projects.list_projects()), 2)
     # Each project handle has to carry the values from its container record
     expectations = [
         (vt1, 'container_api', 'API1'),
         (vt2, 'container_api', 'API2'),
         (vt1, 'container_id', 'ID1'),
         (vt2, 'container_id', 'ID2'),
         (vt1, 'port', 80),
         (vt2, 'port', 81),
     ]
     for viztrail, attribute, expected in expectations:
         project = projects.get_project(viztrail.identifier)
         self.assertEqual(getattr(project, attribute), expected)
Beispiel #11
0
 def test_create_file_repeat(self):
     """Test creating objects with an identifier factory that does not
     always return a unique identifier.

     With a factory limited to MAX_ATTEMPS - 1 attempts two objects with
     different identifiers are expected. With MAX_ATTEMPS + 1 attempts the
     second create call is expected to raise a RuntimeError.
     """
     factory = IdFactory(max_attempts=MAX_ATTEMPS - 1)
     store = DefaultObjectStore(identifier_factory=factory)
     first_id = store.create_object(BASE_DIRECTORY)
     second_id = store.create_object(BASE_DIRECTORY)
     self.assertNotEqual(first_id, second_id)
     for object_id in (first_id, second_id):
         self.assertTrue(store.exists(store.join(BASE_DIRECTORY, object_id)))
     # Clean up before the second scenario
     for object_id in (first_id, second_id):
         store.delete_object(store.join(BASE_DIRECTORY, object_id))
     factory = IdFactory(max_attempts=MAX_ATTEMPS + 1)
     store = DefaultObjectStore(identifier_factory=factory)
     store.create_object(BASE_DIRECTORY)
     with self.assertRaises(RuntimeError):
         store.create_object(BASE_DIRECTORY)
Beispiel #12
0
class OSViztrailRepository(ViztrailRepository):
    """Viztrail repository that keeps all managed resources as objects in an
    object store. The base path is the identifier prefix for all managed
    resources.

    By default all resources are maintained as directories and files on the
    local file system. The viztrails index is a list object that contains the
    identifiers of active viztrails.

    Folders and Resources
    ---------------------
    viztrails        : List of active viztrails
    <vt-identifier>/ : Folder with resources for individual viztrail
    """
    def __init__(self, base_path, object_store=None):
        """Initialize the repository for a given base directory and load all
        viztrails that are listed in the viztrails index.

        The base directory and the viztrails index object are created if they
        do not exist.

        Parameters
        ---------
        base_path: string
            Path to the base directory for viztrail resources
        object_store: vizier.core.io.base.ObjectStore, optional
            Store for objects that represent viztrail resources. A
            DefaultObjectStore is created if no store is given.

        Raises
        ------
        ValueError
            If the base path is None.
        """
        if base_path is None:
            raise ValueError('missing path for base directory')
        self.base_path = base_path
        # Make sure that the base directory exists
        if not os.path.isdir(self.base_path):
            os.makedirs(self.base_path)
        self.object_store = (
            DefaultObjectStore() if object_store is None else object_store
        )
        store = self.object_store
        # Path of the viztrails index; start with an empty index if there is
        # none yet
        self.viztrails_index = store.join(self.base_path, OBJ_VIZTRAILINDEX)
        if not store.exists(self.viztrails_index):
            store.create_object(parent_folder=self.base_path,
                                identifier=OBJ_VIZTRAILINDEX,
                                content=list())
        # Load a handle for every identifier that is listed in the index
        self.viztrails = dict()
        for vt_id in store.read_object(self.viztrails_index):
            handle = OSViztrailHandle.load_viztrail(
                base_path=store.join(self.base_path, vt_id),
                object_store=store)
            self.viztrails[handle.identifier] = handle

    def create_viztrail(self, properties=None):
        """Create a new viztrail. The initial set of properties is an optional
        dictionary of (key,value)-pairs where all values are expected to either
        be scalar values or a list of scalar values.

        Parameters
        ----------
        properties: dict, optional
            Set of properties for the new viztrail

        Returns
        -------
        vizier.viztrail.objectstore.viztrail.OSViztrailHandle
        """
        # The object store creates the viztrail folder and returns its
        # identifier
        vt_id = self.object_store.create_folder(parent_folder=self.base_path)
        # Create the materialized viztrail resource inside the new folder
        handle = OSViztrailHandle.create_viztrail(
            identifier=vt_id,
            properties=properties,
            base_path=self.object_store.join(self.base_path, vt_id),
            object_store=self.object_store)
        # Register the handle and persist the updated index before returning
        self.viztrails[handle.identifier] = handle
        self.object_store.write_object(object_path=self.viztrails_index,
                                       content=list(self.viztrails))
        return handle

    def delete_viztrail(self, viztrail_id):
        """Delete the viztrail with given identifier. The result is True if a
        viztrail with the given identifier existed, False otherwise.

        Parameters
        ----------
        viztrail_id : string
            Unique viztrail identifier

        Returns
        -------
        bool
        """
        if viztrail_id not in self.viztrails:
            return False
        # Let the viztrail handle delete its own resources first
        self.viztrails[viztrail_id].delete_viztrail()
        # Drop the handle from the internal cache and persist the updated
        # viztrails index
        del self.viztrails[viztrail_id]
        self.object_store.write_object(object_path=self.viztrails_index,
                                       content=list(self.viztrails))
        return True

    def get_viztrail(self, viztrail_id):
        """Retrieve the viztrail with the given identifier. The result is None
        if no viztrail with given identifier exists.

        Parameters
        ----------
        viztrail_id : string
            Unique viztrail identifier

        Returns
        -------
        vizier.viztrail.base.ViztrailHandle
        """
        return self.viztrails.get(viztrail_id)

    def list_viztrails(self):
        """List handles for all viztrails in the repository.

        Returns
        -------
        list(vizier.viztrail.base.ViztrailHandle)
        """
        handles = self.viztrails.values()
        return list(handles)
Beispiel #13
0
 def test_create_and_delete_folder(self):
     """Test default functionality of create_folder and delete_folder
     methods.

     The test creates three folders in BASE_DIRECTORY ('A', one with a
     generated identifier, and one with a short identifier) and then
     deletes them using stores with different settings for the
     keep_deleted_files flag. The steps depend on each other and on the
     state of the file system.
     """
     store = DefaultObjectStore()
     # Create a folder with a given identifier; the identifier is returned
     self.assertEqual(store.create_folder(BASE_DIRECTORY, identifier='A'),
                      'A')
     self.assertTrue(store.exists(store.join(BASE_DIRECTORY, 'A')))
     self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, 'A')))
     # Create a folder without giving an identifier
     identifier = store.create_folder(BASE_DIRECTORY)
     self.assertTrue(store.exists(store.join(BASE_DIRECTORY, identifier)))
     self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY,
                                                identifier)))
     # New store with short identifier factory
     store = DefaultObjectStore(identifier_factory=get_short_identifier)
     short_id = store.create_folder(BASE_DIRECTORY)
     self.assertTrue(store.exists(store.join(BASE_DIRECTORY, short_id)))
     self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, short_id)))
     # Delete folder with identifier
     store.delete_folder(store.join(BASE_DIRECTORY, identifier))
     self.assertFalse(store.exists(store.join(BASE_DIRECTORY, identifier)))
     self.assertFalse(
         os.path.isdir(os.path.join(BASE_DIRECTORY, identifier)))
     # Delete folder with short_id when the keep_deleted_files flag is True.
     # The folder is expected to remain on disk.
     store = DefaultObjectStore(keep_deleted_files=True)
     store.delete_folder(store.join(BASE_DIRECTORY, short_id))
     self.assertTrue(store.exists(store.join(BASE_DIRECTORY, short_id)))
     self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, short_id)))
     # Delete folder 'A' overriding the keep_deleted_files flag
     self.assertTrue(store.exists(store.join(BASE_DIRECTORY, 'A')))
     self.assertTrue(os.path.isdir(os.path.join(BASE_DIRECTORY, 'A')))
     store.delete_folder(store.join(BASE_DIRECTORY, 'A'), force_delete=True)
     self.assertFalse(store.exists(store.join(BASE_DIRECTORY, 'A')))
     self.assertFalse(os.path.isdir(os.path.join(BASE_DIRECTORY, 'A')))
Beispiel #14
0
 def test_create_object_with_identifier(self):
     """Test creating a new object with a given identifier.

     Objects are created with and without content. Reading an object that
     was created without content is expected to raise a ValueError.
     Creating an object again under an existing identifier is expected to
     replace what a subsequent read returns.
     """
     store = DefaultObjectStore()
     # Object 'A' without content: the file exists but cannot be read
     store.create_object(BASE_DIRECTORY, identifier='A')
     self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'A')))
     with self.assertRaises(ValueError):
         store.read_object(store.join(BASE_DIRECTORY, 'A'))
     # Object 'B' with content: the content can be read back
     store.create_object(BASE_DIRECTORY,
                         identifier='B',
                         content={'id': 100})
     self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'B')))
     content = store.read_object(store.join(BASE_DIRECTORY, 'B'))
     self.assertEqual(content['id'], 100)
     # Create 'A' again, this time with content
     store.create_object(BASE_DIRECTORY,
                         identifier='A',
                         content={'id': 100})
     self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'A')))
     content = store.read_object(store.join(BASE_DIRECTORY, 'A'))
     self.assertEqual(content['id'], 100)
     # Create 'B' again without content: reading is expected to fail again
     store.create_object(BASE_DIRECTORY, identifier='B')
     self.assertTrue(os.path.isfile(os.path.join(BASE_DIRECTORY, 'B')))
     with self.assertRaises(ValueError):
         store.read_object(store.join(BASE_DIRECTORY, 'B'))
Beispiel #15
0
 def test_exists(self):
     """Test that exists reports files and directories once they have been
     created, also when queried through a new store instance.
     """
     store = DefaultObjectStore()
     file_path = store.join(BASE_DIRECTORY, 'A.file')
     dir_path = store.join(BASE_DIRECTORY, 'A.dir')
     # The file is unknown until the object has been created
     self.assertFalse(store.exists(file_path))
     store.create_object(BASE_DIRECTORY, identifier='A.file')
     self.assertTrue(store.exists(file_path))
     # The directory is unknown until it has been created on disk
     self.assertFalse(store.exists(dir_path))
     os.makedirs(dir_path)
     self.assertTrue(store.exists(dir_path))
     # Re-create the store to ensure that this has no effect
     store = DefaultObjectStore()
     for path in (file_path, dir_path):
         self.assertTrue(store.exists(path))
Beispiel #16
0
 def test_list_folders(self):
     """Test list_folders method.

     Lists the sub-folders of BASE_DIRECTORY/A before the folder exists,
     after it has been created via the create flag, and after sub-folders
     have been added. The steps depend on each other and on the state of
     the file system.
     """
     store = DefaultObjectStore()
     # The result is an empty list even if the folder does not exist and
     # is not created using the create flag
     dirname = store.join(BASE_DIRECTORY, 'A')
     dirs = store.list_folders(parent_folder=dirname, create=False)
     self.assertEqual(len(dirs), 0)
     self.assertFalse(store.exists(dirname))
     # The result is an empty list after the folder is created using the
     # create flag
     dirs = store.list_folders(parent_folder=dirname, create=True)
     self.assertEqual(len(dirs), 0)
     self.assertTrue(store.exists(dirname))
     # Create directories and files
     os.makedirs(store.join(dirname, 'A'))
     dirs = store.list_folders(parent_folder=dirname)
     self.assertEqual(len(dirs), 1)
     self.assertTrue('A' in dirs)
     os.makedirs(store.join(dirname, 'B'))
     dirs = store.list_folders(parent_folder=dirname, create=True)
     self.assertEqual(len(dirs), 2)
     self.assertTrue('A' in dirs)
     self.assertTrue('B' in dirs)
     # NOTE(review): the object 'A.file' is created in BASE_DIRECTORY, not in
     # the listed folder (dirname), and the local variable filename is never
     # used. If the intent is to verify that files are excluded from the
     # listing, the file should probably be created in dirname - confirm.
     filename = store.join(BASE_DIRECTORY, 'A.file')
     store.create_object(BASE_DIRECTORY, identifier='A.file')
     dirs = store.list_folders(parent_folder=dirname, create=True)
     self.assertEqual(len(dirs), 2)
     self.assertTrue('A' in dirs)
     self.assertTrue('B' in dirs)
     # Re-create the store to ensure that this has no effect
     store = DefaultObjectStore()
     dirs = store.list_folders(parent_folder=dirname, create=True)
     self.assertEqual(len(dirs), 2)
     self.assertTrue('A' in dirs)
     self.assertTrue('B' in dirs)
Beispiel #17
0
def get_engine(config: AppConfig) -> VizierEngine:
    """Build the vizier engine that is described by the given application
    configuration.

    The engine is assembled from a default object store, a viztrails
    repository on top of that store, a project cache, and a backend. Which
    project cache and backend are used depends on the engine identifier and
    the backend identifier in the configuration.

    Parameters
    ----------
    config: vizier.config.app.AppConfig
        Application configuration object

    Returns
    -------
    vizier.engine.base.VizierEngine

    Raises
    ------
    ValueError
        If the backend identifier is unknown, if the engine identifier is
        unknown, or if the backend is not valid for the selected engine.
    """
    # Reject backend identifiers that are not known at all before doing any
    # other work.
    backend_id = config.engine.backend.identifier
    if backend_id not in base.BACKENDS:
        raise ValueError(f"unknown backend '{backend_id}'")
    # Only the default object store is supported at this point. The
    # configuration merely selects the identifier factory that it uses.
    id_factory = (
        get_short_identifier
        if config.engine.use_short_ids
        else get_unique_identifier
    )
    object_store = DefaultObjectStore(identifier_factory=id_factory)
    # Index of supported packages
    packages = load_packages(config.engine.package_path)
    # The viztrails repository always uses the object store implementation.
    # Datastore and filestore factories depend on the engine identifier.
    base_dir = config.engine.data_dir
    viztrails = OSViztrailRepository(
        base_path=os.path.join(base_dir, app.DEFAULT_VIZTRAILS_DIR),
        object_store=object_store
    )
    filestores_dir = os.path.join(base_dir, app.DEFAULT_FILESTORES_DIR)
    datastores_dir = os.path.join(base_dir, app.DEFAULT_DATASTORES_DIR)
    if config.engine.identifier in [base.DEV_ENGINE, base.MIMIR_ENGINE]:
        filestore_factory = FileSystemFilestoreFactory(filestores_dir)
        datastore_factory: DatastoreFactory
        if config.engine.identifier == base.DEV_ENGINE:
            datastore_factory = FileSystemDatastoreFactory(datastores_dir)
        else:
            datastore_factory = MimirDatastoreFactory(datastores_dir)
        # DEV and MIMIR engines share one common project cache.
        projects: ProjectCache = CommonProjectCache(
            datastores=datastore_factory,
            filestores=filestore_factory,
            viztrails=viztrails
        )
        # Task processors for the supported packages
        processors = load_processors(config.engine.processor_path)
        # The configuration may name commands that are executed
        # synchronously. The value is a ':'-separated list of
        # '<package_id>.<command_id>' entries.
        sync_commands_list = config.engine.sync_commands
        synchronous: TaskExecEngine
        if sync_commands_list is None:
            synchronous = NonSynchronousEngine()
        else:
            commands: Dict[str, Dict[str, TaskProcessor]] = dict()
            for entry in sync_commands_list.split(':'):
                package_id, command_id = entry.split('.')
                commands.setdefault(package_id, dict())[command_id] = \
                    processors[package_id]
            synchronous = SynchronousTaskEngine(
                commands=commands,
                projects=projects
            )
        # Instantiate the backend that was requested for this engine
        backend: VizierBackend
        if backend_id == base.BACKEND_MULTIPROCESS:
            backend = MultiProcessBackend(
                processors=processors,
                projects=projects,
                synchronous=synchronous
            )
        elif backend_id == base.BACKEND_CELERY:
            # Routing information is derived from the configuration
            backend = CeleryBackend(
                routes=config_routes(config),
                synchronous=synchronous
            )
        else:
            # A known backend that cannot be combined with this engine
            raise ValueError(f"invalid backend '{backend_id}'")
    elif config.engine.identifier == base.CONTAINER_ENGINE:
        # The container engine supports exactly one backend type.
        if backend_id != base.BACKEND_CONTAINER:
            raise ValueError(f"invalid backend '{backend_id}'")
        projects = ContainerProjectCache(
            viztrails=viztrails,
            container_file=os.path.join(base_dir, app.DEFAULT_CONTAINER_FILE),
            config=config,
            datastores=MimirDatastoreFactory(datastores_dir),
            filestores=FileSystemFilestoreFactory(filestores_dir)
        )
        backend = ContainerBackend(projects=projects)
    else:
        raise ValueError(f"unknown vizier engine '{config.engine.identifier}'")
    return VizierEngine(
        name=config.engine.identifier + ' (' + backend_id + ')',
        projects=projects,
        backend=backend,
        packages=packages
    )
Beispiel #18
0
 def test_error_on_missing(self):
     """Reading an object that has been removed raises a ValueError."""
     writer = DefaultObjectStore()
     path = writer.join(BASE_DIRECTORY, 'A.file')
     writer.create_object(
         BASE_DIRECTORY,
         identifier='A.file',
         content={'A': 1}
     )
     self.assertTrue(writer.exists(path))
     # Use a fresh store instance for reading. The object is readable as
     # long as the underlying file exists ...
     reader = DefaultObjectStore()
     reader.read_object(path)
     # ... and reading fails once the file has been deleted
     os.remove(path)
     with self.assertRaises(ValueError):
         reader.read_object(path)
Beispiel #19
0
    def create_branch(
        identifier: str,
        base_path: str,
        modules_folder: str,
        is_default: bool = False,
        provenance: Optional[BranchProvenance] = None,
        properties: Optional[Dict[str, Any]] = None,
        created_at: Optional[datetime] = None,
        modules: Optional[List[str]] = None,
        object_store: Optional[ObjectStore] = None
    ):
        """Create a new branch. If a list of modules is given the new branch
        contains exactly one workflow made up of these modules. Otherwise,
        the branch is empty.

        Raises ValueError if the base path does not exist or if any of the
        modules in the given list is in an active state.

        Parameters
        ----------
        identifier: string
            Unique branch identifier
        base_path: string
            path to the folder for branch resources
        modules_folder: string
            Path to module resources folder
        is_default: bool, optional
            True if this is the default branch for its viztrail
        provenance: vizier.viztrail.branch.BranchProvenance, optional
            Branch provenance information
        properties: dict, optional
            Initial set of branch properties
        created_at: datetime.datetime, optional
            Creation timestamp for the branch. Only used to initialize the
            branch provenance if no provenance object is given. If a
            provenance object is given its timestamp takes precedence.
        modules: list(string), optional
            List of module identifier for the modules in the workflow at the
            head of the branch
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.branch.OSBranchHandle
        """
        # Make sure the object store is not None
        if object_store is None:
            object_store = DefaultObjectStore()
        # If base path does not exist raise an exception
        if not object_store.exists(base_path):
            raise ValueError('base path does not exist')
        # Read module handles first to ensure that none of the modules is in
        # an active state. This happens before anything is written to the
        # object store.
        wf_modules = None
        if modules is not None:
            wf_modules = read_workflow_modules(
                modules_list=modules,
                modules_folder=modules_folder,
                object_store=object_store
            )
            for m in wf_modules:
                if m.is_active:
                    raise ValueError('cannot branch from active workflow')
        # Set provenance object if not given. Honor an explicitly given
        # creation timestamp instead of silently ignoring it.
        if provenance is None:
            if created_at is None:
                provenance = BranchProvenance()
            else:
                provenance = BranchProvenance(created_at=created_at)
        # Write provenance information to disk
        doc: Dict[str, Any] = {KEY_CREATED_AT: provenance.created_at.isoformat()}
        if provenance.source_branch is not None:
            # If one property is not None all are expected to be not None
            doc[KEY_SOURCE_BRANCH] = provenance.source_branch
            doc[KEY_WORKFLOW_ID] = provenance.workflow_id
            doc[KEY_MODULE_ID] = provenance.module_id
        object_store.write_object(
            object_path=object_store.join(base_path, OBJ_METADATA),
            content=doc
        )
        # Create the initial workflow if the list of modules is given
        workflows = list()
        head = None
        if modules is not None:
            # Write handle for workflow at branch head
            descriptor = write_workflow_handle(
                modules=modules,
                workflow_count=0,
                base_path=base_path,
                object_store=object_store,
                action=ACTION_CREATE,
                created_at=provenance.created_at
            )
            workflows.append(descriptor)
            # Set the new workflow as the branch head
            head = WorkflowHandle(
                identifier=descriptor.identifier,
                branch_id=identifier,
                modules=wf_modules,
                descriptor=descriptor
            )
        # Return handle for new viztrail branch
        return OSBranchHandle(
            identifier=identifier,
            is_default=is_default,
            base_path=base_path,
            modules_folder=modules_folder,
            provenance=provenance,
            properties=PersistentAnnotationSet(
                object_path=object_store.join(base_path, OBJ_PROPERTIES),
                object_store=object_store,
                properties=properties
            ),
            workflows=workflows,
            head=head,
            object_store=object_store
        )
Beispiel #20
0
    def load_branch(
            identifier: str,
            is_default: bool,
            base_path: str,
            modules_folder: str,
            object_store: Optional[ObjectStore] = None
        ):
        """Load an existing branch from the object store.

        Reads the branch provenance and the descriptors of all workflows in
        the branch history. If the history is not empty, the workflow at the
        branch head (the one with the largest identifier) is read together
        with its modules.

        Parameters
        ----------
        identifier: string
            Unique branch identifier
        is_default: bool
            True if this is the default branch for its viztrail
        base_path: string
            Path to folder containing branch resources
        modules_folder: string
            Path to folder containing workflow modules
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.branch.OSBranchHandle
        """
        # Fall back to the default object store if none is given
        if object_store is None:
            object_store = DefaultObjectStore()
        # The metadata object holds the created_at timestamp and, optionally,
        # the three entries that define the branch point. A document with
        # exactly four entries is taken to contain the branch point.
        metadata = cast(
            Dict[str, Any],
            object_store.read_object(object_store.join(base_path, OBJ_METADATA))
        )
        created_at = to_datetime(metadata[KEY_CREATED_AT])
        if len(metadata) != 4:
            provenance = BranchProvenance(created_at=created_at)
        else:
            provenance = BranchProvenance(
                source_branch=metadata[KEY_SOURCE_BRANCH],
                workflow_id=metadata[KEY_WORKFLOW_ID],
                module_id=metadata[KEY_MODULE_ID],
                created_at=created_at
            )
        # Every object in the base folder other than the predefined branch
        # objects is a workflow descriptor.
        reserved = [OBJ_METADATA, OBJ_PROPERTIES]
        workflows = list()
        for name in object_store.list_objects(base_path):
            if name in reserved:
                continue
            wf_obj = cast(
                Dict[str, Any],
                object_store.read_object(object_store.join(base_path, name))
            )
            wf_desc = wf_obj[KEY_WORKFLOW_DESCRIPTOR]
            descriptor = WorkflowDescriptor(
                identifier=wf_obj[KEY_WORKFLOW_ID],
                action=wf_desc[KEY_ACTION],
                package_id=wf_desc[KEY_PACKAGE_ID],
                command_id=wf_desc[KEY_COMMAND_ID],
                created_at=to_datetime(wf_desc[KEY_CREATED_AT])
            )
            workflows.append(descriptor)
        # Order the history by ascending workflow identifier
        workflows.sort(key=lambda wf: wf.identifier)
        # The last workflow in the sorted history is the branch head. Its
        # modules are read as well.
        head = None
        if workflows:
            head_descriptor = workflows[-1]
            head_path = object_store.join(base_path, head_descriptor.identifier)
            head = read_workflow(
                branch_id=identifier,
                workflow_descriptor=head_descriptor,
                workflow_path=head_path,
                modules_folder=modules_folder,
                object_store=object_store
            )
        branch_properties = PersistentAnnotationSet(
            object_path=object_store.join(base_path, OBJ_PROPERTIES),
            object_store=object_store
        )
        return OSBranchHandle(
            identifier=identifier,
            is_default=is_default,
            base_path=base_path,
            modules_folder=modules_folder,
            provenance=provenance,
            properties=branch_properties,
            workflows=workflows,
            head=head,
            object_store=object_store
        )
Beispiel #21
0
    def create_viztrail(identifier: str,
                        base_path: str,
                        object_store: Optional[ObjectStore] = None,
                        properties: Optional[Dict[str, Any]] = None):
        """Create a new viztrail resource inside an existing base folder.

        Creates the sub-folders for branches and modules, writes an empty
        branch index and the viztrail metadata, creates an empty default
        branch, and finally materializes the branch index that contains the
        default branch.

        Parameters
        ----------
        identifier: string
            Unique viztrail identifier
        base_path: string
            Identifier for folder containing viztrail resources
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources
        properties: dict(string, any), optional
            Dictionary of properties for the new viztrail

        Returns
        -------
        OSViztrailHandle
        """
        # Fall back to the default object store if none is given
        if object_store is None:
            object_store = DefaultObjectStore()
        # Set up the branches folder with an empty index file. The base path
        # folder itself is expected to exist.
        branch_folder = object_store.join(base_path, FOLDER_BRANCHES)
        object_store.create_folder(base_path, identifier=FOLDER_BRANCHES)
        branch_index = object_store.join(branch_folder, OBJ_BRANCHINDEX)
        empty_index: List[str] = []
        object_store.write_object(
            object_path=branch_index,
            content=empty_index
        )
        # Set up the modules folder
        modules_folder = object_store.join(base_path, FOLDER_MODULES)
        object_store.create_folder(base_path, identifier=FOLDER_MODULES)
        # Persist the viztrail metadata
        created_at = get_current_time()
        metadata = {
            KEY_IDENTIFIER: identifier,
            KEY_CREATED_AT: created_at.isoformat()
        }
        object_store.write_object(
            object_path=object_store.join(base_path, OBJ_METADATA),
            content=metadata
        )
        # Every new viztrail starts out with an empty default branch
        default_branch = create_branch(
            provenance=BranchProvenance(created_at=created_at),
            properties={PROPERTY_NAME: DEFAULT_BRANCH},
            modules=None,
            branch_folder=branch_folder,
            modules_folder=modules_folder,
            object_store=object_store,
            is_default=True,
            created_at=created_at
        )
        # Replace the empty branch index with one that lists the default
        # branch
        write_branch_index(
            branches={default_branch.identifier: default_branch},
            object_path=branch_index,
            object_store=object_store
        )
        # Assemble the handle for the new viztrail
        viztrail_properties = PersistentAnnotationSet(
            object_path=object_store.join(base_path, OBJ_PROPERTIES),
            object_store=object_store,
            properties=properties
        )
        return OSViztrailHandle(
            identifier=identifier,
            properties=viztrail_properties,
            branches=[default_branch],
            default_branch=default_branch,
            created_at=created_at,
            base_path=base_path,
            object_store=object_store,
            branch_index=branch_index,
            branch_folder=branch_folder,
            modules_folder=modules_folder
        )
Beispiel #22
0
    def load_viztrail(
        base_path: str,
        object_store: Optional[ObjectStore] = None
    ) -> Optional["OSViztrailHandle"]:
        """Load a viztrail and all branches listed in its branch index from
        the given object store.

        Parameters
        ----------
        base_path: string
            Identifier for folder containing viztrail resources
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.driver.os.viztrail.OSViztrailHandle
            The result is None if the metadata object is read as None.
        """
        # Fall back to the default object store if none is given
        if object_store is None:
            object_store = DefaultObjectStore()
        object_store = cast(ObjectStore, object_store)
        # Without metadata there is no viztrail to load
        raw_metadata = object_store.read_object(
            object_store.join(base_path, OBJ_METADATA)
        )
        if raw_metadata is None:
            return None
        metadata = cast(Dict[str, Any], raw_metadata)
        identifier = metadata[KEY_IDENTIFIER]
        created_at = to_datetime(metadata[KEY_CREATED_AT])
        # Locations of the viztrail resources
        branch_folder = object_store.join(base_path, FOLDER_BRANCHES)
        branch_index = object_store.join(branch_folder, OBJ_BRANCHINDEX)
        modules_folder = object_store.join(base_path, FOLDER_MODULES)
        # The branch index lists one entry per branch. Each entry carries the
        # branch identifier and the default-branch flag.
        branches = list()
        default_branch: Optional[BranchHandle] = None
        index_entries = cast(
            List[Dict[str, Any]],
            object_store.read_object(branch_index)
        )
        for entry in index_entries:
            branch_id = entry[KEY_IDENTIFIER]
            is_default = entry[KEY_DEFAULT]
            branch = OSBranchHandle.load_branch(
                identifier=branch_id,
                is_default=is_default,
                base_path=object_store.join(branch_folder, branch_id),
                modules_folder=modules_folder,
                object_store=object_store
            )
            branches.append(branch)
            # If several entries are flagged as default the last one wins
            if is_default:
                default_branch = branch
        # Assemble the handle for the loaded viztrail
        viztrail_properties = PersistentAnnotationSet(
            object_path=object_store.join(base_path, OBJ_PROPERTIES),
            object_store=object_store
        )
        return OSViztrailHandle(
            identifier=identifier,
            properties=viztrail_properties,
            branches=branches,
            default_branch=default_branch,
            created_at=created_at,
            base_path=base_path,
            object_store=object_store,
            branch_index=branch_index,
            branch_folder=branch_folder,
            modules_folder=modules_folder
        )
Beispiel #23
0
    def create_module(command,
                      external_form,
                      state,
                      timestamp,
                      outputs,
                      provenance,
                      module_folder,
                      datasets=None,
                      object_store=None):
        """Serialize the given module components, store them as a new object
        in the module folder, and return a handle for the stored module.

        The identifier of the new module is the identifier that the object
        store returns for the created object.

        Parameters
        ----------
        command : vizier.viztrail.command.ModuleCommand
            Specification of the module (i.e., package, name, and arguments)
        external_form: string
            Printable representation of module command
        state: int
            Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS)
        timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp
            Module timestamp
        outputs: vizier.viztrail.module.output.ModuleOutputs
            Module output streams STDOUT and STDERR
        provenance: vizier.viztrail.module.provenance.ModuleProvenance
            Provenance information about datasets that were read and written
            by previous execution of the module.
        module_folder: string
            Object store folder containing module resources
        datasets : dict(vizier.datastore.dataset.DatasetDescriptor), optional
            Dictionary of resulting datasets. Dataset descriptors are keyed by
            the user-specified dataset name. Defaults to an empty dictionary.
            The datasets are passed to the handle only; they are not part of
            the serialized object.
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.module.OSModuleHandle
        """
        # Fall back to defaults for the optional arguments
        if object_store is None:
            object_store = DefaultObjectStore()
        if datasets is None:
            datasets = dict()
        # Materialize the serialized module. The object store assigns the
        # identifier for the new object.
        serialized = serialize_module(
            command=command,
            external_form=external_form,
            state=state,
            timestamp=timestamp,
            outputs=outputs,
            provenance=provenance
        )
        identifier = object_store.create_object(
            parent_folder=module_folder,
            content=serialized
        )
        module_path = object_store.join(module_folder, identifier)
        # Handle for the created module
        return OSModuleHandle(
            identifier=identifier,
            command=command,
            external_form=external_form,
            module_path=module_path,
            state=state,
            timestamp=timestamp,
            datasets=datasets,
            outputs=outputs,
            provenance=provenance,
            object_store=object_store
        )
Beispiel #24
0
    def load_module(
            identifier: str,
            module_path: str,
            prev_state: Optional[Dict[str, ArtifactDescriptor]] = None,
            object_store: Optional[ObjectStore] = None
        ) -> "OSModuleHandle":
        """Load module from given object store.

        If the module object cannot be read (the object store raises a
        ValueError) a placeholder module in error state is returned instead
        of propagating the exception.

        Parameters
        ----------
        identifier: string
            Unique module identifier
        module_path: string
            Resource path for module object
        prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
            Dataset descriptors keyed by the user-provided name that exist in
            the database state of the previous module (in sequence of
            occurrence in the workflow). Not referenced by this
            implementation.
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources. The
            default object store is used if the value is None.

        Returns
        -------
        vizier.viztrail.objectstore.module.OSModuleHandle
        """
        # Make sure the object store is not None. The default store is
        # created per call rather than once as a default argument value.
        if object_store is None:
            object_store = DefaultObjectStore()
        # Read object from store. This may raise a ValueError to indicate that
        # the module does not exist (in a system error condition). In this
        # case we return a new module that is in error state.
        try:
            obj = cast(
                Dict[str, Any],
                object_store.read_object(object_path=module_path)
            )
        except ValueError:
            return OSModuleHandle(
                identifier=identifier,
                command=ModuleCommand(
                    package_id=UNKNOWN_ID,
                    command_id=UNKNOWN_ID,
                    arguments=list(),
                    packages=None
                ),
                external_form='fatal error: object not found',
                module_path=module_path,
                state=mstate.MODULE_ERROR,
                object_store=object_store
            )
        # Create module command
        cmd = obj[KEY_COMMAND]
        command = ModuleCommand(
            package_id=cmd[KEY_PACKAGE_ID],
            command_id=cmd[KEY_COMMAND_ID],
            arguments=cmd[KEY_ARGUMENTS],
            packages=None
        )
        # Create module timestamps. Only the created_at timestamp is
        # mandatory; the other two are None if they are not present.
        ts = obj[KEY_TIMESTAMP]
        created_at = to_datetime(ts[KEY_CREATED_AT])
        started_at: Optional[datetime] = (
            to_datetime(ts[KEY_STARTED_AT]) if KEY_STARTED_AT in ts else None
        )
        finished_at: Optional[datetime] = (
            to_datetime(ts[KEY_FINISHED_AT]) if KEY_FINISHED_AT in ts else None
        )
        timestamp = ModuleTimestamp(
            created_at=created_at,
            started_at=started_at,
            finished_at=finished_at
        )
        # Create module output streams.
        outputs = ModuleOutputs(
            stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]),
            stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR])
        )
        # Create module provenance information. Read and write provenance
        # remain None (as opposed to empty) if the respective entry is
        # missing from the serialized object.
        prov = obj[KEY_PROVENANCE]
        read_prov = None
        if KEY_PROVENANCE_READ in prov:
            read_prov = dict()
            for ds in prov[KEY_PROVENANCE_READ]:
                read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID]
        write_prov = None
        if KEY_PROVENANCE_WRITE in prov:
            write_prov = dict()
            for ds in prov[KEY_PROVENANCE_WRITE]:
                if KEY_DATAOBJECT_TYPE in ds:
                    # Entries that carry a data object type are generic
                    # artifacts
                    descriptor = ArtifactDescriptor(
                        identifier=ds[KEY_DATAOBJECT_ID],
                        name=ds[KEY_DATAOBJECT_NAME],
                        artifact_type=ds[KEY_DATAOBJECT_TYPE])
                else:
                    # All other entries are datasets with a column schema
                    descriptor = DatasetDescriptor(
                        identifier=ds[KEY_DATASET_ID],
                        name=ds[KEY_DATASET_NAME],
                        columns=[
                            DatasetColumn(
                                identifier=col[KEY_COLUMN_ID],
                                name=col[KEY_COLUMN_NAME],
                                data_type=col[KEY_COLUMN_TYPE]
                            ) for col in ds[KEY_DATASET_COLUMNS]
                        ]
                    )
                # NOTE(review): artifact entries are keyed via
                # KEY_DATASET_NAME although their name is read from
                # KEY_DATAOBJECT_NAME -- confirm that both constants refer
                # to the same key.
                write_prov[ds[KEY_DATASET_NAME]] = descriptor
        if KEY_PROVENANCE_DELETE in prov:
            delete_prov = set(prov[KEY_PROVENANCE_DELETE])
        else:
            delete_prov = set()
        if KEY_PROVENANCE_RESOURCES in prov:
            res_prov = cast(Dict[str, Any], prov[KEY_PROVENANCE_RESOURCES])
        else:
            res_prov = dict()
        if KEY_PROVENANCE_CHARTS in prov:
            # A chart is serialized either as a [name, chart] list or as a
            # plain chart object. The latter is given the name "Chart".
            charts_prov = [
                (
                    c[0],
                    ChartViewHandle.from_dict(c[1])  # type: ignore[no-untyped-call]
                ) if isinstance(c, list) else
                (
                    "Chart",
                    ChartViewHandle.from_dict(c)
                )
                for c in prov[KEY_PROVENANCE_CHARTS]
            ]
        else:
            charts_prov = list()
        provenance = ModuleProvenance(
            read=read_prov,
            write=write_prov,
            delete=delete_prov,
            resources=res_prov,
            charts=charts_prov
        )
        # Return module handle
        return OSModuleHandle(
            identifier=identifier,
            command=command,
            external_form=obj[KEY_EXTERNAL_FORM],
            module_path=module_path,
            state=obj[KEY_STATE],
            timestamp=timestamp,
            outputs=outputs,
            provenance=provenance,
            object_store=object_store,
        )
Beispiel #25
0
class ContainerProjectCache(ProjectCache):
    """The project cache for containerized projects. It is assumed that each
    project runs in a separate container on the local machine that exposes the
    container API via a local port. Maintains a mapping of project identifier
    to information about the local container in a separate file.
    """
    def __init__(self, viztrails, container_file, config):
        """Initialize the cache components and load all projects in the given
        viztrails repository. Maintains all projects in a dictionary keyed by
        their identifier.

        Parameters
        ----------
        viztrails: vizier.vizual.repository.ViztrailRepository
            Repository for viztrails
        container_file: string
            Path to the container information file
        config: vizier.config.app.AppConfig
            Application object

        Raises
        ------
        KeyError
            If the repository contains a viztrail for which the container
            file has no entry.
        """
        self.viztrails = viztrails
        self.container_file = container_file
        self.config = config
        self.container_image = config.engine.backend.container.image
        # Keep track of the port numbers for the project containers.
        self.ports = config.engine.backend.container.ports
        # Instantiate the Docker daemon client using the default socket or
        # configuration in the environment. This may need to be adjusted for
        # production deployments.
        self.client = docker.from_env()
        # Read mapping of project identifier to container information
        self.store = DefaultObjectStore()
        containers = dict()
        if self.store.exists(self.container_file):
            for obj in self.store.read_object(self.container_file):
                containers[obj['projectId']] = obj
        # Create index of project handles from existing viztrails. The project
        # handles do not have a reference to the datastore or filestore.
        self.projects = dict()
        for viztrail in self.viztrails.list_viztrails():
            container = containers[viztrail.identifier]
            project = ContainerProjectHandle(
                viztrail=viztrail,
                container_api=container['url'],
                port=container['port'],
                container_id=container['containerId'])
            self.projects[viztrail.identifier] = project

    def create_project(self, properties=None):
        """Create a new project. Will (i) create a viztrail in the underlying
        viztrail repository, and (ii) start a docker container for the project.

        The port is reserved before any resource is created, and the viztrail
        is removed again if the container cannot be started. This way a failed
        call does not leave a viztrail behind that has no container entry.

        Parameters
        ----------
        properties: dict, optional
            Set of properties for the new viztrail

        Returns
        -------
        vizier.engine.project.base.ProjectHandle

        Raises
        ------
        ValueError
            If all configured port numbers are currently in use.
        """
        # Find the next unused port first. Doing this before the viztrail is
        # created ensures that running out of ports has no side effects.
        used_ports = {p.port for p in self.projects.values()}
        port = next((nr for nr in self.ports if nr not in used_ports), None)
        if port is None:
            raise ValueError('no port number available')
        # Create the viztrail for the project
        viztrail = self.viztrails.create_viztrail(properties=properties)
        project_id = viztrail.identifier
        # Start a new docker container for the project on the selected port.
        try:
            container = self.client.containers.run(
                image=self.container_image,
                environment={
                    app.VIZIERSERVER_NAME: 'Project Container API - ' + project_id,
                    app.VIZIERSERVER_BASE_URL: self.config.webservice.server_url,
                    app.VIZIERSERVER_SERVER_PORT: port,
                    app.VIZIERSERVER_SERVER_LOCAL_PORT: port,
                    app.VIZIERSERVER_APP_PATH: self.config.webservice.app_path,
                    app.VIZIERSERVER_LOG_DIR: '/app/data/logs/container',
                    app.VIZIERENGINE_DATA_DIR: '/app/data',
                    app.VIZIERSERVER_PACKAGE_PATH:
                    '/app/resources/packages/common:/app/resources/packages/mimir',
                    app.VIZIERSERVER_PROCESSOR_PATH:
                    '/app/resources/processors/common:/app/resources/processors/mimir',
                    contnr.VIZIERCONTAINER_PROJECT_ID: project_id,
                    contnr.VIZIERCONTAINER_CONTROLLER_URL: self.config.app_base_url
                },
                network='host',
                detach=True)
        except Exception:
            # Roll back the viztrail so that the repository stays in sync with
            # the container file, then propagate the original error.
            self.viztrails.delete_viztrail(project_id)
            raise
        project = ContainerProjectHandle(
            viztrail=viztrail,
            container_api=self.config.get_url(port),
            port=port,
            container_id=container.id)
        self.projects[project.identifier] = project
        self.write_container_info()
        return project

    def delete_project(self, project_id):
        """Delete all resources that are associated with the given project.
        Returns True if the project existed and False otherwise.

        Parameters
        ----------
        project_id: string
            Unique project identifier

        Returns
        -------
        bool
        """
        if project_id not in self.projects:
            return False
        project = self.projects[project_id]
        # Delete the viztrail for the project
        self.viztrails.delete_viztrail(project.viztrail.identifier)
        # Stop and remove the associated container
        container = self.client.containers.get(project.container_id)
        container.stop()
        container.remove()
        # Remove project from internal cache and update the materialized
        # mapping of projects to containers
        del self.projects[project_id]
        self.write_container_info()
        return True

    def get_branch(self, project_id, branch_id):
        """Get the branch with the given identifier for the specified project.
        The result is None if the project or branch does not exist.

        Parameters
        ----------
        project_id: string
            Unique project identifier
        branch_id: string
            Unique branch identifier

        Returns
        -------
        vizier.viztrail.branch.BranchHandle
        """
        # If the project is not in the internal cache it does not exist
        if project_id not in self.projects:
            return None
        # Return the handle for the specified branch
        return self.projects[project_id].viztrail.get_branch(branch_id)

    def get_project(self, project_id):
        """Get the handle for project. Returns None if the project does not
        exist.

        Parameters
        ----------
        project_id: string
            Unique project identifier

        Returns
        -------
        vizier.engine.project.base.ProjectHandle
        """
        if project_id in self.projects:
            return self.projects[project_id]
        return None

    def list_projects(self):
        """Get a list of handles for all projects.

        Returns
        -------
        list(vizier.engine.project.base.ProjectHandle)
        """
        return list(self.projects.values())

    def write_container_info(self):
        """Write the current mapping of project identifier to project containers
        to the object store container file.
        """
        self.store.write_object(
            content=[
                {
                    'projectId': p.identifier,
                    'containerId': p.container_id,
                    'port': p.port,
                    'url': p.container_api
                } for p in self.projects.values()
            ],
            object_path=self.container_file
        )
Beispiel #26
0
    def load_module(identifier,
                    module_path,
                    prev_state=None,
                    object_store=None):
        """Read a serialized module from the object store and create the
        module handle for it.

        If the object store raises a ValueError while reading the module
        object, a handle in error state with an unknown command is returned
        instead.

        Parameters
        ----------
        identifier: string
            Unique module identifier
        module_path: string
            Resource path for module object
        prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
            Dataset descriptors keyed by the user-provided name that exist in
            the database state of the previous module (in sequence of
            occurrence in the workflow)
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.module.OSModuleHandle
        """
        # Fall back to the default object store if none is given
        store = DefaultObjectStore() if object_store is None else object_store
        # Reading the object may raise a ValueError to signal that the module
        # does not exist (a system error condition). The result in that case
        # is a new module in error state.
        try:
            doc = store.read_object(object_path=module_path)
        except ValueError:
            unknown_command = ModuleCommand(
                package_id=UNKNOWN_ID,
                command_id=UNKNOWN_ID
            )
            return OSModuleHandle(
                identifier=identifier,
                command=unknown_command,
                external_form='fatal error: object not found',
                module_path=module_path,
                state=mstate.MODULE_ERROR,
                object_store=store
            )
        # Module command
        cmd_doc = doc[KEY_COMMAND]
        command = ModuleCommand(
            package_id=cmd_doc[KEY_PACKAGE_ID],
            command_id=cmd_doc[KEY_COMMAND_ID],
            arguments=cmd_doc[KEY_ARGUMENTS]
        )
        # Module timestamps. Only the creation time is mandatory.
        ts_doc = doc[KEY_TIMESTAMP]
        timestamp = ModuleTimestamp(
            created_at=to_datetime(ts_doc[KEY_CREATED_AT]),
            started_at=(
                to_datetime(ts_doc[KEY_STARTED_AT])
                if KEY_STARTED_AT in ts_doc else None
            ),
            finished_at=(
                to_datetime(ts_doc[KEY_FINISHED_AT])
                if KEY_FINISHED_AT in ts_doc else None
            )
        )
        # Module output streams
        out_doc = doc[KEY_OUTPUTS]
        outputs = ModuleOutputs(
            stdout=get_output_stream(out_doc[KEY_STDOUT]),
            stderr=get_output_stream(out_doc[KEY_STDERR])
        )
        # Module provenance information. Every component that is missing in
        # the serialization is represented by None.
        prov_doc = doc[KEY_PROVENANCE]
        # Datasets that were read: name -> identifier
        read_prov = None
        if KEY_PROVENANCE_READ in prov_doc:
            read_prov = {
                entry[KEY_DATASET_NAME]: entry[KEY_DATASET_ID]
                for entry in prov_doc[KEY_PROVENANCE_READ]
            }
        # Datasets that were written: name -> descriptor
        write_prov = None
        if KEY_PROVENANCE_WRITE in prov_doc:
            write_prov = dict()
            for entry in prov_doc[KEY_PROVENANCE_WRITE]:
                write_prov[entry[KEY_DATASET_NAME]] = DatasetDescriptor(
                    identifier=entry[KEY_DATASET_ID],
                    columns=[
                        DatasetColumn(
                            identifier=column[KEY_COLUMN_ID],
                            name=column[KEY_COLUMN_NAME],
                            data_type=column[KEY_COLUMN_TYPE]
                        ) for column in entry[KEY_DATASET_COLUMNS]
                    ],
                    row_count=entry[KEY_DATASET_ROWCOUNT]
                )
        delete_prov = (
            prov_doc[KEY_PROVENANCE_DELETE]
            if KEY_PROVENANCE_DELETE in prov_doc else None
        )
        res_prov = (
            prov_doc[KEY_PROVENANCE_RESOURCES]
            if KEY_PROVENANCE_RESOURCES in prov_doc else None
        )
        charts_prov = None
        if KEY_PROVENANCE_CHARTS in prov_doc:
            charts_prov = [
                ChartViewHandle.from_dict(chart)
                for chart in prov_doc[KEY_PROVENANCE_CHARTS]
            ]
        provenance = ModuleProvenance(
            read=read_prov,
            write=write_prov,
            delete=delete_prov,
            resources=res_prov,
            charts=charts_prov
        )
        # The database state is derived from the provenance only for modules
        # in SUCCESS state and if the previous state is given. In all other
        # cases the state is empty.
        state = doc[KEY_STATE]
        if state == mstate.MODULE_SUCCESS and prev_state is not None:
            datasets = provenance.get_database_state(prev_state)
        else:
            datasets = dict()
        # Assemble the module handle
        return OSModuleHandle(
            identifier=identifier,
            command=command,
            external_form=doc[KEY_EXTERNAL_FORM],
            module_path=module_path,
            state=state,
            timestamp=timestamp,
            datasets=datasets,
            outputs=outputs,
            provenance=provenance,
            object_store=store
        )
Beispiel #27
0
def get_engine(config):
    """Create an instance of the vizier engine for a single project using the
    file system filestore factory and either the file system or the Mimir
    datastore factory. Workflows are executed by a multi-process backend or
    a celery backend.

    Parameters
    ----------
    config: vizier.config.app.AppConfig
        Application configuration object

    Returns
    -------
    vizier.engine.base.VizierEngine

    Raises
    ------
    ValueError
        If the configuration specifies an unknown backend or an unknown
        engine identifier.
    """
    engine_conf = config.engine
    # Validate the backend identifier before creating any components.
    backend_id = engine_conf.backend.identifier
    if backend_id not in base.BACKENDS:
        raise ValueError('unknown backend \'' + str(backend_id) + '\'')
    # Select the identifier factory and create the object store. At this
    # point only the default object store is supported.
    # NOTE(review): object_store and viztrails_dir are not referenced again
    # in this function.
    id_factory = (
        get_short_identifier
        if engine_conf.use_short_ids else get_unique_identifier
    )
    object_store = DefaultObjectStore(identifier_factory=id_factory)
    base_dir = engine_conf.data_dir
    viztrails_dir = os.path.join(base_dir, app.DEFAULT_VIZTRAILS_DIR)
    # The datastore and filestore factories depend on the engine identifier
    # (DEV or MIMIR). Any other value is rejected.
    engine_id = engine_conf.identifier
    if engine_id not in [base.DEV_ENGINE, base.MIMIR_ENGINE]:
        raise ValueError('unknown vizier engine \'' + str(engine_id) + '\'')
    filestore_factory = FileSystemFilestoreFactory(
        os.path.join(base_dir, app.DEFAULT_FILESTORES_DIR)
    )
    datastores_dir = os.path.join(base_dir, app.DEFAULT_DATASTORES_DIR)
    datastore_factory = (
        FileSystemDatastoreFactory(datastores_dir)
        if engine_id == base.DEV_ENGINE
        else MimirDatastoreFactory(datastores_dir)
    )
    # The engine maintains exactly one project in its cache.
    project_id = config.project_id
    project = ProjectHandle(
        viztrail=ViztrailHandle(identifier=project_id),
        datastore=datastore_factory.get_datastore(project_id),
        filestore=filestore_factory.get_filestore(project_id)
    )
    projects = SingleProjectCache(project)
    # Load package declarations and the task processors
    packages = load_packages(engine_conf.package_path)
    processors = load_processors(engine_conf.processor_path)
    # Create the workflow execution backend
    if backend_id == base.BACKEND_MULTIPROCESS:
        backend = MultiProcessBackend(
            processors=processors,
            projects=projects,
            synchronous=None
        )
    elif backend_id == base.BACKEND_CELERY:
        # Create and configure routing information (if given)
        backend = CeleryBackend(
            routes=config_routes(config),
            synchronous=None
        )
    else:
        # For completeness. The backend identifier has been validated above.
        raise ValueError('unknown backend \'' + str(backend_id) + '\'')
    return VizierEngine(
        name=engine_id + ' (' + backend_id + ')',
        projects=projects,
        backend=backend,
        packages=packages
    )