Beispiel #1
0
    def load_dataset(
            self,
            datastore: Datastore,
            filestore: Filestore,
            file_id: Optional[str] = None,
            url: Optional[str] = None,
            detect_headers: bool = True,
            infer_types: bool = True,
            load_format: str = 'csv',
            options: List[Dict[str, str]] = [],
            username: str = None,
            password: str = None,
            resources: Optional[Dict[str, Any]] = None,
            reload: bool = False,
            human_readable_name: Optional[str] = None,
            proposed_schema: List[Tuple[str, str]] = []) -> VizualApiResult:
        """Create (or re-use) a dataset from an uploaded file or a URL.

        Callers are expected to pass either a file identifier or a url. If
        both are given the url takes precedence; if neither is given a
        ValueError is raised.

        The resources refer to any resources (e.g., file identifier) that have
        been generated by a previous execution of the respective task. If they
        name the same source (url or file identifier) as this call, the
        dataset recorded there is fetched from the datastore instead of being
        loaded again, unless the reload flag is True. If the datastore no
        longer returns that dataset, it is loaded from scratch.

        Note on reload: Vistrails will automatically skip re-execution, so the
        only reason to re-execute the cell is that the user manually asked for
        it. In that case the file should actually be reloaded (e.g., because
        it may be reloaded with different parameters).

        Parameters
        ----------
        datastore : Datastore to retrieve and update datasets. Must be a
            MimirDatastore when a dataset actually has to be loaded.
        filestore: Filestore to retrieve uploaded datasets
        file_id: Identifier for a file in an associated filestore
        url: Identifier for a web resource
        detect_headers: Detect column names in loaded file if True
        infer_types: Infer column types for loaded dataset if True
        load_format: Format identifier
        options: Additional options for Mimirs load command
        username: User name for authentication when accessing restricted
            resources. Not forwarded to the datastore by this implementation.
        password: Password for authentication when accessing restricted
            resources. Not forwarded to the datastore by this implementation.
        resources: Dictionary of additional resources (i.e., key,value pairs)
            that were generated during a previous execution of the associated
            module
        reload: If set to false, avoid reloading the data if possible.
        human_readable_name: A user-facing name for this table
        proposed_schema: A list of name/type pairs that will override
                         the inferred column names/types if present

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If neither url nor file_id is given, or if the filestore has no
            file for the given file_id.
        """
        debug = debug_is_on()
        dataset = None
        f_handle = None
        result_resources = dict()
        if url is not None:
            if debug:
                print("LOAD URL: {}".format(url))
            # If the same url has been previously used to generate a dataset
            # we do not need to download the file and re-create the dataset.
            if (not reload
                    and resources is not None
                    and base.RESOURCE_URL in resources
                    and base.RESOURCE_DATASET in resources
                    and resources[base.RESOURCE_URL] == url):
                ds_id = resources[base.RESOURCE_DATASET]
                if debug:
                    print("   ... re-using existing dataset {}".format(ds_id))
                dataset = datastore.get_dataset(ds_id)
            result_resources[base.RESOURCE_URL] = url
        elif file_id is not None:
            if debug:
                print("LOAD FILE: {}".format(file_id))
            # If the same file has been previously used to generate a dataset
            # we do not need to re-create it.
            if (not reload
                    and resources is not None
                    and base.RESOURCE_FILEID in resources
                    and base.RESOURCE_DATASET in resources
                    and resources[base.RESOURCE_FILEID] == file_id):
                ds_id = resources[base.RESOURCE_DATASET]
                if debug:
                    print("   ... re-using existing dataset {}".format(ds_id))
                dataset = datastore.get_dataset(ds_id)
                if debug:
                    print("DATASET: {}".format(dataset))
            # If the dataset is None we will load the dataset from an uploaded
            # file. Need to get the file handle for the file here.
            if dataset is None:
                if debug:
                    print("getting file")
                f_handle = filestore.get_file(file_id)
                if f_handle is None:
                    raise ValueError(
                        "The uploaded file got deleted, try re-uploading.")
            result_resources[base.RESOURCE_FILEID] = file_id
        else:
            raise ValueError('no source identifier given for load')

        # If the dataset is still None at this point we need to call the
        # load_dataset method of the datastore to load it. The branches above
        # guarantee that either the url or the file handle is available here.
        if dataset is None:
            # Only the Mimir datastore accepts the keyword arguments below.
            assert isinstance(datastore, MimirDatastore)
            if debug:
                print("   ... loading dataset {} / {}".format(url, f_handle))
            dataset = datastore.load_dataset(
                f_handle=f_handle,
                url=url,
                detect_headers=detect_headers,
                infer_types=infer_types,
                load_format=load_format,
                human_readable_name=human_readable_name,
                # Pass copies so that the shared default lists of this method
                # can never be modified by the datastore.
                options=list(options) if options else [],
                proposed_schema=list(proposed_schema) if proposed_schema else [])
        result_resources[base.RESOURCE_DATASET] = dataset.identifier
        return VizualApiResult(dataset=dataset, resources=result_resources)
Beispiel #2
0
    def load_dataset(
            self,
            datastore: Datastore,
            filestore: Filestore,
            file_id: Optional[str] = None,
            url: Optional[str] = None,
            detect_headers: bool = True,
            infer_types: bool = True,
            load_format: str = 'csv',
            options: List[Dict[str, str]] = [],
            username: str = None,
            password: str = None,
            resources: Optional[Dict[str, Any]] = None,
            reload: bool = False,
            human_readable_name: Optional[str] = None,
            proposed_schema: List[Tuple[str, str]] = []) -> VizualApiResult:
        """Create (or re-use) a dataset from an uploaded file or a URL.

        Callers are expected to pass either a file identifier or a url. If
        both are given the url takes precedence; if neither is given a
        ValueError is raised. The user name and password are only used when
        an url is given.

        The resources refer to any resources (e.g., file identifier) that have
        been generated by a previous execution of the respective task. If they
        name the same source (url or file identifier) as this call, the
        dataset recorded there is fetched from the datastore instead of being
        created again. For urls this shortcut is skipped when the reload flag
        is True; the flag is not consulted for uploaded files.

        Parameters
        ----------
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets
        filestore: vizier.filestore.Filestore
            Filestore to retrieve uploaded datasets
        file_id: string, optional
            Identifier for a file in an associated filestore
        url: string, optional
            Identifier for a web resource
        detect_headers: bool, optional
            Detect column names in loaded file if True. Not used by this
            implementation.
        infer_types: bool, optional
            Infer column types for loaded dataset if True. Not used by this
            implementation.
        load_format: string, optional
            Format identifier. Not used by this implementation.
        options: list, optional
            Additional options for Mimirs load command. Not used by this
            implementation.
        username: string, optional
            User name for authentication when accessing restricted resources
        password: string, optional
            Password for authentication when accessing restricted resources
        resources: dict, optional
            Dictionary of additional resources (i.e., key,value pairs) that were
            generated during a previous execution of the associated module
        reload: bool, optional
            Flag to force download of a remote resource even if it was
            downloaded previously
        human_readable_name: string, optional
            A user-facing name for the dataset. Not used by this
            implementation.
        proposed_schema: list, optional
            List of name/type pairs that is forwarded to the datastore when a
            dataset is created from an uploaded file

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult

        Raises
        ------
        ValueError
            If neither url nor file_id is given, if the filestore has no file
            for the given file_id, or if no dataset could be obtained.
        """
        dataset = None
        result_resources = dict()
        if url is not None:
            # If the same url has been previously used to generate a dataset
            # we do not need to download the file and re-create the dataset.
            if (not reload
                    and resources is not None
                    and base.RESOURCE_URL in resources
                    and base.RESOURCE_DATASET in resources
                    and resources[base.RESOURCE_URL] == url):
                dataset = datastore.get_dataset(
                    resources[base.RESOURCE_DATASET])
            # If dataset is still None we need to create a new dataset by
            # downloading the given Uri
            if dataset is None:
                # Only the file system datastore offers download_dataset.
                assert isinstance(datastore, FileSystemDatastore)
                dataset = datastore.download_dataset(
                    url=url,
                    username=username,
                    password=password,
                )
            result_resources[base.RESOURCE_URL] = url
        else:
            # Either url or file_id must not be None. Raise explicitly
            # instead of asserting so the check survives `python -O`.
            if file_id is None:
                raise ValueError('no source identifier given for load')

            # If the same file has been previously used to generate a dataset
            # we do not need to re-create it.
            if (resources is not None
                    and base.RESOURCE_FILEID in resources
                    and base.RESOURCE_DATASET in resources
                    and resources[base.RESOURCE_FILEID] == file_id):
                dataset = datastore.get_dataset(
                    resources[base.RESOURCE_DATASET])
            # If dataset is still None we need to create a new dataset from the
            # specified file
            if dataset is None:
                f_handle = filestore.get_file(file_id)
                # Fail with a clear message instead of handing a missing
                # file handle to the datastore.
                if f_handle is None:
                    raise ValueError("unknown file '{}'".format(file_id))
                dataset = datastore.load_dataset(
                    f_handle=f_handle,
                    # Pass a copy so that the shared default list of this
                    # method can never be modified by the datastore.
                    proposed_schema=(
                        list(proposed_schema) if proposed_schema else []))
            result_resources[base.RESOURCE_FILEID] = file_id
        # Ensure that the dataset is not None at this point
        if dataset is None:
            raise ValueError('unknown file or resource')
        result_resources[base.RESOURCE_DATASET] = dataset.identifier
        return VizualApiResult(dataset=dataset, resources=result_resources)