Code Example #1
def from_mimir(
        response: Dict[str, Any],
        name: Optional[str] = None) -> "VizualApiResult":
    ds = MimirDatasetHandle.from_mimir_result(
        table_name=response["name"],
        schema=response["schema"],
        properties=response["properties"],
        name=name)
    return VizualApiResult(ds)
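A hedged usage sketch: the helper only reads the "name", "schema", and "properties" keys of the raw Mimir response, so a call might look like this (all values are hypothetical):

response = {
    "name": "DS_1234",
    "schema": [{"name": "CITY", "type": "varchar"}],
    "properties": {},
}
result = from_mimir(response, name="cities")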
Code Example #2
    def create_dataset(
            self,
            columns: List[DatasetColumn],
            rows: List[DatasetRow],
            properties: Optional[Dict[str, Any]] = None,
            human_readable_name: str = "Untitled Dataset",
            backend_options: Optional[List[Tuple[str, str]]] = None,
            dependencies: Optional[List[str]] = None) -> MimirDatasetHandle:
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        properties: dict(string, any), optional
            Annotations for dataset components
        human_readable_name: string, optional
            Human readable name for the resulting table
        backend_options: list(tuple(string, string)), optional
            Additional backend-specific options
        dependencies: list(string), optional
            Identifiers of datasets that this dataset depends on

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Normalize optional arguments
        properties = {} if properties is None else properties
        backend_options = [] if backend_options is None else backend_options
        dependencies = [] if dependencies is None else dependencies
        # Get a unique identifier for the new dataset
        identifier = 'DS_' + get_unique_identifier()
        columns = [
            col if isinstance(col, MimirDatasetColumn) else MimirDatasetColumn(
                identifier=col.identifier,
                name_in_dataset=col.name,
                data_type=col.data_type) for col in columns
        ]

        table_name, schema = mimir.loadDataInline(
            schema=[{
                "name": base.sanitize_column_name(col.name),
                "type": col.data_type
            } for col in columns],
            rows=[row.values for row in rows],
            result_name=identifier,
            human_readable_name=human_readable_name,
            dependencies=dependencies,
            properties=properties)

        # Insert the new dataset metadata information into the datastore
        return MimirDatasetHandle.from_mimir_result(table_name=table_name,
                                                    schema=schema,
                                                    properties=properties,
                                                    name=human_readable_name)
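For orientation, a minimal usage sketch; the DatasetColumn and DatasetRow constructor arguments and the datastore instance are assumptions for illustration:

from vizier.datastore.dataset import DatasetColumn, DatasetRow

# Hypothetical two-column dataset; each column needs a unique identifier.
columns = [
    DatasetColumn(identifier=0, name="city"),
    DatasetColumn(identifier=1, name="population"),
]
rows = [
    DatasetRow(identifier=0, values=["Buffalo", 278349]),
    DatasetRow(identifier=1, values=["Albany", 97478]),
]
ds = datastore.create_dataset(columns=columns, rows=rows,
                              human_readable_name="cities")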
Code Example #3
File: mimir.py Project: mikebrachmann/web-api-async
    def sort_dataset(self, identifier: str, columns: List[int],
                     reversed: List[bool],
                     datastore: Datastore) -> VizualApiResult:
        """Sort the dataset with the given identifier according to the order by
        statement. The order by statement is a pair of lists. The first list
        contains the identifier of columns to sort on. The second list contains
        boolean flags, one for each entry in columns, indicating whether sort
        order is revered for the corresponding column or not.

        Returns the number of rows in the dataset and the identifier of the
        sorted dataset.

        Raises ValueError if no dataset with given identifier exists or if any
        of the columns in the order by clause are unknown.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifiers for sort columns.
        reversed: list(bool)
            Flags indicating whether the sort order of the corresponding column
            is reversed.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown. The None check
        # must precede the type assertion.
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        assert isinstance(dataset, MimirDatasetHandle)
        # Create ORDER BY clause based on columns and reversed flags
        order_by_clause = []
        for col_id, is_reversed in zip(columns, reversed):
            stmt = cast(MimirDatasetColumn,
                        dataset.column_by_id(col_id)).name_in_rdb
            if is_reversed:
                stmt += ' DESC'
            order_by_clause.append(stmt)
        sql = 'SELECT * FROM ' + dataset.identifier + ' ORDER BY '
        sql += ', '.join(order_by_clause)
        view_name, dependencies, schema, properties, functionDeps = mimir.createView(
            datasets={dataset.identifier: dataset.identifier}, query=sql)
        ds = MimirDatasetHandle.from_mimir_result(view_name, schema,
                                                  properties)
        return VizualApiResult(ds)
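A usage sketch with hypothetical api and datastore objects: the i-th flag in reversed applies to the i-th entry in columns, so this call sorts by column 2 ascending, then column 0 descending:

result = api.sort_dataset(
    identifier="DS_1234",   # hypothetical dataset identifier
    columns=[2, 0],
    reversed=[False, True],
    datastore=datastore,
)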
Code Example #4
File: mimir.py Project: mikebrachmann/web-api-async
    def materialize_dataset(self, identifier: str,
                            datastore: Datastore) -> VizualApiResult:
        """Create a materialized snapshot of the dataset for faster
        execution."""
        input_dataset = datastore.get_dataset(identifier)
        if input_dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Bind the cast result; a bare cast() expression has no effect
        input_dataset = cast(MimirDatasetHandle, input_dataset)
        response = mimir.materialize(input_dataset.identifier)
        output_ds = MimirDatasetHandle(
            identifier=response["name"],
            columns=cast(List[MimirDatasetColumn], input_dataset.columns),
            properties=input_dataset.get_properties(),
            name=input_dataset.name
            if input_dataset.name is not None else "untitled dataset")
        return VizualApiResult(output_ds)
Code Example #5
    def get_dataset(self, identifier):
        """Read a full dataset from the data store. Returns None if no dataset
        with the given identifier exists.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Return None if the dataset file does not exist
        dataset_file = self.get_dataset_file(identifier)
        if not os.path.isfile(dataset_file):
            return None
        annotations = DatasetMetadata.from_file(
            self.get_metadata_filename(identifier))
        return MimirDatasetHandle.from_file(dataset_file,
                                            annotations=annotations)
Code Example #6
    def get_dataset(self,
                    identifier: str,
                    force_profiler: Optional[bool] = None,
                    name: Optional[str] = None) -> MimirDatasetHandle:
        """Read a full dataset from the data store. Returns None if no dataset
        with the given identifier exists.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Look up the current schema and properties from Mimir
        schema, properties = mimir.getTableInfo(identifier,
                                                force_profiler=force_profiler)
        return MimirDatasetHandle.from_mimir_result(identifier, schema,
                                                    properties, name)
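A brief usage sketch (identifier and datastore are hypothetical); force_profiler is passed straight through to mimir.getTableInfo:

# Re-read a table, forcing the profiler to refresh the properties.
handle = datastore.get_dataset("DS_1234", force_profiler=True, name="cities")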
Code Example #7
    def compute(self, command_id: str, arguments: "ModuleArguments",
                context: TaskContext) -> ExecResult:
        """Compute results for commands in the sampling package using 
        the set of user-provided arguments and the current database 
        state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """

        input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
        input_dataset: DatasetDescriptor = context.get_dataset(input_ds_name)
        if input_dataset is None:
            raise ValueError('unknown dataset \'' + input_ds_name + '\'')

        output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET,
                                             raise_error=False)
        if output_ds_name is None or output_ds_name == "":
            output_ds_name = input_ds_name + "_SAMPLE"
        output_ds_name = output_ds_name.lower()

        # Load the sampling configuration
        sample_mode = None

        if command_id == cmd.BASIC_SAMPLE:
            sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
            if sampling_rate > 1.0 or sampling_rate < 0.0:
                raise Exception("Sampling rate must be between 0.0 and 1.0")
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
                "probability": sampling_rate
            }
        elif command_id in (cmd.MANUAL_STRATIFIED_SAMPLE,
                            cmd.AUTOMATIC_STRATIFIED_SAMPLE):
            column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
            column_defn = input_dataset.columns[column]
            if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
                strata = [{
                    "value": stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability": stratum.get_value(cmd.PARA_SAMPLING_RATE)
                } for stratum in arguments.get_value(cmd.PARA_STRATA)]
            else:
                probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
                strata = self.get_automatic_strata(input_dataset, column_defn,
                                                   probability)
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
                "column": column_defn.name,
                "type": column_defn.data_type,
                "strata": strata
            }
        else:
            raise Exception("Unknown sampling command: {}".format(command_id))

        table_name, schema = mimir.createSample(input_dataset.identifier,
                                                sample_mode,
                                                result_name="SAMPLE_" +
                                                get_unique_identifier())
        ds = MimirDatasetHandle.from_mimir_result(table_name,
                                                  schema,
                                                  properties={},
                                                  name=output_ds_name)

        # And start rendering some output
        outputs = ModuleOutputs()
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        if ds_output is not None:
            ds_output['name'] = output_ds_name
            outputs.stdout.append(DatasetOutput(ds_output))
        else:
            outputs.stderr.append(TextOutput("Error displaying dataset"))

        # Record Reads and writes
        provenance = ModuleProvenance(
            read={input_ds_name: input_dataset.identifier},
            write={
                output_ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  name=output_ds_name,
                                  columns=ds.columns)
            })

        # Return task result
        return ExecResult(outputs=outputs, provenance=provenance)
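For reference, a sketch of the two sample_mode payload shapes this method builds; the "mode" strings stand in for the cmd constants, and all literal values are illustrative assumptions:

# Uniform sampling: keep each row with the given probability.
uniform_mode = {
    "mode": "uniform_probability",  # cmd.SAMPLING_MODE_UNIFORM_PROBABILITY
    "probability": 0.1,
}

# Stratified sampling: per-value sampling rates on one column.
stratified_mode = {
    "mode": "stratified_on",        # cmd.SAMPLING_MODE_STRATIFIED_ON
    "column": "STATE",              # hypothetical column name
    "type": "varchar",              # hypothetical column type
    "strata": [
        {"value": "NY", "probability": 0.05},
        {"value": "CA", "probability": 0.20},
    ],
}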
Code Example #8
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.identifier
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        if command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter,
                    name_in_dataset=cname_lat,
                    data_type=DATATYPE_REAL
                )
            )
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter + 1,
                    name_in_dataset=cname_lon,
                    data_type=DATATYPE_REAL
                )
            )
            house = arguments.get_value(cmd.PARA_HOUSE_NUMBER, raise_error=False, default_value=None)
            street = arguments.get_value(cmd.PARA_STREET, raise_error=False, default_value=None)
            city = arguments.get_value(cmd.PARA_CITY, raise_error=False, default_value=None)
            state = arguments.get_value(cmd.PARA_STATE, raise_error=False, default_value=None)

            params = {
                'houseColumn': dataset.column_by_id(house).name_in_rdb   if house  is not None and house  != '' else None,
                'streetColumn': dataset.column_by_id(street).name_in_rdb if street is not None and street != '' else None,
                'cityColumn': dataset.column_by_id(city).name_in_rdb     if city   is not None and city   != '' else None,
                'stateColumn': dataset.column_by_id(state).name_in_rdb   if state  is not None and state  != '' else None,
                'geocoder': geocoder
                # Optional parameters not currently exposed:
                # 'latitudeColumn', 'longitudeColumn', 'cacheCode'
            }
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = { "key" : column.name_in_rdb }
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = column.name_in_rdb
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            #params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(
                    cmd.PARA_COLUMNS_CONSTRAINT,
                    raise_error=False
                )
                if col_constraint == '':
                    col_constraint = None
                #if not col_constraint is None:
                #    param = param + ' ' + str(col_constraint).replace("'", "\'\'").replace("OR", ") OR (")
                #param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            # Compute the input columns
            inputs = []
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                inputs.append(column.name_in_rdb)

            # Compute the output column
            output = arguments.get_value(cmd.PARA_PICKAS, default_value = inputs[0])
            if output == "":
                output = inputs[0]
            else:
                output = dataset.get_unique_name(output.strip().upper())

            # Compute the final parameter list
            params = {
                "inputs" : inputs,
                "output" : output
            }
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if dseModel is not None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            commentsParams = []
            for idx, comment in enumerate(arguments.get_value(cmd.PARA_COMMENTS)):
                commentParam = {}
                
                # If target is defined, it is the column that we're trying to annotate
                # If unset (or empty), it means we're annotating the row.
                column_id = comment.get_value(cmd.PARA_EXPRESSION, None)

                if column_id is not None:
                    column = dataset.column_by_id(column_id)
                    commentParam['target'] = column.name_in_rdb

                # The comment
                commentParam['comment'] = comment.get_value(cmd.PARA_COMMENT)

                # If rowid is defined, it is the row that we're trying to annotate.  
                # If unset (or empty), it means that we're annotating all rows
                rowid = comment.get_value(cmd.PARA_ROWID, None) 
                if (rowid is not None) and (rowid != ""):
                    # If rowid begins with '=', it's a formula
                    if rowid[0] == '=':
                        commentParam['condition'] = rowid[1:]
                    else:
                        commentParam['rows'] = [ int(rowid) ]
                
                #TODO: handle result columns
                commentsParams.append(commentParam)
            params = {'comments' : commentsParams}
        elif command_id == cmd.MIMIR_PIVOT:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = {
                "target" : column.name_in_rdb,
                "keys" : [],
                "values" : []
            }
            for col_arg in arguments.get_value(cmd.PARA_VALUES):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_VALUE))
                params["values"].append(col.name_in_rdb)
            for col_arg in arguments.get_value(cmd.PARA_KEYS, default_value=[]):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_KEY))
                params["keys"].append(col.name_in_rdb)
            if len(params["values"]) < 1:
                raise ValueError("Need at least one value column")
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        elif command_id == cmd.MIMIR_SHRED:
            params = { 
                "keepOriginalColumns" : arguments.get_value(cmd.PARA_KEEP_ORIGINAL)
            }
            shreds = []
            global_input_col = dataset.column_by_id(arguments.get_value(cmd.PARA_COLUMN_NAME))
            for (idx, shred) in enumerate(arguments.get_value(cmd.PARA_COLUMNS)):
                output_col = shred.get_value(cmd.PARA_OUTPUT_COLUMN)
                if output_col is None:
                    output_col = "{}_{}".format(global_input_col,idx)
                config = {}
                shred_type = shred.get_value(cmd.PARA_TYPE)
                expression = shred.get_value(cmd.PARA_EXPRESSION)
                group = shred.get_value(cmd.PARA_INDEX)
                if shred_type == "pattern":
                    config["regexp"] = expression
                    config["group"] = int(group)
                elif shred_type == "field":
                    config["separator"] = expression
                    config["field"] = int(group)
                elif shred_type == "explode":
                    config["separator"] = expression
                elif shred_type == "pass":
                    pass
                elif shred_type == "substring":
                    range_parts = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
                    # print(range_parts)

                    # Mimir expects ranges to be given from start (inclusive) to end (exclusive)
                    # in a zero-based numbering scheme.

                    # Vizier expects input ranges to be given in a one-based numbering scheme.

                    # Convert to this format

                    if range_parts is None:
                        raise ValueError("Substring requires a range of the form '10', '10-11', or '10+1', but got '{}'".format(expression))
                    config["start"] = int(range_parts.group(1))-1 # Convert 1-based numbering to 0-based
                    if range_parts.group(2) is None:
                        config["end"] = config["start"] + 1 # if only one character, split one character
                    elif range_parts.group(3) == "+":
                        config["end"] = config["start"] + int(range_parts.group(4)) # start + length
                    elif range_parts.group(3) == "-":
                        config["end"] = int(range_parts.group(4)) # Explicit end, 1-based -> 0-based and exclusive cancel out
                    else:
                        raise ValueError("Invalid expression '{}' in substring shredder".format(expression))
                    # print("Shredding {} <- {} -- {}".format(output_col,config["start"],config["end"]))
                else:
                    raise ValueError("Invalid Shredding Type '{}'".format(shred_type))

                shreds.append({
                    **config,
                    "op" : shred_type,
                    "input" : global_input_col.name_in_rdb,
                    "output" : output_col,
                })
            params["shreds"] = shreds
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        else:
            raise ValueError("Unknown Mimir lens '{}'".format(command_id))
        # Create the Mimir lens
        mimir_lens_response = mimir.createLens(
            mimir_table_name,
            params,
            command_id,
            arguments.get_value(cmd.PARA_MATERIALIZE_INPUT, default_value=True),
            human_readable_name = ds_name.upper()
        )
        lens_name = mimir_lens_response['name']
        lens_schema = mimir_lens_response['schema']
        lens_properties = mimir_lens_response['properties']

        ds = MimirDatasetHandle.from_mimir_result(lens_name, lens_schema, lens_properties, ds_name)

        if command_id in LENSES_THAT_SHOULD_NOT_DISPLAY_TABLES:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            from vizier.api.webservice import server
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10
            )
            outputs.stdout.append(DatasetOutput(ds_output))
        
        # Return task result
        return ExecResult(
            outputs=outputs,
            provenance=ModuleProvenance(
                read={input_ds_name: dataset.identifier},
                write={ds_name: DatasetDescriptor(
                    identifier=ds.identifier,
                    name=ds_name,
                    columns=ds.columns
                )}
            )
        )
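The substring branch of MIMIR_SHRED is the subtlest part; this self-contained sketch of the same range parsing makes the 1-based to 0-based conversion explicit and is runnable on its own:

import re

def parse_substring_range(expression):
    # Parse a 1-based range like '10', '10-12', or '10+2' into a
    # 0-based, end-exclusive (start, end) pair, mirroring the logic above.
    range_parts = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
    if range_parts is None:
        raise ValueError("bad range '{}'".format(expression))
    start = int(range_parts.group(1)) - 1  # 1-based -> 0-based
    if range_parts.group(2) is None:
        end = start + 1  # single character
    elif range_parts.group(3) == "+":
        end = start + int(range_parts.group(4))  # start + length
    else:  # '-': a 1-based inclusive end equals a 0-based exclusive end
        end = int(range_parts.group(4))
    return start, end

assert parse_substring_range("10") == (9, 10)
assert parse_substring_range("10-12") == (9, 12)
assert parse_substring_range("10+2") == (9, 11)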
Code Example #9
    def execute_query(self, args: ModuleArguments,
                      context: TaskContext) -> ExecResult:
        """Execute a SQL query in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the SQL source code in this cell and ensure that the statement
        # is terminated with a semicolon
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        if not source.endswith(';'):
            source = source + ';'
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o].identifier
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.identifier
        # Module outputs
        outputs = ModuleOutputs()
        is_success = True
        functions = {
            name: context.dataobjects[name].identifier
            for name in context.dataobjects
            if context.dataobjects[name].obj_type == ARTIFACT_TYPE_PYTHON
        }
        try:
            # Create the view from the SQL source
            view_name, dependencies, mimirSchema, properties, functionDeps = mimir.createView(
                datasets=mimir_table_names,
                query=source,
                functions=dict(functions))
            ds = MimirDatasetHandle.from_mimir_result(view_name, mimirSchema,
                                                      properties, ds_name)

            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            from vizier.api.webservice import server

            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            if ds_output is None:
                outputs.stderr.append(
                    TextOutput("Error displaying dataset {}".format(ds_name)))
            else:
                ds_output['name'] = ds_name
                outputs.stdout.append(DatasetOutput(ds_output))

            dependenciesDict: Dict[str, str] = {
                dep_name.lower(): get_artifact_id(dep)
                for dep_name, dep in [(
                    dep_name, context.datasets.get(dep_name.lower(), None))
                                      for dep_name in dependencies]
                if dep is not None
            }
            # Function dependencies come from the functionDeps list returned
            # by createView, not from the dataset dependencies
            functionDepDict: Dict[str, str] = {
                dep_name.lower(): get_artifact_id(dep)
                for dep_name, dep in [(
                    dep_name, context.dataobjects.get(dep_name.lower(), None))
                                      for dep_name in functionDeps]
                if dep is not None
            }
            # print("---- SQL DATASETS ----\n{}\n{}".format(context.datasets, dependencies))

            provenance = ModuleProvenance(
                write={
                    ds_name: DatasetDescriptor(identifier=ds.identifier,
                                               name=ds_name,
                                               columns=ds.columns)
                },
                read={
                    **dependenciesDict,
                    **functionDepDict
                })
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
            is_success = False
        # Return execution result
        return ExecResult(is_success=is_success,
                          outputs=outputs,
                          provenance=provenance)
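The nested comprehensions that build the read-dependency maps are dense; here is an equivalent plain-loop sketch (the helper name is hypothetical, get_artifact_id is the same function used above):

def resolve_read_dependencies(dep_names, artifacts):
    # Map each dependency name to its artifact id, skipping names
    # that are not present in the artifact mapping.
    resolved = {}
    for dep_name in dep_names:
        artifact = artifacts.get(dep_name.lower(), None)
        if artifact is not None:
            resolved[dep_name.lower()] = get_artifact_id(artifact)
    return resolved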
Code Example #10
    def register_dataset(self,
                         table_name,
                         columns,
                         row_counter=None,
                         annotations=None):
        """Create a new record for a database table or view. Note that this
        method does not actually create the table or view in the database but
        adds the datasets metadata to the data store. The table or view will
        have been created by a load command or be the result from executing
        a lens or a VizUAL command.

        Parameters
        ----------
        table_name: string
            Name of relational database table or view containing the dataset.
        columns: list(vizier.datastore.mimir.MimirDatasetColumn)
            List of column names in the dataset schema and their corresponding
            names in the relational database table or view.
        row_counter: int
            Counter for unique row ids
        annotations: vizier.datastore.metadata.DatasetMetadata
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Depending on whether we need to update row ids we either query the
        # database or just get the schema. In either case mimir_schema will
        # contain the returned Mimir schema information.
        sql = base.get_select_query(table_name, columns=columns) + ';'
        mimir_schema = mimir.getSchema(sql)

        # Create a mapping of column name (in database) to column type. This
        # mapping is then used to update the data type information for all
        # column descriptors.
        col_types = dict()
        for col in mimir_schema:
            col_types[base.sanitize_column_name(
                col['name'].upper())] = col['baseType']
        for col in columns:
            col.data_type = col_types[col.name_in_rdb]
        # Set row counter to max. row id + 1 if None
        if row_counter is None:
            row_counter = mimir.countRows(table_name)
        dataset = MimirDatasetHandle(identifier=get_unique_identifier(),
                                     columns=list(
                                         map(base.sanitize_column_name,
                                             columns)),
                                     table_name=table_name,
                                     row_counter=row_counter,
                                     annotations=annotations)
        # Create a new directory for the dataset if it doesn't exist.
        dataset_dir = self.get_dataset_dir(dataset.identifier)
        if not os.path.isdir(dataset_dir):
            os.makedirs(dataset_dir)
        # Write dataset and annotation file to disk
        dataset.to_file(self.get_dataset_file(dataset.identifier))
        dataset.annotations.to_file(
            self.get_metadata_filename(dataset.identifier))
        return dataset
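A usage sketch, reusing the MimirDatasetColumn constructor shown in Code Example #2; the import path, view name, and datastore instance are assumptions:

from vizier.datastore.mimir.dataset import MimirDatasetColumn  # assumed path

columns = [
    MimirDatasetColumn(identifier=0, name_in_dataset="CITY",
                       data_type="varchar"),
]
handle = datastore.register_dataset(table_name="MY_VIEW", columns=columns)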
Code Example #11
    def load_dataset(
        self,
        f_handle: Optional[FileHandle] = None,
        proposed_schema: Optional[List[Tuple[str, str]]] = None,
        url: Optional[str] = None,
        detect_headers: bool = True,
        infer_types: bool = True,
        properties: Optional[Dict[str, Any]] = None,
        load_format: str = 'csv',
        options: Optional[List[Dict[str, str]]] = None,
        human_readable_name: Optional[str] = None,
    ):
        """Create a new dataset from a given file or url. Expects that either
        the file handle or the url are not None. Raises ValueError if both are
        None or not None.


        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle, optional
            Handle for an uploaded file on the associated file server
        proposed_schema: list(tuple(string, string)), optional
            Proposed schema as (column name, column type) pairs
        url: string, optional
            Url for the file source
        detect_headers: bool, optional
            Detect column names in loaded file if True
        infer_types: bool, optional
            Infer column types for loaded dataset if True
        properties: dict(string, any), optional
            Annotations for dataset components
        load_format: string, optional
            Format identifier
        options: list, optional
            Additional options for Mimir's load command
        human_readable_name: string, optional
            Optional human readable name for the resulting table

        Returns
        -------
        vizier.datastore.mimir.dataset.MimirDatasetHandle
        """
        # Avoid shared mutable default arguments
        proposed_schema = [] if proposed_schema is None else proposed_schema
        properties = {} if properties is None else properties
        options = [] if options is None else options
        if f_handle is None and url is None:
            raise ValueError('no load source given')
        elif f_handle is not None and url is not None:
            raise ValueError('too many load sources given')
        elif url is None and f_handle is not None:
            abspath = f_handle.filepath
        elif url is not None:
            abspath = url

        # for ease of debugging, associate each table with a prefix identifying its nature
        prefix = load_format if load_format in SAFE_FORMAT_IDENTIFIER_PREFIXES else "LOADED_"

        # Load dataset into Mimir
        table_name, mimirSchema = mimir.loadDataSource(
            abspath,
            infer_types,
            detect_headers,
            load_format,
            human_readable_name,
            options,
            properties=properties,
            result_name=prefix + get_unique_identifier(),
            proposed_schema=proposed_schema)
        return MimirDatasetHandle.from_mimir_result(table_name, mimirSchema,
                                                    properties,
                                                    human_readable_name)
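Finally, a hedged usage sketch: loading a remote CSV with header detection and type inference (the URL and datastore instance are hypothetical):

handle = datastore.load_dataset(
    url="https://example.com/cities.csv",
    detect_headers=True,
    infer_types=True,
    load_format="csv",
    human_readable_name="cities",
)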