Example #1
0
    def insert_column(self, identifier, position, name):
        """Insert a new column with the given name at the given position in
        the dataset schema and register the result as a new dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or
        if the column name is invalid.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier
        position : int
            Index position at which the column will be inserted
        name : string
            New column name

        Returns
        -------
        int, string
            Number of inserted columns (i.e., 1) and identifier of resulting
            dataset
        """
        # Reject invalid column names up front
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Fetch the dataset; fail for unknown identifiers
        dataset = self.datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The insert position must lie within [0, len(columns)]
        if not (0 <= position <= len(dataset.columns)):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # Allocate a fresh column identifier
        col_id = dataset.column_counter
        dataset.column_counter += 1
        # Splice the new column into a copy of the schema
        new_column = MimirDatasetColumn(col_id, name, COL_PREFIX + str(col_id))
        schema = list(dataset.columns)
        schema.insert(position, new_column)
        # Build the SELECT list for a view over the modified schema; the new
        # column is filled with NULL values.
        # NOTE(review): per the original comment (April 2018) this requires
        # Mimir to run with the XNULL option; otherwise setting all values in
        # the new column to NULL may cause an exception in some scenarios.
        col_list = [ROW_ID] + [
            'NULL ' + col.name_in_rdb
            if col.identifier == new_column.identifier else col.name_in_rdb
            for col in schema
        ]
        sql = 'SELECT ' + ','.join(col_list) + ' FROM ' + dataset.table_name
        view_name = mimir._mimir.createView(dataset.table_name, sql)
        # Register the modified dataset under a new identifier
        ds = self.datastore.register_dataset(
            table_name=view_name,
            columns=schema,
            row_ids=dataset.row_ids,
            column_counter=dataset.column_counter,
            row_counter=dataset.row_counter,
            annotations=dataset.annotations)
        return 1, ds.identifier
Example #2
0
    def rename_dataset(self, name, new_name):
        """Rename an existing dataset.

        Raises ValueError if no dataset with the given name exists, if a
        dataset with new_name already exists, or if new_name is not a valid
        dataset name.

        Parameters
        ----------
        name : string
            Unique dataset name
        new_name : string
            New dataset name
        """
        # Make sure to record access independently of whether the dataset
        # exists or not. Ignore read access to datasets that have been written.
        if not name.lower() in self.write:
            self.read.add(name.lower())
        # Add the new name to the written datasets. This is recorded even if
        # the rename fails below, mirroring the read-access recording above.
        self.write.add(new_name.lower())
        # Raise exception if new_name exists or is not valid.
        if self.has_dataset_identifier(new_name.lower()):
            raise ValueError('dataset \'{}\' exists'.format(new_name.lower()))
        if not is_valid_name(new_name):
            raise ValueError('invalid dataset name \'{}\''.format(new_name))
        # Raise an exception if no dataset with the given name exists
        ds = self.datasets.get(name.lower(), None)
        if ds is None:
            raise ValueError('dataset \'{}\' does not exist'.format(name))
        self.drop_dataset(name.lower())
        self.datasets[new_name.lower()] = ds
Example #3
0
    def insert_column(self, identifier, position, name, datastore):
        """Insert column with given name at given position in dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or
        if the column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted
        name: string, optional
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Reject invalid column names up front
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Fetch the dataset; fail for unknown identifiers
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The insert position must lie within [0, len(columns)]
        if not (0 <= position <= len(dataset.columns)):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # Allocate an identifier for the new column and splice it into a copy
        # of the schema
        col_id = dataset.max_column_id() + 1
        new_column = MimirDatasetColumn(col_id, name, name)
        schema = list(dataset.columns)
        schema.insert(position, new_column)
        # Build the SELECT list for a view over the modified schema; the new
        # column is filled with a typed empty value.
        # NOTE(review): per the original comment (April 2018) this requires
        # Mimir to run with the XNULL option; otherwise setting all values in
        # the new column to NULL may cause an exception in some scenarios.
        col_list = [
            " CAST('' AS int) AS " + col.name_in_rdb
            if col.identifier == new_column.identifier else col.name_in_rdb
            for col in schema
        ]
        sql = 'SELECT ' + ','.join(
            col_list) + ' FROM ' + dataset.table_name + ';'
        view_name, dependencies = mimir.createView(dataset.table_name, sql)
        # Register the modified dataset under a new identifier
        ds = datastore.register_dataset(table_name=view_name,
                                        columns=schema,
                                        row_counter=dataset.row_counter,
                                        annotations=dataset.annotations)
        return VizualApiResult(ds)
Example #4
0
    def compute_simple_chart(self, args, context):
        """Execute simple chart command.

        Builds a chart view over a dataset, runs the chart query, and returns
        the result as module output.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the dataset name is unknown or the chart name is invalid
        """
        # Get dataset name and the associated dataset. This will raise an
        # exception if the dataset name is unknown.
        ds_name = args.get_value(pckg.PARA_DATASET)
        ds = context.get_dataset(ds_name)
        # Get user-provided name for the new chart and verify that it is a
        # valid name. An empty or None value falls back to the default
        # '<dataset> Plot' name.
        chart_name = args.get_value(pckg.PARA_NAME,
                                    default_value=ds_name + ' Plot')
        if not chart_name:
            chart_name = ds_name + ' Plot'
        if not is_valid_name(chart_name):
            raise ValueError('invalid chart name \'' + str(chart_name) + '\'')
        chart_args = args.get_value(cmd.PARA_CHART)
        chart_type = chart_args.get_value(cmd.PARA_CHART_TYPE)
        grouped_chart = chart_args.get_value(cmd.PARA_CHART_GROUPED)
        # Create a new chart view handle and add the series definitions
        view = ChartViewHandle(dataset_name=ds_name,
                               chart_name=chart_name,
                               chart_type=chart_type,
                               grouped_chart=grouped_chart)
        # The data series index for x-axis values is optional
        if args.has(cmd.PARA_XAXIS):
            x_axis = args.get_value(cmd.PARA_XAXIS)
            # X-Axis column may be empty. In that case, we ignore the
            # x-axis spec
            add_data_series(args=x_axis,
                            view=view,
                            dataset=ds,
                            col_arg_id=cmd.PARA_XAXIS_COLUMN,
                            range_arg_id=cmd.PARA_XAXIS_RANGE)
            view.x_axis = 0
        # Definition of data series. Each series is a pair of column
        # identifier and a printable label.
        for data_series in args.get_value(cmd.PARA_SERIES):
            add_data_series(args=data_series, view=view, dataset=ds)
        # Execute the query and get the result
        rows = ChartQuery.exec_query(ds, view)
        # Add chart view handle as module output
        return ExecResult(
            outputs=ModuleOutputs(stdout=[ChartOutput(view=view, rows=rows)]),
            provenance=ModuleProvenance(read={ds_name: ds.identifier},
                                        write=dict(),
                                        charts=[view]))
Example #5
0
    def filter_columns(self, identifier: str, columns: List[int],
                       names: List[str],
                       datastore: Datastore) -> VizualApiResult:
        """Dataset projection operator. Returns a copy of the dataset with the
        given identifier that contains only those columns listed in columns.
        The list of names contains optional new names for the filtered columns.
        A value of None in names indicates that the name of the corresponding
        column is not changed.

        Raises ValueError if no dataset with given identifier exists or if any
        of the filter columns are unknown.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for columns in the result.
        names: list(string)
            Optional new names for filtered columns (same length as columns).
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The schema of the new dataset only contains the columns in the given
        # list. A None entry in names keeps the original column name.
        schema = list()
        column_mapping = list()
        for column_id, new_name in zip(columns, names):
            col_idx = get_index_for_column(dataset, column_id)
            col = dataset.columns[col_idx]
            if new_name is not None:
                if not is_valid_name(new_name):
                    raise ValueError('invalid column name \'' +
                                     str(new_name) + '\'')
                schema.append(
                    MimirDatasetColumn(identifier=col.identifier,
                                       name_in_dataset=new_name,
                                       name_in_rdb=new_name))
            else:
                schema.append(col)
            # Mimir projection entry: source column index and result name
            column_mapping.append({
                "columns_column": col_idx,
                "columns_name": schema[-1].name
            })
        command = {"id": "projection", "columns": column_mapping}
        response = mimir.vizualScript(dataset.identifier, command)
        return VizualApiResult.from_mimir(response)
Example #6
0
    def create_dataset(self, name, dataset, backend_options=None):
        """Create a new dataset with given name.

        Raises ValueError if a dataset with given name already exist.

        Parameters
        ----------
        name : string
            Unique dataset name
        dataset : vizier.datastore.client.DatasetClient
            Dataset object
        backend_options : list, optional
            Optional backend options passed through to the datastore.
            Defaults to an empty list.

        Returns
        -------
        vizier.datastore.client.DatasetClient
        """
        # Avoid a shared mutable default argument; materialize the default
        # list per call.
        if backend_options is None:
            backend_options = []
        # Raise an exception if a dataset with the given name already exists or
        # if the name is not valid
        if self.has_dataset_identifier(name):
            # Record access to the datasets
            self.read.add(name.lower())
            raise ValueError('dataset \'' + name + '\' already exists')
        if not is_valid_name(name):
            raise ValueError('invalid dataset name \'' + name + '\'')
        # Create list of columns for new dataset. Ensure that every column has
        # a positive identifier
        columns = list()
        if len(dataset.columns) > 0:
            # Next free identifier is one past the current maximum (at least 0)
            column_counter = max(
                max([col.identifier for col in dataset.columns]) + 1, 0)
            for col in dataset.columns:
                if col.identifier < 0:
                    col.identifier = column_counter
                    column_counter += 1
                columns.append(
                    DatasetColumn(identifier=col.identifier,
                                  name=col.name,
                                  data_type=col.data_type))
        rows = dataset.rows
        if len(rows) > 0:
            # Ensure that all rows have positive identifier
            row_counter = max(max([row.identifier for row in rows]) + 1, 0)
            for row in rows:
                if row.identifier < 0:
                    row.identifier = row_counter
                    row_counter += 1
        # Write dataset to datastore and add new dataset to context
        ds = self.datastore.create_dataset(columns=columns,
                                           rows=rows,
                                           annotations=dataset.annotations,
                                           human_readable_name=name.upper(),
                                           backend_options=backend_options)
        self.set_dataset_identifier(name, ds.identifier)
        self.descriptors[ds.identifier] = ds
        return DatasetClient(dataset=self.datastore.get_dataset(ds.identifier))
Example #7
0
    def compute_empty_dataset(self, args, context):
        """Execute empty dataset command.

        Creates a new dataset containing a single unnamed column, backed by a
        constant Mimir view.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the dataset name already exists or is not a valid name
        """
        outputs = ModuleOutputs()
        # Each entry is a (SQL default value, column name) pair
        default_columns = [("''", "unnamed_column")]
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        try:
            # Build a constant SELECT that yields one row with the defaults
            source = "SELECT {};".format(", ".join(
                default_val + " AS " + col_name
                for default_val, col_name in default_columns))
            view_name, dependencies = mimir.createView(dict(), source)

            # Column identifiers are simply the positional indexes
            columns = [
                MimirDatasetColumn(identifier=col_id,
                                   name_in_dataset=col_defn[1])
                for col_id, col_defn in enumerate(default_columns)
            ]

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=1)
            provenance = ModuleProvenance(
                write={
                    ds_name:
                    DatasetDescriptor(identifier=ds.identifier,
                                      columns=ds.columns,
                                      row_count=ds.row_count)
                },
                read=dict(
                )  # Need to explicitly declare a lack of dependencies.
            )
            outputs.stdout.append(
                TextOutput("Empty dataset '{}' created".format(ds_name)))
        except Exception as ex:
            # Report the failure as module output instead of propagating
            provenance = ModuleProvenance()
            outputs.error(ex)
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
Example #8
0
    def create_dataset(self, name, dataset, backend_options=None):
        """Create a new dataset with given name.

        Raises ValueError if a dataset with given name already exist.

        Parameters
        ----------
        name : string
            Unique dataset name
        dataset : vizier.datastore.client.DatasetClient
            Dataset object
        backend_options : list, optional
            Optional backend options passed through to the datastore.
            Defaults to an empty list.

        Returns
        -------
        vizier.datastore.client.DatasetClient
        """
        # Avoid a shared mutable default argument; materialize the default
        # list per call.
        if backend_options is None:
            backend_options = []
        # Raise an exception if a dataset with the given name already exists or
        # if the name is not valid
        if name.lower() in self.datasets:
            raise ValueError('dataset \'' + name + '\' already exists')
        if not is_valid_name(name):
            raise ValueError('invalid dataset name \'' + name + '\'')
        # Create list of columns for new dataset. Ensure that every column has
        # a positive identifier
        columns = list()
        if len(dataset.columns) > 0:
            # Next free identifier is one past the current maximum (at least 0)
            column_counter = max(
                max([col.identifier for col in dataset.columns]) + 1, 0)
            for col in dataset.columns:
                if col.identifier < 0:
                    col.identifier = column_counter
                    column_counter += 1
                columns.append(
                    DatasetColumn(identifier=col.identifier,
                                  name=col.name,
                                  data_type=col.data_type))
        rows = dataset.rows
        # Write dataset to datastore and add new dataset to context
        ds = self.datastore.create_dataset(columns=columns,
                                           rows=rows,
                                           properties=dataset.properties,
                                           human_readable_name=name,
                                           backend_options=backend_options)
        self.datasets[name.lower()] = ds
        self.write.add(name.lower())
        return DatasetClient(dataset=self.datastore.get_dataset(ds.identifier),
                             client=self,
                             existing_name=name.lower())
Example #9
0
    def rename_column(self, identifier: str, column_id: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Rename column in a given dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column is unknown, or if the given column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Reject invalid column names up front
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Fetch the dataset; fail for unknown identifiers
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Locate the column that is to be renamed
        col_idx = dataset.get_index(column_id)
        if col_idx is None:
            raise ValueError('unknown column identifier \'' + str(column_id) +
                             '\'')
        # Case-insensitive comparison: an unchanged name is a no-op and the
        # original dataset is returned
        if dataset.columns[col_idx].name.lower() == name.lower():
            return VizualApiResult(dataset)
        # Replace the column with a renamed copy and persist the result to
        # obtain a dataset with a new identifier
        columns = list(dataset.columns)
        old_col = columns[col_idx]
        columns[col_idx] = DatasetColumn(identifier=old_col.identifier,
                                         name=name,
                                         data_type=old_col.data_type)
        ds = datastore.create_dataset(columns=columns,
                                      rows=dataset.fetch_rows(),
                                      properties={})
        return VizualApiResult(ds)
Example #10
0
    def rename_column(self, identifier, column, name):
        """Rename column in a given dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column is unknown, or if the given column name is invalid.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier
        column : int
            Unique column identifier
        name : string
            New column name

        Returns
        -------
        int, string
            Number of renamed columns (i.e., 1) and identifier of resulting
            dataset
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = self.datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Get the specified column that is to be renamed and set the column name
        # to the new name. Note that list(...) copies the list only, not the
        # column objects: `col` below aliases the object in dataset.columns.
        schema = list(dataset.columns)
        col = schema[get_index_for_column(dataset, column)]
        # No need to do anything if the name hasn't changed (comparison is
        # case-insensitive)
        if col.name.lower() != name.lower():
            # There are no changes to the underlying database. We only need to
            # change the column information in the dataset schema.
            # NOTE(review): this mutates the shared column object in place, so
            # the in-memory `dataset` handle also sees the new name — confirm
            # this is intended (sibling implementations build a fresh column
            # object instead).
            col.name = name
            # Store updated dataset to get new identifier
            ds = self.datastore.register_dataset(
                table_name=dataset.table_name,
                columns=schema,
                row_ids=dataset.row_ids,
                column_counter=dataset.column_counter,
                row_counter=dataset.row_counter,
                annotations=dataset.annotations)
            return 1, ds.identifier
        else:
            return 0, identifier
Example #11
0
    def insert_column(self, identifier: str, position: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Insert column with given name at given position in dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or
        if the column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted
        name: string
            New column name (None yields an unnamed column)
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # A None name is allowed (unnamed column); any other name must be valid
        if name is not None and not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Fetch the dataset; fail for unknown identifiers
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The insert position must lie within [0, len(columns)]
        if not (0 <= position <= len(dataset.columns)):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # Splice the new column into a copy of the schema
        columns = list(dataset.columns)
        new_col = DatasetColumn(identifier=dataset.max_column_id() + 1,
                                name='' if name is None else name)
        columns.insert(position, new_col)
        # Every existing row receives a null cell for the new column
        rows = dataset.fetch_rows()
        for row in rows:
            row.values.insert(position, None)
        # Persist the modified dataset to obtain a new identifier
        ds = datastore.create_dataset(columns=columns,
                                      rows=rows,
                                      properties={})
        return VizualApiResult(ds)
Example #12
0
    def insert_column(self, identifier, position, name):
        """Insert column with given name at given position in dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or
        if the column name is invalid.

        Parameters
        ----------
        identifier : string
            Unique dataset identifier
        position : int
            Index position at which the column will be inserted
        name : string
            New column name

        Returns
        -------
        int, string
            Number of inserted columns (i.e., 1) and identifier of resulting
            dataset
        """
        # Reject invalid column names up front
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Fetch the dataset; fail for unknown identifiers
        dataset = self.datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The insert position must lie within [0, len(columns)]
        if not (0 <= position <= len(dataset.columns)):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # Splice the new column into a copy of the schema
        columns = list(dataset.columns)
        columns.insert(position, DatasetColumn(dataset.column_counter, name))
        # Every existing row receives a null cell for the new column
        rows = dataset.fetch_rows()
        for row in rows:
            row.values.insert(position, None)
        # Persist the modified dataset under a new identifier
        ds = self.datastore.create_dataset(
            columns=columns,
            rows=rows,
            column_counter=dataset.column_counter + 1,
            row_counter=dataset.row_counter,
            annotations=dataset.annotations)
        return 1, ds.identifier
Example #13
0
    def rename_column(self, identifier, column, name):
        """Rename column in a given dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column is unknown, or if the given column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column: int
            Unique column identifier
        name: string
            New column name

        Returns
        -------
        int, string
            Number of renamed columns (i.e., 1) and identifier of resulting
            dataset
        """
        # Reject invalid column names up front
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Fetch the dataset; fail for unknown identifiers
        dataset = self.datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Locate the column that is to be renamed
        col_idx = get_index_for_column(dataset, column)
        # Case-insensitive comparison: an unchanged name is a no-op and the
        # original dataset identifier is returned
        if dataset.columns[col_idx].name.lower() == name.lower():
            return 0, identifier
        # Replace the column with a renamed copy and persist the result to
        # obtain a dataset with a new identifier
        columns = list(dataset.columns)
        columns[col_idx] = DatasetColumn(columns[col_idx].identifier, name)
        ds = self.datastore.create_dataset(
            columns=columns,
            rows=dataset.fetch_rows(),
            column_counter=dataset.column_counter,
            row_counter=dataset.row_counter,
            annotations=dataset.annotations)
        return 1, ds.identifier
Example #14
0
    def create_dataset(self, name, dataset):
        """Create a new dataset with given name.

        Raises ValueError if a dataset with given name already exist.

        Parameters
        ----------
        name : string
            Unique dataset name
        dataset : vizier.datastore.base.Dataset
            Dataset object

        Returns
        -------
        vizier.datastore.client.DatasetClient
        """
        # Fail if the name is already taken or is not a valid dataset name
        if self.has_dataset_identifier(name):
            raise ValueError('dataset \'' + name + '\' already exists')
        if not is_valid_name(name):
            raise ValueError('invalid dataset name \'' + name + '\'')
        columns = dataset.columns
        rows = dataset.rows
        # Counters start one past the largest existing identifier (at least 0)
        column_counter = max(max_column_id(columns) + 1, 0)
        row_counter = max(max_row_id(rows) + 1, 0)
        # Assign fresh identifiers to any column without a positive one
        for col in columns:
            if col.identifier < 0:
                col.identifier = column_counter
                column_counter += 1
        # Do the same for rows
        for row in rows:
            if row.identifier < 0:
                row.identifier = row_counter
                row_counter += 1
        # Persist the dataset and register it in this context
        ds = self.datastore.create_dataset(columns=columns,
                                           rows=rows,
                                           column_counter=column_counter,
                                           row_counter=row_counter,
                                           annotations=dataset.annotations)
        self.set_dataset_identifier(name, ds.identifier)
        return DatasetClient(dataset=ds)
Example #15
0
    def insert_column(self, identifier: str, position: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Insert column with given name at given position in dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or
        if the column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Make sure that position is a valid column index in the new dataset
        if position < 0 or position > len(dataset.columns):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # Delegate the actual insertion to the Mimir backend. The command only
        # needs the column name and position; the modified schema is computed
        # by the backend, so no local schema copy is built here.
        command = {"id": "insertColumn", "name": name, "position": position}
        response = mimir.vizualScript(dataset.identifier, command)
        return VizualApiResult.from_mimir(response)
Example #16
0
    def compute_rename_dataset(self, args, context):
        """Execute rename dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Read the source dataset name and the target name from the user
        # arguments. Dataset names are handled case-insensitively.
        ds_name = args.get_value(pckg.PARA_DATASET).lower()
        new_name = args.get_value(pckg.PARA_NAME).lower()
        # Abort if the target name is already taken or is not a valid
        # dataset name.
        if new_name in context.datasets:
            raise ValueError('dataset \'' + new_name + '\' exists')
        if not is_valid_name(new_name):
            raise ValueError('invalid dataset name \'' + new_name + '\'')
        # Fetch the dataset. This raises an exception if it does not exist.
        ds = context.get_dataset(ds_name)
        # Re-map the dataset under its new name in the database state.
        datasets = dict(context.datasets)
        del datasets[ds_name]
        datasets[new_name] = ds
        # The provenance records the dataset as written under the new name
        # and deleted under the old one.
        descriptor = DatasetDescriptor(identifier=ds.identifier,
                                       columns=ds.columns,
                                       row_count=ds.row_count)
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput('1 dataset renamed')]),
            provenance=ModuleProvenance(read=dict(),
                                        write={new_name: descriptor},
                                        delete=[ds_name]))
Example #17
0
    def rename_column(self, identifier: str, column_id: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Rename column in a given dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column is unknown, or if the given column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Validate the new column name before doing any lookups.
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Resolve the dataset; unknown identifiers are an error.
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Translate the column identifier into its schema position and let
        # Mimir perform the rename via a vizual script command.
        command = {
            "id": "renameColumn",
            "column": get_index_for_column(dataset, column_id),
            "name": name
        }
        return VizualApiResult.from_mimir(
            mimir.vizualScript(dataset.identifier, command))
Example #18
0
    def rename_dataset(self, name, new_name):
        """Rename an existing dataset.

        Raises ValueError if dataset with name does not exist, if dataset
        with new_name already exists, or if new_name is not a valid dataset
        name.

        Parameters
        ----------
        name : string
            Unique dataset name
        new_name : string
            New dataset name
        """
        # The target name must be free and well-formed.
        if self.has_dataset_identifier(new_name):
            raise ValueError('dataset \'' + new_name + '\' exists')
        if not is_valid_name(new_name):
            raise ValueError('invalid dataset name \'' + new_name + '\'')
        # Look up the identifier (raises if no dataset with the given name
        # exists) and re-register it under the new name.
        ds_id = self.get_dataset_identifier(name)
        self.remove_dataset_identifier(name)
        self.set_dataset_identifier(new_name, ds_id)
Example #19
0
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the command identifier is unknown or any command argument is
            invalid.
        """
        outputs = ModuleOutputs()
        store_as_dataset = None
        update_rows = False
        lens_annotations = []
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.table_name
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        # Build the lens-specific parameter list for the Mimir call.
        if command_id == cmd.MIMIR_DOMAIN:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
        elif command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            params = ['GEOCODER(' + geocoder + ')']
            add_column_parameter(params, 'HOUSE_NUMBER', dataset, arguments,
                                 cmd.PARA_HOUSE_NUMBER)
            add_column_parameter(params, 'STREET', dataset, arguments,
                                 cmd.PARA_STREET)
            add_column_parameter(params, 'CITY', dataset, arguments,
                                 cmd.PARA_CITY)
            add_column_parameter(params, 'STATE', dataset, arguments,
                                 cmd.PARA_STATE)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter,
                                   name_in_dataset=cname_lat,
                                   data_type=DATATYPE_REAL))
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter + 1,
                                   name_in_dataset=cname_lon,
                                   data_type=DATATYPE_REAL))
            params.append('RESULT_COLUMNS(' + cname_lat + ',' + cname_lon +
                          ')')
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
            mimir_lens_response = mimir.createLens(
                dataset.table_name, params, command_id,
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True))
            (mimir_table_name,
             lens_annotations) = (mimir_lens_response.lensName(),
                                  mimir_lens_response.annotations())
            params = [ROW_ID, 'MISSING_ONLY(FALSE)']
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(cmd.PARA_COLUMNS_CONSTRAINT,
                                               raise_error=False)
                if col_constraint == '':
                    col_constraint = None
                if not col_constraint is None:
                    param = param + ' ' + str(col_constraint).replace(
                        "'", "\'\'").replace("OR", ") OR (")
                param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            pick_from = list()
            column_names = list()
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                pick_from.append(column.name_in_rdb)
                column_names.append(column.name.upper().replace(' ', '_'))
            # Add result column to dataset schema
            pick_as = arguments.get_value(cmd.PARA_PICKAS,
                                          default_value='PICK_ONE_' +
                                          '_'.join(column_names))
            pick_as = dataset.get_unique_name(pick_as.strip().upper())
            dataset.columns.append(
                MimirDatasetColumn(identifier=dataset.max_column_id() + 1,
                                   name_in_dataset=pick_as))
            params = ['PICK_FROM(' + ','.join(pick_from) + ')']
            params.append('PICK_AS(' + pick_as + ')')
        elif command_id == cmd.MIMIR_SCHEMA_MATCHING:
            store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
            if store_as_dataset in context.datasets:
                raise ValueError('dataset \'' + store_as_dataset + '\' exists')
            if not is_valid_name(store_as_dataset):
                raise ValueError('invalid dataset name \'' + store_as_dataset +
                                 '\'')
            column_names = list()
            params = ['\'' + ROW_ID + ' int\'']
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_name = col.get_value(pckg.PARA_COLUMN)
                c_type = col.get_value(cmd.PARA_TYPE)
                params.append('\'' + c_name + ' ' + c_type + '\'')
                column_names.append(c_name)
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if not dseModel is None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            params = []
            for comment in arguments.get_value(cmd.PARA_COMMENTS):
                c_expr = comment.get_value(cmd.PARA_EXPRESSION)
                c_cmnt = comment.get_value(cmd.PARA_COMMENT)
                c_rowid = comment.get_value(cmd.PARA_ROWID)
                if c_rowid is None:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\') ')
                else:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\', \'' + c_rowid + '\') ')
            result_cols = []
            for col in arguments.get_value(cmd.PARA_RESULT_COLUMNS):
                c_name = col.get_value(pckg.PARA_COLUMN)
                result_cols.append(c_name)
            if len(result_cols) > 0:
                params.append('RESULT_COLUMNS(' + ','.join(result_cols) + ')')
        else:
            # Fix: this message previously interpolated an undefined name
            # 'lens', which raised a NameError instead of the intended
            # ValueError for unknown command identifiers.
            raise ValueError('unknown Mimir lens \'' + str(command_id) + '\'')
        # Create Mimir lens
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            lens_name = mimir.createAdaptiveSchema(mimir_table_name, params,
                                                   command_id.upper())
        else:
            mimir_lens_response = mimir.createLens(
                mimir_table_name,
                params,
                command_id.upper(),
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True),
                human_readable_name=ds_name.upper())
            (lens_name,
             lens_annotations) = (mimir_lens_response['lensName'],
                                  mimir_lens_response['annotations'])
        # Create a view including missing row ids for the result of a
        # MISSING KEY lens
        if command_id == cmd.MIMIR_MISSING_KEY:
            lens_name, row_counter = create_missing_key_view(
                dataset, lens_name, column)
            dataset.row_counter = row_counter
        # Create datastore entry for lens.
        if not store_as_dataset is None:
            columns = list()
            for c_name in column_names:
                col_id = len(columns)
                columns.append(
                    MimirDatasetColumn(identifier=col_id,
                                       name_in_dataset=c_name))
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=columns,
                annotations=dataset.annotations)
            ds_name = store_as_dataset
        else:
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=dataset.columns,
                annotations=dataset.annotations)
        # Add dataset schema and returned annotations to output
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            outputs.stdout.append(DatasetOutput(ds_output))

        print_lens_annotations(outputs, lens_annotations)
        dsd = DatasetDescriptor(identifier=ds.identifier,
                                columns=ds.columns,
                                row_count=ds.row_count)
        result_resources = dict()
        result_resources[base.RESOURCE_DATASET] = ds.identifier

        # Return task result
        return ExecResult(outputs=outputs,
                          provenance=ModuleProvenance(
                              read={input_ds_name: dataset.identifier},
                              write={ds_name: dsd},
                              resources=result_resources))
Example #20
0
    def compute_unload_dataset(self, args, context):
        """Execute unload dataset command.

        Exports an existing dataset via the vizual API and returns HTML
        download links for the generated file(s).

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the dataset name. Raise exception if the name is not a valid
        # dataset name. (Note: unlike load, no already-exists check is needed
        # here since the command reads an existing dataset.)
        ds_name = args.get_value(pckg.PARA_DATASET).lower()

        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        # Get the target format and the optional list of key/value unload
        # options provided by the user.
        unload_format = args.get_value(cmd.PARA_UNLOAD_FORMAT)
        options = args.get_value(cmd.PARA_UNLOAD_OPTIONS, raise_error=False)
        m_opts = []

        if not options is None:
            for option in options:
                unload_opt_key = option.get_value(cmd.PARA_UNLOAD_OPTION_KEY)
                unload_opt_val = option.get_value(cmd.PARA_UNLOAD_OPTION_VALUE)
                m_opts.append({
                    'name': unload_opt_key,
                    'value': unload_opt_val
                })
        # Execute unload command. Raises an exception if the dataset does not
        # exist.
        dataset = context.get_dataset(ds_name)
        result = self.api.unload_dataset(dataset=dataset,
                                         datastore=context.datastore,
                                         filestore=context.filestore,
                                         unload_format=unload_format,
                                         options=m_opts,
                                         resources=context.resources)
        # Delete the uploaded file (if load was from file). A reference to the
        # created dataset is in the resources and will be used if the module is
        # re-executed.
        #file_id = result.resources[apibase.RESOURCE_FILEID]
        #if not file_id is None:
        #    context.filestore.delete_file(file_id)
        # Create result object: one download link per exported file.
        outputhtml = HtmlOutput(''.join([
            "<div><a href=\"" + config.webservice.app_path + "/projects/" +
            str(context.project_id) + "/files/" + out_file.identifier +
            "\" download=\"" + out_file.name + "\">Download " + out_file.name +
            "</a></div>"
            for out_file in result.resources[apibase.RESOURCE_FILEID]
        ]))
        return ExecResult(outputs=ModuleOutputs(stdout=[outputhtml]),
                          provenance=ModuleProvenance(read={
                              ds_name:
                              context.datasets.get(ds_name.lower(), None)
                          },
                                                      write=dict()))
Example #21
0
    def rename_column(self, identifier, column_id, name, datastore):
        """Rename column in a given dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column is unknown, or if the given column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + str(name) + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Determine the schema position of the column to rename.
        colIndex = get_index_for_column(dataset, column_id)
        # No need to do anything if the name hasn't changed
        if dataset.columns[colIndex].name.lower() == name.lower():
            return VizualApiResult(dataset)
        sql = 'SELECT * FROM ' + dataset.table_name
        mimirSchema = mimir.getSchema(sql)
        # Build the new dataset schema and, in parallel, the SELECT list for
        # the view that applies the rename.
        columns = list()
        select_exprs = list()
        for idx, col_info in enumerate(mimirSchema):
            name_in_dataset = sanitize_column_name(col_info['name'].upper())
            name_in_rdb = sanitize_column_name(col_info['name'].upper())
            column = MimirDatasetColumn(identifier=len(columns),
                                        name_in_dataset=name_in_dataset,
                                        name_in_rdb=name_in_rdb)
            if idx == colIndex:
                # Fix: the previous implementation tested 'idx == 0' before
                # 'idx == colIndex', so renaming the first column of a
                # dataset was silently skipped.
                column.name = name
                column.name_in_rdb = name
                select_exprs.append(name_in_dataset + ' AS ' + name)
            else:
                select_exprs.append(name_in_dataset + ' AS ' + name_in_rdb)
            columns.append(column)
        # Create view for loaded dataset
        sql = 'SELECT ' + ', '.join(select_exprs) + ' FROM {{input}};'
        view_name, dependencies = mimir.createView(dataset.table_name, sql)
        # There are no changes to the underlying database. We only need to
        # change the column information in the dataset schema.
        # Store updated dataset to get new identifier
        ds = datastore.register_dataset(table_name=view_name,
                                        columns=columns,
                                        row_counter=dataset.row_counter,
                                        annotations=dataset.annotations)
        return VizualApiResult(ds)
Example #22
0
    def compute_load_dataset(self, args, context):
        """Execute load dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult

        Raises
        ------
        ValueError
            If the dataset name is taken or invalid, or if the source
            descriptor contains neither a file identifier nor a URL.
        """
        # Get the new dataset name. Raise exception if a dataset with the
        # specified name already exists or the name is invalid.
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        # Get components of the load source. Raise exception if the source
        # descriptor is invalid.
        source_desc = args.get_value(cmd.PARA_FILE)
        file_id = None
        url = None
        if pckg.FILE_ID in source_desc and source_desc[
                pckg.FILE_ID] is not None:
            file_id = source_desc[pckg.FILE_ID]
        elif pckg.FILE_URL in source_desc and source_desc[
                pckg.FILE_URL] is not None:
            url = source_desc[pckg.FILE_URL]
        else:
            raise ValueError('invalid source descriptor')
        # Optional credentials and reload flag for URL sources.
        username = source_desc[
            pckg.FILE_USERNAME] if pckg.FILE_USERNAME in source_desc else None
        password = source_desc[
            pckg.FILE_PASSWORD] if pckg.FILE_PASSWORD in source_desc else None
        reload = source_desc[
            pckg.FILE_RELOAD] if pckg.FILE_RELOAD in source_desc else False
        load_format = args.get_value(cmd.PARA_LOAD_FORMAT)
        detect_headers = args.get_value(cmd.PARA_DETECT_HEADERS,
                                        raise_error=False,
                                        default_value=True)
        infer_types = args.get_value(cmd.PARA_INFER_TYPES,
                                     raise_error=False,
                                     default_value=True)
        options = args.get_value(cmd.PARA_LOAD_OPTIONS, raise_error=False)
        m_opts = []
        # Fix: removed a leftover debug print of the PARA_LOAD_DSE value.
        if args.get_value(cmd.PARA_LOAD_DSE,
                          raise_error=False,
                          default_value=False):
            m_opts.append({'name': 'datasourceErrors', 'value': 'true'})
        if not options is None:
            for option in options:
                load_opt_key = option.get_value(cmd.PARA_LOAD_OPTION_KEY)
                load_opt_val = option.get_value(cmd.PARA_LOAD_OPTION_VALUE)
                m_opts.append({'name': load_opt_key, 'value': load_opt_val})
        # Execute load command.
        result = self.api.load_dataset(datastore=context.datastore,
                                       filestore=context.filestore,
                                       file_id=file_id,
                                       url=url,
                                       detect_headers=detect_headers,
                                       infer_types=infer_types,
                                       load_format=load_format,
                                       options=m_opts,
                                       username=username,
                                       password=password,
                                       resources=context.resources,
                                       reload=reload,
                                       human_readable_name=ds_name.upper())
        # Delete the uploaded file (if load was from file). A reference to the
        # created dataset is in the resources and will be used if the module is
        # re-executed.
        #if not file_id is None:
        #    context.filestore.delete_file(file_id)
        ds = DatasetDescriptor(identifier=result.dataset.identifier,
                               columns=result.dataset.columns,
                               row_count=result.dataset.row_count)
        # Fetch a preview of the loaded dataset for the module output.
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        ds_output['name'] = ds_name
        return ExecResult(
            outputs=ModuleOutputs(stdout=[DatasetOutput(ds_output)]),
            provenance=ModuleProvenance(
                read=dict(
                ),  # need to explicitly declare a lack of dependencies
                write={ds_name: ds},
                resources=result.resources))