def from_mimir(
        response: Dict[str, Any],
        name: Optional[str] = None
    ) -> "VizualApiResult":
    ds = MimirDatasetHandle.from_mimir_result(
        table_name=response["name"],
        schema=response["schema"],
        properties=response["properties"],
        name=name
    )
    return VizualApiResult(ds)
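# Usage sketch (hypothetical payload; a real `response` dict comes back from a
# Mimir call such as mimir.materialize, with "name", "schema", and
# "properties" keys as read above):
#
#   response = {"name": "VIEW_abc123", "schema": [...], "properties": {}}
#   result = from_mimir(response, name="my_dataset")
#   print(result.dataset.identifier)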
def create_dataset(
        self,
        columns: List[DatasetColumn],
        rows: List[DatasetRow],
        properties: Optional[Dict[str, Any]] = None,
        human_readable_name: str = "Untitled Dataset",
        backend_options: Optional[List[Tuple[str, str]]] = None,
        dependencies: Optional[List[str]] = None) -> MimirDatasetHandle:
    """Create a new dataset in the datastore. Expects at least the list of
    columns and the rows for the dataset.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    properties: dict(string, any), optional
        Annotations for dataset components
    human_readable_name: string, optional
        Display name for the new dataset
    backend_options: list(tuple(string, string)), optional
        Additional options for the backend load command
    dependencies: list(string), optional
        Identifiers of datasets that the new dataset depends on

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    # Normalize optional arguments
    properties = {} if properties is None else properties
    backend_options = [] if backend_options is None else backend_options
    dependencies = [] if dependencies is None else dependencies
    # Get unique identifier for the new dataset
    identifier = 'DS_' + get_unique_identifier()
    columns = [
        col if isinstance(col, MimirDatasetColumn) else MimirDatasetColumn(
            identifier=col.identifier,
            name_in_dataset=col.name,
            data_type=col.data_type
        )
        for col in columns
    ]
    table_name, schema = mimir.loadDataInline(
        schema=[{
            "name": base.sanitize_column_name(col.name),
            "type": col.data_type
        } for col in columns],
        rows=[row.values for row in rows],
        result_name=identifier,
        human_readable_name=human_readable_name,
        dependencies=dependencies,
        properties=properties
    )
    # Insert the new dataset metadata information into the datastore
    return MimirDatasetHandle.from_mimir_result(
        table_name=table_name,
        schema=schema,
        properties=properties,
        name=human_readable_name
    )
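# Usage sketch (hypothetical names and values; assumes `store` is a
# Mimir-backed datastore and that DatasetColumn/DatasetRow accept the keyword
# arguments shown, matching the attributes accessed above):
#
#   from vizier.datastore.dataset import DatasetColumn, DatasetRow
#   handle = store.create_dataset(
#       columns=[
#           DatasetColumn(identifier=0, name='city', data_type='varchar'),
#           DatasetColumn(identifier=1, name='population', data_type='int'),
#       ],
#       rows=[DatasetRow(identifier='0', values=['Buffalo', 278349])],
#       human_readable_name='cities',
#   )
#   print([col.name for col in handle.columns])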
def sort_dataset(self,
        identifier: str,
        columns: List[int],
        reversed: List[bool],
        datastore: Datastore) -> VizualApiResult:
    """Sort the dataset with the given identifier according to the order by
    statement. The order by statement is a pair of lists. The first list
    contains the identifiers of the columns to sort on. The second list
    contains boolean flags, one for each entry in columns, indicating
    whether the sort order for the corresponding column is reversed.

    Returns the number of rows in the dataset and the identifier of the
    sorted dataset.

    Raises ValueError if no dataset with the given identifier exists or if
    any of the columns in the order by clause are unknown.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    columns: list(int)
        List of column identifiers for the sort columns.
    reversed: list(bool)
        Flags indicating whether the sort order of the corresponding column
        is reversed.
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    # Get dataset. Raise exception if dataset is unknown.
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    assert isinstance(dataset, MimirDatasetHandle)
    # Create ORDER BY clause based on columns and reversed flags
    order_by_clause = list()
    for i in range(len(columns)):
        col_id = columns[i]
        stmt = cast(MimirDatasetColumn, dataset.column_by_id(col_id)).name_in_rdb
        if reversed[i]:
            stmt += ' DESC'
        order_by_clause.append(stmt)
    sql = 'SELECT * FROM ' + dataset.identifier + ' ORDER BY '
    sql += ','.join(order_by_clause)
    view_name, dependencies, schema, properties, functionDeps = mimir.createView(
        datasets={dataset.identifier: dataset.identifier},
        query=sql
    )
    ds = MimirDatasetHandle.from_mimir_result(view_name, schema, properties)
    return VizualApiResult(ds)
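# Usage sketch (hypothetical identifiers and column names): sort by column 1
# ascending, then column 3 descending.
#
#   result = api.sort_dataset(
#       identifier='DS_abc123',
#       columns=[1, 3],
#       reversed=[False, True],
#       datastore=datastore,
#   )
#
# With columns named CITY and POP, the generated SQL would look like:
#
#   SELECT * FROM DS_abc123 ORDER BY CITY,POP DESC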
def materialize_dataset(self, identifier: str, datastore: Datastore) -> VizualApiResult:
    """Create a materialized snapshot of the dataset for faster execution."""
    input_dataset = datastore.get_dataset(identifier)
    if input_dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    input_dataset = cast(MimirDatasetHandle, input_dataset)
    response = mimir.materialize(input_dataset.identifier)
    output_ds = MimirDatasetHandle(
        identifier=response["name"],
        columns=cast(List[MimirDatasetColumn], input_dataset.columns),
        properties=input_dataset.get_properties(),
        name=input_dataset.name if input_dataset.name is not None else "untitled dataset"
    )
    return VizualApiResult(output_ds)
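# Usage sketch (hypothetical identifier): snapshot a dataset so downstream
# cells read a materialized table instead of re-evaluating a chain of views.
#
#   result = api.materialize_dataset('DS_abc123', datastore)
#   snapshot = result.dataset  # MimirDatasetHandle over the materialized table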
def get_dataset(self, identifier):
    """Read a full dataset from the data store. Returns None if no dataset
    with the given identifier exists.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    # Return None if the dataset file does not exist
    dataset_file = self.get_dataset_file(identifier)
    if not os.path.isfile(dataset_file):
        return None
    annotations = DatasetMetadata.from_file(
        self.get_metadata_filename(identifier)
    )
    return MimirDatasetHandle.from_file(
        dataset_file,
        annotations=annotations
    )
def get_dataset(self,
        identifier: str,
        force_profiler: Optional[bool] = None,
        name: Optional[str] = None) -> MimirDatasetHandle:
    """Read a full dataset from the data store.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier
    force_profiler: bool, optional
        Re-run the data profiler on the table if True
    name: string, optional
        Human readable name for the returned dataset handle

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    schema, properties = mimir.getTableInfo(identifier, force_profiler=force_profiler)
    return MimirDatasetHandle.from_mimir_result(identifier, schema, properties, name)
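# Usage sketch (hypothetical table name): fetch a dataset handle and force the
# profiler to refresh its property annotations while doing so.
#
#   ds = datastore.get_dataset('LOADED_abc123', force_profiler=True, name='sales')
#   print(ds.get_properties())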
def compute(self,
        command_id: str,
        arguments: "ModuleArguments",
        context: TaskContext) -> ExecResult:
    """Compute results for commands in the sampling package using the set of
    user-provided arguments and the current database state.

    Parameters
    ----------
    command_id: string
        Unique identifier for a command in a package declaration
    arguments: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
    input_dataset: DatasetDescriptor = context.get_dataset(input_ds_name)
    if input_dataset is None:
        raise ValueError('unknown dataset \'' + input_ds_name + '\'')
    output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
    if output_ds_name is None or output_ds_name == "":
        output_ds_name = input_ds_name + "_SAMPLE"
    output_ds_name = output_ds_name.lower()
    # Load the sampling configuration
    sample_mode = None
    if command_id == cmd.BASIC_SAMPLE:
        sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
        if sampling_rate > 1.0 or sampling_rate < 0.0:
            raise Exception("Sampling rate must be between 0.0 and 1.0")
        sample_mode = {
            "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
            "probability": sampling_rate
        }
    elif command_id in (cmd.MANUAL_STRATIFIED_SAMPLE, cmd.AUTOMATIC_STRATIFIED_SAMPLE):
        column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
        column_defn = input_dataset.columns[column]
        if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
            strata = [
                {
                    "value": stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability": stratum.get_value(cmd.PARA_SAMPLING_RATE)
                }
                for stratum in arguments.get_value(cmd.PARA_STRATA)
            ]
        else:
            probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
            strata = self.get_automatic_strata(input_dataset, column_defn, probability)
        sample_mode = {
            "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
            "column": column_defn.name,
            "type": column_defn.data_type,
            "strata": strata
        }
    else:
        raise Exception("Unknown sampling command: {}".format(command_id))
    table_name, schema = mimir.createSample(
        input_dataset.identifier,
        sample_mode,
        result_name="SAMPLE_" + get_unique_identifier()
    )
    ds = MimirDatasetHandle.from_mimir_result(
        table_name, schema, properties={}, name=output_ds_name
    )
    # And start rendering some output
    outputs = ModuleOutputs()
    ds_output = server.api.datasets.get_dataset(
        project_id=context.project_id,
        dataset_id=ds.identifier,
        offset=0,
        limit=10
    )
    if ds_output is not None:
        ds_output['name'] = output_ds_name
        outputs.stdout.append(DatasetOutput(ds_output))
    else:
        outputs.stderr.append(TextOutput("Error displaying dataset"))
    # Record reads and writes
    provenance = ModuleProvenance(
        read={input_ds_name: input_dataset.identifier},
        write={
            output_ds_name: DatasetDescriptor(
                identifier=ds.identifier,
                name=output_ds_name,
                columns=ds.columns
            )
        }
    )
    # Return task result
    return ExecResult(outputs=outputs, provenance=provenance)
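# For reference, the two `sample_mode` shapes this method can send to
# mimir.createSample (column name and probabilities hypothetical):
#
#   {"mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY, "probability": 0.1}
#
#   {"mode": cmd.SAMPLING_MODE_STRATIFIED_ON, "column": "STATE",
#    "type": "varchar",
#    "strata": [{"value": "NY", "probability": 0.5},
#               {"value": "CA", "probability": 0.1}]}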
def compute(self, command_id, arguments, context):
    """Compute results for commands in the Mimir package using the set of
    user-provided arguments and the current database state.

    Parameters
    ----------
    command_id: string
        Unique identifier for a command in a package declaration
    arguments: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    outputs = ModuleOutputs()
    # Get dataset. Raise exception if dataset is unknown.
    ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
    dataset = context.get_dataset(ds_name)
    mimir_table_name = dataset.identifier
    # Keep track of the name of the input dataset for the provenance
    # information.
    input_ds_name = ds_name
    if command_id == cmd.MIMIR_GEOCODE:
        geocoder = arguments.get_value(cmd.PARA_GEOCODER)
        # Add columns for LATITUDE and LONGITUDE
        column_counter = dataset.max_column_id() + 1
        cname_lat = dataset.get_unique_name('LATITUDE')
        cname_lon = dataset.get_unique_name('LONGITUDE')
        dataset.columns.append(
            MimirDatasetColumn(
                identifier=column_counter,
                name_in_dataset=cname_lat,
                data_type=DATATYPE_REAL
            )
        )
        dataset.columns.append(
            MimirDatasetColumn(
                identifier=column_counter + 1,
                name_in_dataset=cname_lon,
                data_type=DATATYPE_REAL
            )
        )
        house = arguments.get_value(cmd.PARA_HOUSE_NUMBER, raise_error=False, default_value=None)
        street = arguments.get_value(cmd.PARA_STREET, raise_error=False, default_value=None)
        city = arguments.get_value(cmd.PARA_CITY, raise_error=False, default_value=None)
        state = arguments.get_value(cmd.PARA_STATE, raise_error=False, default_value=None)
        params = {
            'houseColumn': dataset.column_by_id(house).name_in_rdb
                if house is not None and house != '' else None,
            'streetColumn': dataset.column_by_id(street).name_in_rdb
                if street is not None and street != '' else None,
            'cityColumn': dataset.column_by_id(city).name_in_rdb
                if city is not None and city != '' else None,
            'stateColumn': dataset.column_by_id(state).name_in_rdb
                if state is not None and state != '' else None,
            'geocoder': geocoder
            # 'latitudeColumn': Option[String],
            # 'longitudeColumn': Option[String],
            # 'cacheCode': Option[String]
        }
    elif command_id == cmd.MIMIR_KEY_REPAIR:
        column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
        params = {"key": column.name_in_rdb}
    elif command_id == cmd.MIMIR_MISSING_KEY:
        column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
        params = column.name_in_rdb
        # Set MISSING ONLY to FALSE to ensure that all rows are returned
        # params += ['MISSING_ONLY(FALSE)']
        # Need to run this lens twice in order to generate row ids for
        # any potential new tuple
    elif command_id == cmd.MIMIR_MISSING_VALUE:
        params = list()
        for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
            f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
            param = f_col.name_in_rdb
            col_constraint = col.get_value(
                cmd.PARA_COLUMNS_CONSTRAINT,
                raise_error=False
            )
            if col_constraint == '':
                col_constraint = None
            # if col_constraint is not None:
            #     param = param + ' ' + str(col_constraint).replace("'", "\'\'").replace("OR", ") OR (")
            # param = '\'(' + param + ')\''
            params.append(param)
    elif command_id == cmd.MIMIR_PICKER:
        # Compute the input columns
        inputs = []
        for col in arguments.get_value(cmd.PARA_SCHEMA):
            c_col = col.get_value(cmd.PARA_PICKFROM)
            column = dataset.column_by_id(c_col)
            inputs.append(column.name_in_rdb)
        # Compute the output column
        output = arguments.get_value(cmd.PARA_PICKAS, default_value=inputs[0])
        if output == "":
            output = inputs[0]
        else:
            output = dataset.get_unique_name(output.strip().upper())
        # Compute the final parameter list
        params = {
            "inputs": inputs,
            "output": output
        }
    elif command_id == cmd.MIMIR_TYPE_INFERENCE:
        params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
    elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
        dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
        params = []
        if dseModel is not None:
            params = [str(dseModel)]
    elif command_id == cmd.MIMIR_COMMENT:
        commentsParams = []
        for idx, comment in enumerate(arguments.get_value(cmd.PARA_COMMENTS)):
            commentParam = {}
            # If target is defined, it is the column that we're trying to
            # annotate. If unset (or empty), it means we're annotating the row.
            column_id = comment.get_value(cmd.PARA_EXPRESSION, None)
            if column_id is not None:
                column = dataset.column_by_id(column_id)
                commentParam['target'] = column.name_in_rdb
            # The comment
            commentParam['comment'] = comment.get_value(cmd.PARA_COMMENT)
            # If rowid is defined, it is the row that we're trying to annotate.
            # If unset (or empty), it means that we're annotating all rows.
            rowid = comment.get_value(cmd.PARA_ROWID, None)
            if (rowid is not None) and (rowid != ""):
                # If rowid begins with '=', it's a formula
                if rowid[0] == '=':
                    commentParam['condition'] = rowid[1:]
                else:
                    commentParam['rows'] = [int(rowid)]
            # TODO: handle result columns
            commentsParams.append(commentParam)
        params = {'comments': commentsParams}
    elif command_id == cmd.MIMIR_PIVOT:
        column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
        params = {
            "target": column.name_in_rdb,
            "keys": [],
            "values": []
        }
        for col_arg in arguments.get_value(cmd.PARA_VALUES):
            col = dataset.column_by_id(col_arg.get_value(cmd.PARA_VALUE))
            params["values"].append(col.name_in_rdb)
        for col_arg in arguments.get_value(cmd.PARA_KEYS, default_value=[]):
            col = dataset.column_by_id(col_arg.get_value(cmd.PARA_KEY))
            params["keys"].append(col.name_in_rdb)
        if len(params["values"]) < 1:
            raise ValueError("Need at least one value column")
        # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
    elif command_id == cmd.MIMIR_SHRED:
        params = {
            "keepOriginalColumns": arguments.get_value(cmd.PARA_KEEP_ORIGINAL)
        }
        shreds = []
        global_input_col = dataset.column_by_id(arguments.get_value(cmd.PARA_COLUMN_NAME))
        for (idx, shred) in enumerate(arguments.get_value(cmd.PARA_COLUMNS)):
            output_col = shred.get_value(cmd.PARA_OUTPUT_COLUMN)
            if output_col is None:
                output_col = "{}_{}".format(global_input_col, idx)
            config = {}
            shred_type = shred.get_value(cmd.PARA_TYPE)
            expression = shred.get_value(cmd.PARA_EXPRESSION)
            group = shred.get_value(cmd.PARA_INDEX)
            if shred_type == "pattern":
                config["regexp"] = expression
                config["group"] = int(group)
            elif shred_type == "field":
                config["separator"] = expression
                config["field"] = int(group)
            elif shred_type == "explode":
                config["separator"] = expression
            elif shred_type == "pass":
                pass
            elif shred_type == "substring":
                range_parts = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
                # Mimir expects ranges to be given from start (inclusive) to
                # end (exclusive) in a zero-based numbering scheme. Vizier
                # expects input ranges to be given in a one-based numbering
                # scheme. Convert to Mimir's format.
                if range_parts is None:
                    raise ValueError("Substring requires a range of the form '10', '10-11', or '10+1', but got '{}'".format(expression))
                config["start"] = int(range_parts.group(1)) - 1  # convert 1-based to 0-based
                if range_parts.group(2) is None:
                    config["end"] = config["start"] + 1  # only one character: split one character
                elif range_parts.group(3) == "+":
                    config["end"] = config["start"] + int(range_parts.group(4))  # start + length
                elif range_parts.group(3) == "-":
                    config["end"] = int(range_parts.group(4))  # explicit end; 1-based -> 0-based and exclusive cancel out
                else:
                    raise ValueError("Invalid expression '{}' in substring shredder".format(expression))
            else:
                raise ValueError("Invalid Shredding Type '{}'".format(shred_type))
            shreds.append({
                **config,
                "op": shred_type,
                "input": global_input_col.name_in_rdb,
                "output": output_col,
            })
        params["shreds"] = shreds
        # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
    else:
        raise ValueError("Unknown Mimir lens '{}'".format(command_id))
    # Create Mimir lens
    mimir_lens_response = mimir.createLens(
        mimir_table_name,
        params,
        command_id,
        arguments.get_value(cmd.PARA_MATERIALIZE_INPUT, default_value=True),
        human_readable_name=ds_name.upper()
    )
    lens_name = mimir_lens_response['name']
    lens_schema = mimir_lens_response['schema']
    lens_properties = mimir_lens_response['properties']
    ds = MimirDatasetHandle.from_mimir_result(lens_name, lens_schema, lens_properties, ds_name)
    if command_id in LENSES_THAT_SHOULD_NOT_DISPLAY_TABLES:
        print_dataset_schema(outputs, ds_name, ds.columns)
    else:
        from vizier.api.webservice import server
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10
        )
        outputs.stdout.append(DatasetOutput(ds_output))
    # Return task result
    return ExecResult(
        outputs=outputs,
        provenance=ModuleProvenance(
            read={input_ds_name: dataset.identifier},
            write={ds_name: DatasetDescriptor(
                identifier=ds.identifier,
                name=ds_name,
                columns=ds.columns
            )}
        )
    )
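# For reference, example `params` payloads built by the branches above before
# being passed to mimir.createLens (column names and values hypothetical):
#
#   MIMIR_KEY_REPAIR: {"key": "EMPLOYEE_ID"}
#   MIMIR_PICKER:     {"inputs": ["PHONE_1", "PHONE_2"], "output": "PHONE"}
#   MIMIR_SHRED:      {"keepOriginalColumns": False,
#                      "shreds": [{"op": "substring", "start": 0, "end": 3,
#                                  "input": "ZIPCODE", "output": "ZIP_PREFIX"}]}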
def execute_query(self, args: ModuleArguments, context: TaskContext) -> ExecResult:
    """Execute a SQL query in the given context.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    # Get the SQL source code that is in this cell and the global variables
    source = args.get_value(cmd.PARA_SQL_SOURCE)
    ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
    # Get mapping of datasets in the context to their respective table name
    # in the Mimir backend
    mimir_table_names = dict()
    for ds_name_o in context.datasets:
        dataset_id = context.datasets[ds_name_o].identifier
        dataset = context.datastore.get_dataset(dataset_id)
        if dataset is None:
            raise ValueError('unknown dataset \'' + ds_name_o + '\'')
        mimir_table_names[ds_name_o] = dataset.identifier
    # Module outputs
    outputs = ModuleOutputs()
    is_success = True
    functions = {
        name: context.dataobjects[name].identifier
        for name in context.dataobjects
        if context.dataobjects[name].obj_type == ARTIFACT_TYPE_PYTHON
    }
    try:
        # Create the view from the SQL source
        view_name, dependencies, mimirSchema, properties, functionDeps = mimir.createView(
            datasets=mimir_table_names,
            query=source,
            functions=dict(functions)
        )
        ds = MimirDatasetHandle.from_mimir_result(view_name, mimirSchema, properties, ds_name)
        if ds_name is None or ds_name == '':
            ds_name = "TEMPORARY_RESULT"
        from vizier.api.webservice import server
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10
        )
        if ds_output is None:
            outputs.stderr.append(
                TextOutput("Error displaying dataset {}".format(ds_name))
            )
        else:
            ds_output['name'] = ds_name
            outputs.stdout.append(DatasetOutput(ds_output))
        dependenciesDict: Dict[str, str] = {
            dep_name.lower(): get_artifact_id(dep)
            for dep_name, dep in [
                (dep_name, context.datasets.get(dep_name.lower(), None))
                for dep_name in dependencies
            ]
            if dep is not None
        }
        functionDepDict: Dict[str, str] = {
            dep_name.lower(): get_artifact_id(dep)
            for dep_name, dep in [
                (dep_name, context.dataobjects.get(dep_name.lower(), None))
                for dep_name in functionDeps
            ]
            if dep is not None
        }
        provenance = ModuleProvenance(
            write={
                ds_name: DatasetDescriptor(
                    identifier=ds.identifier,
                    name=ds_name,
                    columns=ds.columns
                )
            },
            read={**dependenciesDict, **functionDepDict}
        )
    except Exception as ex:
        provenance = ModuleProvenance()
        outputs.error(ex)
        is_success = False
    # Return execution result
    return ExecResult(
        is_success=is_success,
        outputs=outputs,
        provenance=provenance
    )
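# Usage sketch (hypothetical cell contents): with a notebook dataset named
# `sales` in context and PARA_OUTPUT_DATASET set to "top_sales", a cell source
# like
#
#   SELECT region, SUM(amount) AS total
#   FROM sales
#   GROUP BY region
#   ORDER BY total DESC
#
# is passed to mimir.createView; the resulting view identifier is recorded in
# the module's write-provenance under "top_sales", and `sales` lands in the
# read-provenance via `dependenciesDict`.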
def register_dataset(self, table_name, columns, row_counter=None, annotations=None):
    """Create a new record for a database table or view. Note that this
    method does not actually create the table or view in the database but
    only adds the dataset's metadata to the data store. The table or view
    will have been created by a load command or be the result of executing
    a lens or a VizUAL command.

    Parameters
    ----------
    table_name: string
        Name of the relational database table or view containing the dataset.
    columns: list(vizier.datastore.mimir.MimirDatasetColumn)
        List of column names in the dataset schema and their corresponding
        names in the relational database table or view.
    row_counter: int
        Counter for unique row ids
    annotations: vizier.datastore.metadata.DatasetMetadata
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    # Query the database for the schema of the table or view. mimir_schema
    # will contain the returned Mimir schema information.
    sql = base.get_select_query(table_name, columns=columns) + ';'
    mimir_schema = mimir.getSchema(sql)
    # Create a mapping of column name (in database) to column type. This
    # mapping is then used to update the data type information for all
    # column descriptors.
    col_types = dict()
    for col in mimir_schema:
        col_types[base.sanitize_column_name(col['name'].upper())] = col['baseType']
    for col in columns:
        col.data_type = col_types[col.name_in_rdb]
    # Set row counter to max. row id + 1 if None
    if row_counter is None:
        row_counter = mimir.countRows(table_name)
    dataset = MimirDatasetHandle(
        identifier=get_unique_identifier(),
        columns=columns,
        table_name=table_name,
        row_counter=row_counter,
        annotations=annotations
    )
    # Create a new directory for the dataset if it doesn't exist.
    dataset_dir = self.get_dataset_dir(dataset.identifier)
    if not os.path.isdir(dataset_dir):
        os.makedirs(dataset_dir)
    # Write dataset and annotation file to disk
    dataset.to_file(self.get_dataset_file(dataset.identifier))
    dataset.annotations.to_file(self.get_metadata_filename(dataset.identifier))
    return dataset
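# Usage sketch (hypothetical table and column; assumes the view already exists
# in the Mimir backend so getSchema can resolve it):
#
#   col = MimirDatasetColumn(identifier=0, name_in_dataset='Name',
#                            name_in_rdb='NAME')
#   ds = datastore.register_dataset(table_name='MY_VIEW', columns=[col])
#   print(ds.identifier, col.data_type)  # data_type filled in from the schema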
def load_dataset(
    self,
    f_handle: Optional[FileHandle] = None,
    proposed_schema: List[Tuple[str, str]] = [],
    url: Optional[str] = None,
    detect_headers: bool = True,
    infer_types: bool = True,
    properties: Dict[str, Any] = {},
    load_format: str = 'csv',
    options: List[Dict[str, str]] = [],
    human_readable_name: Optional[str] = None,
):
    """Create a new dataset from a given file or url. Expects that exactly
    one of the file handle and the url is not None. Raises ValueError if
    both are None or if both are given.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle, optional
        Handle for an uploaded file on the associated file server.
    proposed_schema: list(tuple(string, string)), optional
        Proposed column names and types for the loaded dataset
    url: string, optional
        Url for the file source
    detect_headers: bool, optional
        Detect column names in loaded file if True
    infer_types: bool, optional
        Infer column types for loaded dataset if True
    properties: dict(string, any), optional
        Annotations for dataset components
    load_format: string, optional
        Format identifier
    options: list, optional
        Additional options for Mimir's load command
    human_readable_name: string, optional
        Optional human readable name for the resulting table

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    if f_handle is None and url is None:
        raise ValueError('no load source given')
    elif f_handle is not None and url is not None:
        raise ValueError('too many load sources given')
    elif url is None and f_handle is not None:
        abspath = f_handle.filepath
    else:
        abspath = url
    # For ease of debugging, associate each table with a prefix identifying
    # its nature
    prefix = load_format if load_format in SAFE_FORMAT_IDENTIFIER_PREFIXES else "LOADED_"
    # Load dataset into Mimir
    table_name, mimirSchema = mimir.loadDataSource(
        abspath,
        infer_types,
        detect_headers,
        load_format,
        human_readable_name,
        options,
        properties=properties,
        result_name=prefix + get_unique_identifier(),
        proposed_schema=proposed_schema
    )
    return MimirDatasetHandle.from_mimir_result(
        table_name, mimirSchema, properties, human_readable_name
    )
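# Usage sketch (hypothetical URL): load a remote CSV with header detection and
# type inference, letting the defaults handle the rest.
#
#   ds = datastore.load_dataset(
#       url='http://example.com/data.csv',
#       load_format='csv',
#       human_readable_name='example_data',
#   )
#   print(ds.identifier)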