def update_cell(self, identifier, column_id, row_id, value, datastore):
    """Create a new dataset version in which one cell holds a new value.

    The update is expressed as a view over the existing dataset table. The
    target column is replaced by a CASE expression that yields the new value
    for the row whose identifier equals row_id and the existing value for
    all other rows. Every other column is selected unchanged.

    Raises ValueError if no dataset with the given identifier exists. The
    column lookup is delegated to get_index_for_column. The row identifier
    is not validated against the dataset.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier
    column_id: int
        Unique column identifier for updated cell
    row_id: int
        Unique row identifier
    value: string
        New cell value
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    target_index = get_index_for_column(dataset, column_id)
    select_items = []
    for position, column in enumerate(dataset.columns):
        if position != target_index:
            # Untouched columns are passed through as they are.
            select_items.append(column.name_in_rdb)
            continue
        try:
            # The column can represent the value in its own type.
            branches = (
                column.to_sql_value(value)
                + ' ELSE ' + column.name_in_rdb + ' END '
            )
        except ValueError:
            # The value does not fit the column type: embed it as a string
            # literal and cast the existing values to varchar instead.
            # NOTE(review): the value is not escaped here, and this branch
            # refers to {{input}} while the FROM clause below uses the
            # table name directly -- confirm both with the Mimir backend.
            branches = (
                '\'' + str(value) + '\' ELSE CAST({{input}}.'
                + column.name_in_rdb + ' AS varchar) END '
            )
        row_literal = MIMIR_ROWID_COL.to_sql_value(row_id)
        select_items.append(
            'CASE WHEN ' + ROW_ID + ' = ' + row_literal + ' THEN '
            + branches
            + 'AS ' + column.name_in_rdb
        )
    sql = (
        'SELECT ' + ','.join(select_items)
        + ' FROM ' + dataset.table_name + ';'
    )
    view_name, _ = mimir.createView(dataset.table_name, sql)
    # Register the view as a new dataset; schema, row counter and
    # annotations are carried over from the source dataset.
    updated = datastore.register_dataset(
        table_name=view_name,
        columns=dataset.columns,
        row_counter=dataset.row_counter,
        annotations=dataset.annotations
    )
    return VizualApiResult(updated)
def insert_column(self, identifier, position, name, datastore):
    """Insert a new, empty column at the given schema position.

    The new column receives the identifier max_column_id() + 1 and is
    materialized in a view as CAST('' AS int). All existing columns are
    selected unchanged.

    Raises ValueError if the column name is invalid, if no dataset with the
    given identifier exists, or if the position is negative or greater than
    the current number of columns.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    position: int
        Index position at which the column will be inserted
    name: string
        New column name
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    if not is_valid_name(name):
        raise ValueError('invalid column name \'' + str(name) + '\'')
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # position == len(columns) is allowed: it appends the column at the end.
    if position < 0 or position > len(dataset.columns):
        raise ValueError('invalid column index \'' + str(position) + '\'')
    inserted = MimirDatasetColumn(dataset.max_column_id() + 1, name, name)
    schema = list(dataset.columns)
    schema.insert(position, inserted)
    # Note: As of April 2018 this requires Mimir to run with the XNULL
    # option. Otherwise, in some scenarios setting all values in the new
    # column to NULL may cause an exception.
    select_items = [
        (" CAST('' AS int) AS " + column.name_in_rdb)
        if column.identifier == inserted.identifier
        else column.name_in_rdb
        for column in schema
    ]
    sql = (
        'SELECT ' + ','.join(select_items)
        + ' FROM ' + dataset.table_name + ';'
    )
    view_name, _ = mimir.createView(dataset.table_name, sql)
    # Register the view under a new identifier with the extended schema.
    result = datastore.register_dataset(
        table_name=view_name,
        columns=schema,
        row_counter=dataset.row_counter,
        annotations=dataset.annotations
    )
    return VizualApiResult(result)
def compute_empty_dataset(self, args, context):
    """Execute the empty dataset command.

    Creates a view with a single column named 'unnamed_column' whose value
    is the empty string, and registers it with a row counter of 1 under the
    lower-cased name given in the command arguments.

    Raises ValueError if a dataset with that name already exists in the
    context or if the name is invalid. Errors raised while creating or
    registering the view are reported through the module outputs instead of
    being propagated.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    outputs = ModuleOutputs()
    # Pairs of (SQL default value expression, column name).
    default_columns = [("''", "unnamed_column")]
    ds_name = args.get_value(pckg.PARA_NAME).lower()
    if ds_name in context.datasets:
        raise ValueError('dataset \'' + ds_name + '\' exists')
    if not is_valid_name(ds_name):
        raise ValueError('invalid dataset name \'' + ds_name + '\'')
    try:
        select_items = [
            default_val + " AS " + col_name
            for default_val, col_name in default_columns
        ]
        source = "SELECT {};".format(", ".join(select_items))
        view_name, _ = mimir.createView(dict(), source)
        columns = [
            MimirDatasetColumn(identifier=col_id, name_in_dataset=col_defn[1])
            for col_id, col_defn in enumerate(default_columns)
        ]
        ds = context.datastore.register_dataset(
            table_name=view_name,
            columns=columns,
            row_counter=1
        )
        descriptor = DatasetDescriptor(
            identifier=ds.identifier,
            columns=ds.columns,
            row_count=ds.row_count
        )
        provenance = ModuleProvenance(
            write={ds_name: descriptor},
            # Need to explicitly declare a lack of dependencies.
            read=dict()
        )
        outputs.stdout.append(
            TextOutput("Empty dataset '{}' created".format(ds_name))
        )
    except Exception as ex:
        provenance = ModuleProvenance()
        outputs.error(ex)
    # The task succeeded if nothing was written to the error stream.
    return ExecResult(
        is_success=(len(outputs.stderr) == 0),
        outputs=outputs,
        provenance=provenance
    )
def filter_columns(self, identifier, columns, names, datastore):
    """Dataset projection operator.

    Creates a new dataset that contains only the listed columns, in the
    given order. The names list is read position by position alongside
    columns; an entry of None keeps the column's current name, any other
    entry becomes the new user-visible name. The column name in the
    backend is never changed.

    Raises ValueError if no dataset with the given identifier exists. The
    column lookup is delegated to get_index_for_column.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    columns: list(int)
        List of column identifier for columns in the result.
    names: list(string)
        New names for the filtered columns (None to keep a name). Must
        provide one entry for each entry in columns.
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    schema = []
    select_items = []
    for position, column_id in enumerate(columns):
        source_col = dataset.columns[
            get_index_for_column(dataset, column_id)
        ]
        new_name = names[position]
        if new_name is None:
            schema.append(source_col)
        else:
            # Renaming only affects the user-visible name.
            schema.append(
                MimirDatasetColumn(
                    identifier=source_col.identifier,
                    name_in_dataset=new_name,
                    name_in_rdb=source_col.name_in_rdb
                )
            )
        select_items.append(source_col.name_in_rdb)
    sql = (
        'SELECT ' + ','.join(select_items)
        + ' FROM ' + dataset.table_name + ';'
    )
    view_name, _ = mimir.createView(dataset.table_name, sql)
    # Annotations are restricted to the columns that remain in the result.
    result = datastore.register_dataset(
        table_name=view_name,
        columns=schema,
        row_counter=dataset.row_counter,
        annotations=dataset.annotations.filter(
            columns=columns,
            rows=dataset.row_ids
        )
    )
    return VizualApiResult(result)
def sort_dataset(self, identifier: str, columns: List[int], reversed: List[bool], datastore: Datastore) -> VizualApiResult:
    """Sort the dataset with the given identifier.

    The sort order is given as a pair of lists. The first list contains the
    identifiers of the columns to sort on. The second list contains one
    boolean flag per entry in columns; a true flag sorts the corresponding
    column in descending order.

    The sorted dataset is created as a view (SELECT * ... ORDER BY ...) over
    the input dataset.

    Raises ValueError if no dataset with the given identifier exists.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    columns: list(int)
        List of column identifier for sort columns.
    reversed: list(bool)
        Flags indicating whether the sort order of the corresponding column
        is reversed. Must provide one entry for each entry in columns.
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    dataset = datastore.get_dataset(identifier)
    # Check for an unknown dataset BEFORE narrowing the type. With the
    # isinstance assertion first, an unknown dataset raised AssertionError
    # and the documented ValueError could never be reached.
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    assert isinstance(dataset, MimirDatasetHandle)
    # Create order by clause based on columns and reversed flags
    order_by_clause = list()
    for i, col_id in enumerate(columns):
        stmt = cast(
            MimirDatasetColumn,
            dataset.column_by_id(col_id)
        ).name_in_rdb
        if reversed[i]:
            stmt += ' DESC'
        order_by_clause.append(stmt)
    sql = 'SELECT * FROM ' + dataset.identifier + ' ORDER BY '
    sql += ','.join(order_by_clause)
    # The query refers to the input dataset by its own identifier.
    view_name, dependencies, schema, properties, functionDeps = mimir.createView(
        datasets={dataset.identifier: dataset.identifier},
        query=sql
    )
    ds = MimirDatasetHandle.from_mimir_result(view_name, schema, properties)
    return VizualApiResult(ds)
def sort_dataset(self, identifier, columns, reversed, datastore):
    """Sort the dataset with the given identifier.

    The sort order is given as a pair of lists. The first list contains the
    identifiers of the columns to sort on. The second list contains one
    boolean flag per entry in columns; a true flag sorts the corresponding
    column in descending order.

    The sorted dataset is a view (SELECT * ... ORDER BY ...) over the input
    dataset that is registered with the unchanged schema and annotations.

    Raises ValueError if no dataset with the given identifier exists.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    columns: list(int)
        List of column identifier for sort columns.
    reversed: list(bool)
        Flags indicating whether the sort order of the corresponding column
        is reversed. Must provide one entry for each entry in columns.
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # One ORDER BY term per sort column, with DESC appended where the
    # matching flag is set.
    order_terms = [
        dataset.column_by_id(col_id).name_in_rdb
        + (' DESC' if reversed[position] else '')
        for position, col_id in enumerate(columns)
    ]
    sql = 'SELECT * FROM {{input}} ORDER BY ' + ','.join(order_terms) + ';'
    view_name, _ = mimir.createView(dataset.table_name, sql)
    # Only the row order changes. No row counter is passed on here.
    sorted_ds = datastore.register_dataset(
        table_name=view_name,
        columns=dataset.columns,
        annotations=dataset.annotations
    )
    return VizualApiResult(sorted_ds)
def insert_row(self, identifier, position, datastore):
    """Insert an empty row into a dataset.

    The new row is created by a view that is the UNION ALL of the existing
    rows and a single row of typed NULL values. The position argument is
    validated but not otherwise used when building the view.

    The dataset handle obtained from the datastore is not modified; the
    incremented row counter is only passed to the newly registered dataset.

    Raises ValueError if no dataset with the given identifier exists or if
    the position is negative or greater than the current number of rows.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    position: int
        Index position at which the row will be inserted
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    # Get dataset. Raise exception if dataset is unknown
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # Make sure that position is a valid row index in the new dataset
    if position < 0 or position > len(dataset.row_ids):
        raise ValueError('invalid row index \'' + str(position) + '\'')
    # Counter value for the new dataset version. Kept in a local variable so
    # that the handle of the existing dataset version stays untouched.
    row_counter = dataset.row_counter + 1
    # Create a view for the modified dataset
    col_list = [col.name_in_rdb for col in dataset.columns]
    sql = 'SELECT ' + ','.join(col_list) + ' FROM ' + dataset.table_name
    mimirSchema = mimir.getSchema(sql)
    # The first schema entry is skipped.
    # NOTE(review): presumably this is the row identifier column -- confirm
    # against mimir.getSchema.
    union_list = [
        'CAST(NULL AS ' + col['baseType'] + ') AS ' + col['name']
        for col in mimirSchema[1:]
    ]
    sql = '(' + sql + ') UNION ALL (SELECT ' + ','.join(union_list) + ');'
    view_name, dependencies = mimir.createView(dataset.table_name, sql)
    # Store updated dataset information with new identifier
    ds = datastore.register_dataset(
        table_name=view_name,
        columns=dataset.columns,
        row_counter=row_counter,
        annotations=dataset.annotations
    )
    return VizualApiResult(ds)
def create_missing_key_view(dataset, lens_name, key_column):
    """Create a view that assigns row ids to rows added by a MISSING_KEY lens.

    Queries the lens for all rows whose row id is NULL. Each such row is
    assigned a new row id, starting at the dataset's current row counter,
    through a CASE expression that matches on the key column value.

    If the lens contains no rows without a row id, no view is created and
    the lens name is returned together with the unchanged row counter.

    Parameters
    ----------
    dataset: vizier.datastore.mimir.MimirDatasetHandle
        Descriptor for the dataset on which the lens was created
    lens_name: string
        Identifier of the created MISSING_KEY lens
    key_column: vizier.datastore.mimir.MimirDatasetColumn
        Name of the column for which the missing values where generated

    Returns
    -------
    string, int
        Returns the name of the created view and the adjusted counter for
        row ids.
    """
    # Select the rows that have missing row ids
    key_col_name = key_column.name_in_rdb
    sql = 'SELECT ' + key_col_name + ' FROM ' + lens_name
    sql += ' WHERE ' + ROW_ID + ' IS NULL;'
    rs = mimir.vistrailsQueryMimirJson(sql, False, False)
    case_conditions = []
    for offset, row in enumerate(rs['data']):
        row_id = dataset.row_counter + offset
        val = str(row[0])
        # If the key column is of type real then the value has to look like
        # a real. The suffix is only added to integer-looking values: a
        # value that is already rendered with a fraction (e.g. '3.0') would
        # otherwise turn into the invalid literal '3.0.0'.
        if (
            key_column.data_type.lower() == 'real'
            and val.lstrip('-').isdigit()
        ):
            val += '.0'
        case_conditions.append(
            'WHEN ' + key_col_name + ' = ' + val + ' THEN ' + str(row_id)
        )
    # If no new rows where inserted we are good to go with the existing lens
    if not case_conditions:
        return lens_name, dataset.row_counter
    # Create the view SQL statement. Rows that already have a row id keep it
    # through the ELSE branch.
    stmt = 'CASE ' + (' '.join(case_conditions)).strip()
    stmt += ' ELSE ' + ROW_ID + ' END AS ' + ROW_ID
    col_list = [stmt]
    col_list.extend(column.name_in_rdb for column in dataset.columns)
    sql = 'SELECT ' + ','.join(col_list) + ' FROM ' + lens_name + ';'
    view_name, dependencies = mimir.createView(dataset.table_name, sql)
    return view_name, dataset.row_counter + len(case_conditions)
def delete_column(self, identifier, column_id, datastore):
    """Delete a column from a given dataset.

    The result is a view over the existing dataset table that selects all
    columns except the deleted one.

    Raises ValueError if no dataset with the given identifier exists. The
    column lookup is delegated to get_index_for_column.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    column_id: int
        Unique column identifier
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    doomed_index = get_index_for_column(dataset, column_id)
    # Work on a copy so that the schema of the source dataset is unchanged.
    remaining = list(dataset.columns)
    remaining.pop(doomed_index)
    select_items = [column.name_in_rdb for column in remaining]
    sql = (
        'SELECT ' + ','.join(select_items)
        + ' FROM ' + dataset.table_name + ';'
    )
    view_name, _ = mimir.createView(dataset.table_name, sql)
    # Register the view under a new identifier with the reduced schema.
    result = datastore.register_dataset(
        table_name=view_name,
        columns=remaining,
        row_counter=dataset.row_counter,
        annotations=dataset.annotations
    )
    return VizualApiResult(result)
def delete_row(self, identifier, rowid, datastore):
    """Delete a row in a given dataset.

    The result is a view over the existing dataset table that excludes the
    row whose row identifier equals the given one. The new dataset is
    registered with the row counter of the source dataset reduced by one.

    Raises ValueError if no dataset with the given identifier exists.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    rowid: int
        Identifier of the row that is being deleted
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    # Get dataset. Raise exception if dataset is unknown
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # Create a view for the modified dataset
    col_list = [col.name_in_rdb for col in dataset.columns]
    sql = 'SELECT ' + ','.join(col_list) + ' FROM ' + dataset.table_name
    # Keep every row except the one with the given identifier.
    sql += ' WHERE ' + ROW_ID + ' <> '
    sql += MIMIR_ROWID_COL.to_sql_value(rowid) + ';'
    view_name, dependencies = mimir.createView(dataset.table_name, sql)
    # Store updated dataset information with new identifier
    ds = datastore.register_dataset(
        table_name=view_name,
        columns=dataset.columns,
        row_counter=dataset.row_counter - 1,
        annotations=dataset.annotations
    )
    return VizualApiResult(ds)
def execute_query(self, args: ModuleArguments, context: TaskContext) -> ExecResult:
    """Execute a SQL query in the given context.

    The query is turned into a view that may reference every dataset in the
    context by name, as well as the data objects in the context that are
    Python artifacts. A preview of the result (first 10 rows) is written to
    the module's standard output. If no output dataset name is given the
    result is named TEMPORARY_RESULT.

    Raises ValueError if a dataset in the context is unknown to the
    datastore. Errors raised while creating the view or the output are
    reported through the module outputs instead of being propagated.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    # Get SQL source code that is in this cell. The query text is passed on
    # exactly as entered.
    source = args.get_value(cmd.PARA_SQL_SOURCE)
    ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
    # Get mapping of datasets in the context to their respective identifier
    # in the Mimir backend
    mimir_table_names = dict()
    for ds_name_o in context.datasets:
        dataset_id = context.datasets[ds_name_o].identifier
        dataset = context.datastore.get_dataset(dataset_id)
        if dataset is None:
            raise ValueError('unknown dataset \'' + ds_name_o + '\'')
        mimir_table_names[ds_name_o] = dataset.identifier
    # Module outputs
    outputs = ModuleOutputs()
    is_success = True
    # Python artifacts in the context are made available to the query.
    functions = {
        name: context.dataobjects[name].identifier
        for name in context.dataobjects
        if context.dataobjects[name].obj_type == ARTIFACT_TYPE_PYTHON
    }
    try:
        # Create the view from the SQL source
        view_name, dependencies, mimirSchema, properties, functionDeps = mimir.createView(
            datasets=mimir_table_names,
            query=source,
            functions=dict(functions)
        )
        ds = MimirDatasetHandle.from_mimir_result(
            view_name,
            mimirSchema,
            properties,
            ds_name
        )
        if ds_name is None or ds_name == '':
            ds_name = "TEMPORARY_RESULT"

        # Imported here rather than at module level, as in the original.
        from vizier.api.webservice import server
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10
        )
        if ds_output is None:
            outputs.stderr.append(
                TextOutput("Error displaying dataset {}".format(ds_name))
            )
        else:
            ds_output['name'] = ds_name
            outputs.stdout.append(DatasetOutput(ds_output))

        # Datasets that the query read. Names that are not in the context
        # are ignored.
        dependenciesDict: Dict[str, str] = {}
        for dep_name in dependencies:
            dep = context.datasets.get(dep_name.lower(), None)
            if dep is not None:
                dependenciesDict[dep_name.lower()] = get_artifact_id(dep)
        # Functions that the query used. These are taken from the function
        # dependencies reported for the view; the dataset dependencies were
        # iterated here before, which left functionDeps unused.
        # NOTE(review): assumes functionDeps is an iterable of names (or
        # empty/None) -- confirm against mimir.createView.
        functionDepDict: Dict[str, str] = {}
        for dep_name in (functionDeps or ()):
            dep = context.dataobjects.get(dep_name.lower(), None)
            if dep is not None:
                functionDepDict[dep_name.lower()] = get_artifact_id(dep)

        provenance = ModuleProvenance(
            write={
                ds_name: DatasetDescriptor(
                    identifier=ds.identifier,
                    name=ds_name,
                    columns=ds.columns
                )
            },
            read={**dependenciesDict, **functionDepDict}
        )
    except Exception as ex:
        provenance = ModuleProvenance()
        outputs.error(ex)
        is_success = False
    # Return execution result
    return ExecResult(
        is_success=is_success,
        outputs=outputs,
        provenance=provenance
    )
def rename_column(self, identifier, column_id, name, datastore):
    """Rename a column in a given dataset.

    If the new name equals the current column name (ignoring case) the
    dataset is returned unchanged. Otherwise a view is created in which the
    column is aliased to the new name, and the view is registered as a new
    dataset. The schema of the new dataset is rebuilt from the backend
    schema of the dataset table: column identifiers are the positions in
    that schema and column names are the sanitized, upper-cased backend
    names (except for the renamed column).

    Raises ValueError if the column name is invalid or if no dataset with
    the given identifier exists. The column lookup is delegated to
    get_index_for_column.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    column_id: int
        Unique column identifier
    name: string
        New column name
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    # Raise ValueError if given column name is invalid
    if not is_valid_name(name):
        raise ValueError('invalid column name \'' + str(name) + '\'')
    # Get dataset. Raise exception if dataset is unknown
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # Get the specified column that is to be renamed
    schema = list(dataset.columns)
    colIndex = get_index_for_column(dataset, column_id)
    # No need to do anything if the name hasn't changed
    if schema[colIndex].name.lower() == name.lower():
        return VizualApiResult(dataset)
    mimirSchema = mimir.getSchema('SELECT * FROM ' + dataset.table_name)
    # Create list of dataset columns and the matching select list. The
    # rename is decided per column, independent of the column position, so
    # that the first column (index 0) can be renamed as well.
    columns = list()
    select_items = []
    for idx, schema_col in enumerate(mimirSchema):
        rdb_name = sanitize_column_name(schema_col['name'].upper())
        column = MimirDatasetColumn(
            identifier=idx,
            name_in_dataset=rdb_name,
            name_in_rdb=rdb_name
        )
        if idx == colIndex:
            select_items.append(rdb_name + ' AS ' + name)
            column.name = name
            column.name_in_rdb = name
        else:
            select_items.append(rdb_name + ' AS ' + rdb_name)
        columns.append(column)
    # Create view for the renamed dataset
    sql = 'SELECT ' + ', '.join(select_items) + ' FROM {{input}};'
    view_name, dependencies = mimir.createView(dataset.table_name, sql)
    # Store updated dataset to get new identifier
    ds = datastore.register_dataset(
        table_name=view_name,
        columns=columns,
        row_counter=dataset.row_counter,
        annotations=dataset.annotations
    )
    return VizualApiResult(ds)
def execute_query(self, args, context):
    """Execute a SQL query in the given context.

    The query (terminated with a semicolon if it is not already) is turned
    into a view that may reference every dataset in the context by name.
    The view is registered as a new dataset and a preview of the result
    (first 10 rows) is written to the module's standard output. If no
    output dataset name is given the result is named TEMPORARY_RESULT.

    Raises ValueError if a dataset in the context is unknown to the
    datastore. Errors raised while creating the view or the output are
    reported through the module outputs instead of being propagated.

    Parameters
    ----------
    args: vizier.viztrail.command.ModuleArguments
        User-provided command arguments
    context: vizier.engine.task.base.TaskContext
        Context in which a task is being executed

    Returns
    -------
    vizier.engine.task.processor.ExecResult
    """
    source = args.get_value(cmd.PARA_SQL_SOURCE)
    if not source.endswith(';'):
        source = source + ';'
    ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
    # Map the name of each dataset in the context to its table name in the
    # Mimir backend.
    mimir_table_names = {}
    for context_name, dataset_id in context.datasets.items():
        handle = context.datastore.get_dataset(dataset_id)
        if handle is None:
            raise ValueError('unknown dataset \'' + context_name + '\'')
        mimir_table_names[context_name] = handle.table_name
    outputs = ModuleOutputs()
    try:
        # Create the view from the SQL source
        view_name, dependencies = mimir.createView(mimir_table_names, source)
        mimirSchema = mimir.getSchema('SELECT * FROM ' + view_name)
        # Columns are numbered by their position in the view schema.
        columns = [
            MimirDatasetColumn(
                identifier=col_id,
                name_in_dataset=schema_col['name']
            )
            for col_id, schema_col in enumerate(mimirSchema)
        ]
        row_count = mimir.countRows(view_name)
        if ds_name is None or ds_name == '':
            ds_name = "TEMPORARY_RESULT"
        ds = context.datastore.register_dataset(
            table_name=view_name,
            columns=columns,
            row_counter=row_count
        )
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10
        )
        ds_output['name'] = ds_name
        # Every reported dependency is recorded; names that are not in the
        # context map to None.
        read_deps = {
            dep_name.lower(): context.datasets.get(dep_name.lower(), None)
            for dep_name in dependencies
        }
        outputs.stdout.append(DatasetOutput(ds_output))
        descriptor = DatasetDescriptor(
            identifier=ds.identifier,
            columns=ds.columns,
            row_count=ds.row_count
        )
        provenance = ModuleProvenance(
            write={ds_name: descriptor},
            read=read_deps
        )
    except Exception as ex:
        provenance = ModuleProvenance()
        outputs.error(ex)
    # The task succeeded if nothing was written to the error stream.
    return ExecResult(
        is_success=(len(outputs.stderr) == 0),
        outputs=outputs,
        provenance=provenance
    )
def create_dataset(self, columns, rows, human_readable_name=None, annotations=None, backend_options=[], dependencies=[]):
    """Create a new dataset in the datastore.

    Expects at least the list of columns and the rows for the dataset. The
    rows are written to a temporary CSV file which is loaded into Mimir. A
    view over the loaded table is then registered as the new dataset. The
    temporary file is removed again, also when writing or loading fails.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    human_readable_name: string, optional
        Passed on to the Mimir load command
    annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
        Annotations for dataset components
    backend_options: list, optional
        Passed on to the Mimir load command. Not modified by this method.
    dependencies: list, optional
        Passed on to the Mimir load command. Not modified by this method.

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
        The result of registering the dataset.
    """
    # Get unique identifier for new dataset
    identifier = 'DS_' + get_unique_identifier()
    # Rows are written to a temporary file in CSV format
    tmp_file = os.path.abspath(self.base_path + identifier)
    # Create a list of columns that contain the user-visible column name and
    # the name in the database
    # NOTE(review): base.sanitize_column_name is applied to the column
    # objects and its result is used as a column (identifier, name) --
    # confirm that it accepts and returns column objects.
    db_columns = list()
    select_items = []
    for col in map(base.sanitize_column_name, columns):
        db_columns.append(
            MimirDatasetColumn(
                identifier=col.identifier,
                name_in_dataset=col.name,
                name_in_rdb=col.name
            )
        )
        select_items.append(col.name + ' AS ' + col.name)
    colSql = ', '.join(select_items)
    try:
        # Create CSV file for load. The csv module expects files that are
        # opened with newline='' so that line endings are written as is.
        with open(tmp_file, 'w', newline='') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_MINIMAL)
            writer.writerow([col.name_in_rdb for col in db_columns])
            for row in rows:
                writer.writerow(helper.encode_values(row.values))
        # Load CSV file using Mimirs loadCSV method.
        table_name = mimir.loadDataSource(
            tmp_file,
            True,
            True,
            human_readable_name=human_readable_name,
            backend_options=backend_options,
            dependencies=dependencies
        )
    finally:
        # Never leave the temporary file behind. The file does not exist if
        # it could not be opened for writing.
        if os.path.isfile(tmp_file):
            os.remove(tmp_file)
    sql = 'SELECT ' + colSql + ' FROM {{input}};'
    view_name, dependencies = mimir.createView(table_name, sql)
    # Get number of rows in the view that was created in the backend
    row_count = mimir.countRows(view_name)
    # Insert the new dataset metadata information into the datastore
    return self.register_dataset(
        table_name=view_name,
        columns=db_columns,
        row_counter=row_count,
        annotations=annotations
    )
def load_dataset(self, f_handle=None, url=None, detect_headers=True, infer_types=True, load_format='csv', options=[], human_readable_name=None):
    """Create a new dataset from a given file or url.

    Exactly one of the file handle and the url has to be given. The source
    is loaded into Mimir and a view over the loaded table, with sanitized
    upper-case column names, is registered as the new dataset. Columns are
    numbered by their position in the loaded table's schema.

    Raises ValueError if the file handle and the url are both None or both
    not None.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle, optional
        handle for an uploaded file on the associated file server.
    url: string, optional
        Url for the file source
    detect_headers: bool, optional
        Detect column names in loaded file if True
    infer_types: bool, optional
        Infer column types for loaded dataset if True
    load_format: string, optional
        Format identifier
    options: list, optional
        Additional options for Mimirs load command
    human_readable_name: string, optional
        Optional human readable name for the resulting table

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    if f_handle is None and url is None:
        raise ValueError('no load source given')
    if f_handle is not None and url is not None:
        raise ValueError('too many load sources given')
    # Exactly one source is given at this point.
    abspath = f_handle.filepath if url is None else url
    # Load dataset into Mimir
    init_load_name = mimir.loadDataSource(
        abspath,
        infer_types,
        detect_headers,
        load_format,
        human_readable_name,
        options
    )
    # Retrieve schema information for the created dataset
    mimirSchema = mimir.getSchema('SELECT * FROM ' + init_load_name)
    columns = []
    select_items = []
    for col_id, schema_col in enumerate(mimirSchema):
        # The user-visible name and the backend name are identical.
        clean_name = base.sanitize_column_name(schema_col['name'].upper())
        columns.append(
            MimirDatasetColumn(
                identifier=col_id,
                name_in_dataset=clean_name,
                name_in_rdb=clean_name
            )
        )
        select_items.append(clean_name + ' AS ' + clean_name)
    # Create view for loaded dataset
    sql = 'SELECT ' + ', '.join(select_items) + ' FROM {{input}};'
    view_name, _ = mimir.createView(init_load_name, sql)
    # The row counter is initialized with the number of rows in the view.
    row_count = mimir.countRows(view_name)
    return self.register_dataset(
        table_name=view_name,
        columns=columns,
        row_counter=row_count
    )