def test_deduplicate_annotations(self): """Test removing duplicated annotations.""" store = FileSystemDatastore(STORE_DIR) ds = store.create_dataset( columns=[ DatasetColumn(identifier=0, name='A'), DatasetColumn(identifier=1, name='B') ], rows=[DatasetRow(identifier=0, values=['a', 'b'])], annotations=DatasetMetadata( cells=[ DatasetAnnotation(column_id=0, row_id=0, key='X', value=1), DatasetAnnotation(column_id=0, row_id=0, key='X', value=2), DatasetAnnotation(column_id=1, row_id=0, key='X', value=3), DatasetAnnotation(column_id=1, row_id=1, key='X', value=3), DatasetAnnotation(column_id=0, row_id=0, key='Y', value=1), DatasetAnnotation(column_id=0, row_id=0, key='X', value=1), DatasetAnnotation(column_id=0, row_id=0, key='X', value=2), DatasetAnnotation(column_id=1, row_id=0, key='X', value=3), DatasetAnnotation(column_id=1, row_id=1, key='X', value=3), ], columns=[ DatasetAnnotation(column_id=0, key='A', value='x'), DatasetAnnotation(column_id=1, key='A', value='x'), DatasetAnnotation(column_id=0, key='A', value='x'), DatasetAnnotation(column_id=1, key='A', value='x'), DatasetAnnotation(column_id=0, key='A', value='x'), DatasetAnnotation(column_id=1, key='A', value='x'), DatasetAnnotation(column_id=0, key='A', value='x'), DatasetAnnotation(column_id=1, key='A', value='x') ], rows=[ DatasetAnnotation(row_id=0, key='E', value=100), DatasetAnnotation(row_id=0, key='E', value=100) ] ) ) ds = store.get_dataset(ds.identifier) self.assertEqual(len(ds.annotations.cells), 4) self.assertEqual(len(ds.annotations.columns), 2) self.assertEqual(len(ds.annotations.rows), 1) annos = ds.annotations.for_cell(column_id=0, row_id=0) self.assertEqual(len(annos), 3) self.assertTrue(1 in [a.value for a in annos]) self.assertTrue(2 in [a.value for a in annos]) self.assertFalse(3 in [a.value for a in annos]) self.assertEqual(len(ds.annotations.find_all(values=annos, key='X')), 2) with self.assertRaises(ValueError): ds.annotations.find_one(values=annos, key='X') self.assertEqual(len(ds.annotations.for_column(column_id=0)), 1) self.assertEqual(len(ds.annotations.for_row(row_id=0)), 1) annotations = ds.annotations.filter(columns=[1]) self.assertEqual(len(annotations.cells), 1) self.assertEqual(len(annotations.columns), 1) self.assertEqual(len(annotations.rows), 1)
def test_query_annotations(self):
    """Test retrieving dataset properties via the datastore."""
    store = FileSystemDatastore(STORE_DIR)
    ds = store.create_dataset(
        columns=[
            DatasetColumn(identifier=0, name='A'),
            DatasetColumn(identifier=1, name='B')
        ],
        rows=[DatasetRow(identifier=0, values=['a', 'b'])],
        properties=EXAMPLE_PROPERTIES
    )
    properties = store.get_properties(ds.identifier)
    self.assertEqual(len(properties["columns"]), 2)
def test_unique_name(self): """Test method that computes unique column names.""" ds = DatasetDescriptor(identifier='0', columns=[ DatasetColumn(identifier=0, name='ABC'), DatasetColumn(identifier=1, name='A'), DatasetColumn(identifier=2, name='ABC_1'), DatasetColumn(identifier=3, name='DEF'), DatasetColumn(identifier=4, name='xyz'), ]) self.assertEqual(ds.get_unique_name('Age'), 'Age') self.assertEqual(ds.get_unique_name('XYZ'), 'XYZ_1') self.assertEqual(ds.get_unique_name('xyz'), 'xyz_1') self.assertEqual(ds.get_unique_name('ABC'), 'ABC_2')
def from_file(descriptor_file, data_file, annotations=None): """Read dataset descriptor from file and return a new instance of the dataset handle. Parameters ---------- descriptor_file: string Path to the file containing the dataset descriptor data_file: string Path to the file that contains the dataset rows. annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional Annotations for dataset components Returns ------- vizier.datastore.fs.dataset.FileSystemDatasetHandle """ with open(descriptor_file, 'r') as f: doc = json.loads(f.read()) return FileSystemDatasetHandle(identifier=doc[KEY_IDENTIFIER], columns=[ DatasetColumn( identifier=col[KEY_COLUMN_ID], name=col[KEY_COLUMN_NAME], data_type=col[KEY_COLUMN_TYPE]) for col in doc[KEY_COLUMNS] ], data_file=data_file, row_count=doc[KEY_ROWCOUNT], max_row_id=doc[KEY_MAXROWID], annotations=annotations)
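# Hedged sketch of the descriptor document that from_file() reads. The actual
# KEY_* constants are defined elsewhere in this module, so the literal key
# names below are illustrative assumptions only; the shape (identifier, column
# list with id/name/type, row count, max row id) mirrors the reads above.
EXAMPLE_DESCRIPTOR = {
    'id': 'DS123',
    'columns': [
        {'id': 0, 'name': 'Name', 'type': 'varchar'},
        {'id': 1, 'name': 'Age', 'type': 'int'}
    ],
    'rowCount': 2,
    'maxRowId': 1
}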
def from_dict(obj): """Create the descriptor from a dictionary serialization. Parameters ---------- obj: dict Dictionary serialization for dataset descriptor as returned by the server. Returns ------- vizier.api.client.resources.dataset.DatasetDescriptor """ return DatasetDescriptor( identifier=obj[labels.ID], name=obj[labels.NAME], columns=[ DatasetColumn( identifier=col[labels.ID], name=col[labels.NAME], data_type=col[labels.DATATYPE] ) for col in obj['columns'] ], links=deserialize.HATEOAS(links=obj[labels.LINKS]) )
def insert_column(self, name: str,
                  data_type: str = "varchar",
                  position: Optional[int] = None
                  ) -> DatasetColumn:
    """Add a new column to the dataset schema.

    Parameters
    ----------
    name: string
        Name of the new column
    data_type: string, default='varchar'
        Data type of the new column
    position: int, optional
        Position in the dataset schema where new column is inserted. If
        None, the column is appended to the list of dataset columns.

    Returns
    -------
    DatasetColumn
    """
    column = DatasetColumn(name=name, data_type=data_type)
    self.columns = list(self.columns)
    if position is not None:
        self.columns.insert(position, column)
        # Add a null value to each row for the new column
        for row in self.rows:
            row.values.insert(position, None)
    else:
        self.columns.append(column)
        # Add a null value to each row for the new column
        for row in self.rows:
            row.values.append(None)
    return column
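# Minimal usage sketch (assumption: `ds` is an instance of the dataset class
# defining insert_column above, e.g. the dataset object handed to a Python
# cell; the column name 'email' is hypothetical).
col = ds.insert_column('email', data_type='varchar', position=1)
# Every existing row is padded with a None placeholder at the new position.
assert all(row.values[1] is None for row in ds.rows)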
def filter_columns(self, identifier: str, columns: List[int], names: List[str],
                   datastore: Datastore) -> VizualApiResult:
    """Dataset projection operator. Returns a copy of the dataset with the
    given identifier that contains only those columns listed in columns. The
    list of names contains optional new names for the filtered columns. A
    value of None in names indicates that the name of the corresponding column
    is not changed.

    Raises ValueError if no dataset with given identifier exists or if any of
    the filter columns are unknown.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    columns: list(int)
        List of column identifiers for columns in the result.
    names: list(string)
        Optional new names for filtered columns.
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    # Get dataset. Raise exception if dataset is unknown
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # The schema of the new dataset only contains the columns in the given
    # list. Keep track of their index positions to filter values.
    schema = list()
    val_filter = list()
    for i in range(len(columns)):
        col_idx = dataset.get_index(columns[i])
        if col_idx is None:
            raise ValueError('unknown column identifier \'' + str(columns[i]) + '\'')
        col = dataset.columns[col_idx]
        if not names[i] is None:
            schema.append(
                DatasetColumn(identifier=col.identifier,
                              name=names[i],
                              data_type=col.data_type))
        else:
            schema.append(col)
        val_filter.append(col_idx)
    # Create a list of projected rows
    rows = list()
    for row in dataset.fetch_rows():
        values = list()
        for v_idx in val_filter:
            values.append(row.values[v_idx])
        rows.append(DatasetRow(identifier=row.identifier, values=values))
    # Store updated dataset to get new identifier
    ds = datastore.create_dataset(columns=schema, rows=rows, properties={})
    return VizualApiResult(ds)
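# Usage sketch (assumptions: `api` is the VizUAL API object defining
# filter_columns above, `datastore` is a FileSystemDatastore, and `ds_id`
# names an existing dataset with columns A (id 0), B (id 1), C (id 2)).
result = api.filter_columns(
    identifier=ds_id,
    columns=[2, 0],           # keep C and A, in that order
    names=[None, 'Alpha'],    # keep C's name, rename A to 'Alpha'
    datastore=datastore
)
# The result wraps the newly created dataset, which has a fresh identifier.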
def test_query_annotations(self): """Test retrieving annotations via the datastore.""" store = FileSystemDatastore(STORE_DIR) ds = store.create_dataset( columns=[ DatasetColumn(identifier=0, name='A'), DatasetColumn(identifier=1, name='B') ], rows=[DatasetRow(identifier=0, values=['a', 'b'])], annotations=DatasetMetadata( cells=[ DatasetAnnotation(column_id=0, row_id=0, key='X', value=1), DatasetAnnotation(column_id=0, row_id=0, key='X', value=2), DatasetAnnotation(column_id=1, row_id=0, key='X', value=3), DatasetAnnotation(column_id=0, row_id=0, key='Y', value=1) ], columns=[ DatasetAnnotation(column_id=0, key='A', value='x'), DatasetAnnotation(column_id=1, key='A', value='x') ], rows=[ DatasetAnnotation(row_id=0, key='E', value=100) ] ) ) annos = store.get_annotations(ds.identifier, column_id=1) self.assertEqual(len(annos.columns), 1) self.assertEqual(len(annos.rows), 0) self.assertEqual(len(annos.cells), 0) annos = store.get_annotations(ds.identifier, column_id=0) self.assertEqual(len(annos.columns), 1) self.assertEqual(len(annos.rows), 0) self.assertEqual(len(annos.cells), 0) annos = store.get_annotations(ds.identifier, row_id=0) self.assertEqual(len(annos.columns), 0) self.assertEqual(len(annos.rows), 1) self.assertEqual(len(annos.cells), 0) annos = store.get_annotations(ds.identifier, column_id=1, row_id=0) self.assertEqual(len(annos.columns), 0) self.assertEqual(len(annos.rows), 0) self.assertEqual(len(annos.cells), 1) annos = store.get_annotations(ds.identifier, column_id=0, row_id=0) self.assertEqual(len(annos.columns), 0) self.assertEqual(len(annos.rows), 0) self.assertEqual(len(annos.cells), 3)
def test_load_with_dataset_delete(self):
    """Test loading workflows where each module creates a new dataset
    and deletes the previous dataset (except for the first module).
    """
    base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(base_path)
    vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                          properties={},
                                          base_path=base_path)
    branch = vt.get_default_branch()
    # Append five modules
    for i in range(5):
        ts = get_current_time()
        deleted_datasets = list()
        if i > 0:
            deleted_datasets.append('DS' + str(i - 1))
        command = python_cell(source='print ' + str(i) + '+' + str(i))
        module = OSModuleHandle.create_module(
            command=command,
            external_form='print ' + str(i) + '+' + str(i),
            state=MODULE_SUCCESS,
            outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
            provenance=ModuleProvenance(write={
                'DS' + str(i): DatasetDescriptor(
                    identifier=str(i),
                    name='DS' + str(i),
                    columns=[
                        DatasetColumn(identifier=j, name=str(j))
                        for j in range(i)
                    ],
                )
            }, delete=deleted_datasets),
            timestamp=ModuleTimestamp(created_at=ts,
                                      started_at=ts,
                                      finished_at=ts),
            module_folder=vt.modules_folder,
            object_store=vt.object_store)
        if not branch.head is None:
            modules = branch.head.modules + [module]
        else:
            modules = [module]
        branch.append_workflow(modules=modules,
                               action=ACTION_INSERT,
                               command=command)
    vt = OSViztrailHandle.load_viztrail(base_path)
    workflow = vt.get_default_branch().get_head()
    self.assertEqual(len(workflow.modules), 5)
    datasets = {}
    for i in range(5):
        module = workflow.modules[i]
        datasets = module.provenance.get_database_state(datasets)
        self.assertEqual(len(datasets), 1)
        key = 'DS' + str(i)
        self.assertTrue(key in datasets)
        self.assertEqual(len(datasets[key].columns), i)
def load_dataset( self, f_handle: FileHandle, proposed_schema: List[Tuple[str, str]] = []) -> FileSystemDatasetHandle: """Create a new dataset from a given file. Raises ValueError if the given file could not be loaded as a dataset. Parameters ---------- f_handle : vizier.filestore.base.FileHandle Handle for an uploaded file Returns ------- vizier.datastore.fs.dataset.FileSystemDatasetHandle """ # The file handle might be None in which case an exception is raised if f_handle is None: raise ValueError('unknown file') # Expects a file in a supported tabular data format. if not f_handle.is_tabular: raise ValueError('cannot create dataset from file \'' + f_handle.name + '\'') # Open the file as a csv file. Expects that the first row contains the # column names. Read dataset schema and dataset rows into two separate # lists. columns: List[DatasetColumn] = [] rows: List[DatasetRow] = [] with f_handle.open() as csvfile: reader = csv.reader(csvfile, delimiter=f_handle.delimiter) for col_name in next(reader): columns.append( DatasetColumn(identifier=len(columns), name=col_name.strip())) for row in reader: values = [cast(v.strip()) for v in row] rows.append( DatasetRow(identifier=str(len(rows)), values=values)) # Get unique identifier and create subfolder for the new dataset identifier = get_unique_identifier() dataset_dir = self.get_dataset_dir(identifier) os.makedirs(dataset_dir) # Write rows to data file data_file = os.path.join(dataset_dir, DATA_FILE) DefaultJsonDatasetReader(data_file).write(rows) # Create dataset an write descriptor to file dataset = FileSystemDatasetHandle(identifier=identifier, columns=columns, data_file=data_file, row_count=len(rows), max_row_id=len(rows) - 1) dataset.to_file( descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE)) return dataset
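# Usage sketch (assumptions: `filestore` is a Vizier filestore exposing
# upload_file(), as used in the test suite, and 'people.csv' starts with a
# header row).
store = FileSystemDatastore(STORE_DIR)
ds = store.load_dataset(filestore.upload_file('people.csv'))
print(ds.row_count, [col.name for col in ds.columns])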
def create_dataset(self, name, dataset, backend_options=[]):
    """Create a new dataset with given name. Raises ValueError if a dataset
    with the given name already exists.

    Parameters
    ----------
    name : string
        Unique dataset name
    dataset : vizier.datastore.client.DatasetClient
        Dataset object

    Returns
    -------
    vizier.datastore.client.DatasetClient
    """
    # Raise an exception if a dataset with the given name already exists or
    # if the name is not valid
    if self.has_dataset_identifier(name):
        # Record access to the datasets
        self.read.add(name.lower())
        raise ValueError('dataset \'' + name + '\' already exists')
    if not is_valid_name(name):
        raise ValueError('invalid dataset name \'' + name + '\'')
    # Create list of columns for new dataset. Ensure that every column has
    # a positive identifier
    columns = list()
    if len(dataset.columns) > 0:
        column_counter = max(
            max([col.identifier for col in dataset.columns]) + 1, 0)
        for col in dataset.columns:
            if col.identifier < 0:
                col.identifier = column_counter
                column_counter += 1
            columns.append(
                DatasetColumn(identifier=col.identifier,
                              name=col.name,
                              data_type=col.data_type))
    rows = dataset.rows
    if len(rows) > 0:
        # Ensure that all rows have positive identifier
        row_counter = max(max([row.identifier for row in rows]) + 1, 0)
        for row in rows:
            if row.identifier < 0:
                row.identifier = row_counter
                row_counter += 1
    # Write dataset to datastore and add new dataset to context
    ds = self.datastore.create_dataset(columns=columns,
                                       rows=rows,
                                       annotations=dataset.annotations,
                                       human_readable_name=name.upper(),
                                       backend_options=backend_options)
    self.set_dataset_identifier(name, ds.identifier)
    self.descriptors[ds.identifier] = ds
    return DatasetClient(dataset=self.datastore.get_dataset(ds.identifier))
def test_properties(self):
    """Test creating a dataset with properties and reading them back."""
    store = FileSystemDatastore(STORE_DIR)
    ds = store.create_dataset(
        columns=[
            DatasetColumn(identifier=0, name='A'),
            DatasetColumn(identifier=1, name='B')
        ],
        rows=[DatasetRow(identifier=0, values=[1, 2])],
        properties=EXAMPLE_PROPERTIES
    )
    ds = store.get_dataset(ds.identifier)
    column_props = ds.properties['columns']
    self.assertEqual(len(column_props), 2)
    self.assertTrue('A' in [prop['name'] for prop in column_props])
    # Reload datastore
    store = FileSystemDatastore(STORE_DIR)
    ds = store.get_dataset(ds.identifier)
    column_props = ds.properties['columns']
    self.assertEqual(len(column_props), 2)
def test_create_dataset(self):
    """Test creating a new dataset in the datastore."""
    store = FileSystemDatastore(STORE_DIR)
    ds = store.create_dataset(
        columns=[
            DatasetColumn(identifier=0, name='A'),
            DatasetColumn(identifier=1, name='B')
        ],
        rows=[DatasetRow(identifier=0, values=['a', 'b'])]
    )
    ds = store.get_dataset(ds.identifier)
    column_ids = [col.identifier for col in ds.columns]
    self.assertEqual(len(ds.columns), 2)
    for id in [0, 1]:
        self.assertTrue(id in column_ids)
    column_names = [col.name for col in ds.columns]
    for name in ['A', 'B']:
        self.assertTrue(name in column_names)
    rows = ds.fetch_rows()
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0].values, ['a', 'b'])
    self.assertEqual(len(ds.annotations.cells), 0)
    self.assertEqual(len(ds.annotations.columns), 0)
    self.assertEqual(len(ds.annotations.rows), 0)
    # Reload the datastore
    store = FileSystemDatastore(STORE_DIR)
    ds = store.get_dataset(ds.identifier)
    column_ids = [col.identifier for col in ds.columns]
    self.assertEqual(len(ds.columns), 2)
    for id in [0, 1]:
        self.assertTrue(id in column_ids)
    column_names = [col.name for col in ds.columns]
    for name in ['A', 'B']:
        self.assertTrue(name in column_names)
    rows = ds.fetch_rows()
    self.assertEqual(len(rows), 1)
    self.assertEqual(rows[0].values, ['a', 'b'])
    self.assertEqual(len(ds.annotations.cells), 0)
    self.assertEqual(len(ds.annotations.columns), 0)
    self.assertEqual(len(ds.annotations.rows), 0)
def test_validate_dataset(self): """Test the validate dataset function.""" columns = [] rows = [] # Empty dataset max_col_id, max_row_id = validate_dataset(columns, rows) self.assertEqual(max_col_id, -1) self.assertEqual(max_row_id, -1) max_col_id, max_row_id = validate_dataset( columns=columns, rows=rows ) self.assertEqual(max_col_id, -1) self.assertEqual(max_row_id, -1) # Valid set of columns and rows columns = [DatasetColumn(0, 'A'), DatasetColumn(10, 'B')] rows = [DatasetRow(0, [1, 2]), DatasetRow(4, [None, 2]), DatasetRow(2, [0, 0])] max_col_id, max_row_id = validate_dataset(columns, rows) self.assertEqual(max_col_id, 10) self.assertEqual(max_row_id, 4) max_col_id, max_row_id = validate_dataset( columns=columns, rows=rows ) self.assertEqual(max_col_id, 10) self.assertEqual(max_row_id, 4) # Column errors with self.assertRaises(ValueError): validate_dataset(columns + [DatasetColumn()], []) with self.assertRaises(ValueError): validate_dataset(columns + [DatasetColumn(10, 'C')], []) # Row errors with self.assertRaises(ValueError): validate_dataset(columns, rows + [DatasetRow(1000, [0, 1, 3])]) with self.assertRaises(ValueError): validate_dataset(columns, rows + [DatasetRow(-1, [1, 3])]) with self.assertRaises(ValueError): validate_dataset(columns, rows + [DatasetRow(0, [1, 3])])
def dataset_column_index(self): """Test the column by id index of the dataset handle.""" self.setup_fileserver() ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE)) # Ensure that the project data has three columns and two rows self.assertEqual(ds.column_by_id(0).name.upper(), 'NAME') self.assertEqual(ds.column_by_id(1).name.upper(), 'AGE') self.assertEqual(ds.column_by_id(2).name.upper(), 'SALARY') with self.assertRaises(ValueError): ds.column_by_id(5) ds.columns.append(DatasetColumn(identifier=5, name='NEWNAME')) self.assertEqual(ds.column_by_id(5).name.upper(), 'NEWNAME') with self.assertRaises(ValueError): ds.column_by_id(4)
def rename_column(self, identifier: str, column_id: int, name: str,
                  datastore: Datastore) -> VizualApiResult:
    """Rename column in a given dataset.

    Raises ValueError if no dataset with given identifier exists, if the
    specified column is unknown, or if the given column name is invalid.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    column_id: int
        Unique column identifier
    name: string
        New column name
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    # Raise ValueError if given column name is invalid
    if not is_valid_name(name):
        raise ValueError('invalid column name \'' + name + '\'')
    # Get dataset. Raise exception if dataset is unknown
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # Get the specified column that is to be renamed and set the column name
    # to the new name
    col_idx = dataset.get_index(column_id)
    if col_idx is None:
        raise ValueError('unknown column identifier \'' + str(column_id) + '\'')
    # Nothing needs to be changed if name does not differ from column name
    if dataset.columns[col_idx].name.lower() != name.lower():
        columns = list(dataset.columns)
        col = columns[col_idx]
        columns[col_idx] = DatasetColumn(identifier=col.identifier,
                                         name=name,
                                         data_type=col.data_type)
        # Store updated dataset to get new identifier
        ds = datastore.create_dataset(columns=columns,
                                      rows=dataset.fetch_rows(),
                                      properties={})
        return VizualApiResult(ds)
    else:
        return VizualApiResult(dataset)
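# Usage sketch (same assumptions as for filter_columns: `api` defines
# rename_column above, `datastore` is a FileSystemDatastore, and dataset
# `ds_id` has a column with identifier 0; the new name is hypothetical).
# Renaming a column to its current name (ignoring case) simply returns a
# result wrapping the original dataset, with no new version created.
result = api.rename_column(
    identifier=ds_id, column_id=0, name='CustomerName', datastore=datastore
)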
def create_dataset(self, name, dataset, backend_options=[]):
    """Create a new dataset with given name. Raises ValueError if a dataset
    with the given name already exists.

    Parameters
    ----------
    name : string
        Unique dataset name
    dataset : vizier.datastore.client.DatasetClient
        Dataset object

    Returns
    -------
    vizier.datastore.client.DatasetClient
    """
    # Raise an exception if a dataset with the given name already exists or
    # if the name is not valid
    if name.lower() in self.datasets:
        # Record access to the datasets
        raise ValueError('dataset \'' + name + '\' already exists')
    if not is_valid_name(name):
        raise ValueError('invalid dataset name \'' + name + '\'')
    # Create list of columns for new dataset. Ensure that every column has
    # a positive identifier
    columns = list()
    if len(dataset.columns) > 0:
        column_counter = max(
            max([col.identifier for col in dataset.columns]) + 1, 0)
        for col in dataset.columns:
            if col.identifier < 0:
                col.identifier = column_counter
                column_counter += 1
            columns.append(
                DatasetColumn(identifier=col.identifier,
                              name=col.name,
                              data_type=col.data_type))
    rows = dataset.rows
    # Write dataset to datastore and add new dataset to context
    ds = self.datastore.create_dataset(columns=columns,
                                       rows=rows,
                                       properties=dataset.properties,
                                       human_readable_name=name,
                                       backend_options=backend_options)
    self.datasets[name.lower()] = ds
    self.write.add(name.lower())
    return DatasetClient(dataset=self.datastore.get_dataset(ds.identifier),
                         client=self,
                         existing_name=name.lower())
def insert_column(self, identifier: str, position: int, name: str,
                  datastore: Datastore) -> VizualApiResult:
    """Insert column with given name at given position in dataset.

    Raises ValueError if no dataset with given identifier exists, if the
    specified column position is outside of the current schema bounds, or if
    the column name is invalid.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    position: int
        Index position at which the column will be inserted
    name: string
        New column name
    datastore : vizier.datastore.fs.base.FileSystemDatastore
        Datastore to retrieve and update datasets

    Returns
    -------
    vizier.engine.packages.vizual.api.VizualApiResult
    """
    # Raise ValueError if given column name is invalid
    if not name is None and not is_valid_name(name):
        raise ValueError('invalid column name \'' + name + '\'')
    # Get dataset. Raise exception if dataset is unknown
    dataset = datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # Make sure that position is a valid column index in the new dataset
    if position < 0 or position > len(dataset.columns):
        raise ValueError('invalid column index \'' + str(position) + '\'')
    # Insert new column into dataset
    columns = list(dataset.columns)
    rows = dataset.fetch_rows()
    columns.insert(
        position,
        DatasetColumn(identifier=dataset.max_column_id() + 1,
                      name=name if not name is None else ''))
    # Add a null value to each row for the new column
    for row in rows:
        row.values.insert(position, None)
    # Store updated dataset to get new identifier
    ds = datastore.create_dataset(columns=columns, rows=rows, properties={})
    return VizualApiResult(ds)
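# Usage sketch (same assumptions as above): append a column named 'C' at the
# end of the schema of dataset `ds_id`; every row receives a None value for it.
result = api.insert_column(
    identifier=ds_id,
    position=len(datastore.get_dataset(ds_id).columns),
    name='C',
    datastore=datastore
)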
def DATASET_COLUMNS(obj): """Convert a list of dictionaries into a list of dataset columns. Parameters ---------- obj: list List of dataset columns in default serialization format Returns ------- list """ return [ DatasetColumn(identifier=col[labels.ID], name=col[labels.NAME], data_type=col[labels.DATATYPE]) for col in obj ]
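# Round-trip sketch: building the input with the constants from the labels
# module avoids guessing the literal key strings used in the default
# serialization (the column values below are hypothetical).
obj = [
    {labels.ID: 0, labels.NAME: 'Name', labels.DATATYPE: 'varchar'},
    {labels.ID: 1, labels.NAME: 'Age', labels.DATATYPE: 'int'}
]
columns = DATASET_COLUMNS(obj)
assert [col.name for col in columns] == ['Name', 'Age']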
def test_update_annotations(self):
    """Test updating annotations via the datastore."""
    store = FileSystemDatastore(STORE_DIR)
    ds = store.create_dataset(
        columns=[
            DatasetColumn(identifier=0, name='A'),
            DatasetColumn(identifier=1, name='B')
        ],
        rows=[DatasetRow(identifier=0, values=['a', 'b'])],
        annotations=DatasetMetadata(
            cells=[
                DatasetAnnotation(column_id=0, row_id=0, key='X', value=1),
                DatasetAnnotation(column_id=0, row_id=0, key='X', value=2),
                DatasetAnnotation(column_id=1, row_id=0, key='X', value=3),
                DatasetAnnotation(column_id=0, row_id=0, key='Y', value=1)
            ],
            columns=[
                DatasetAnnotation(column_id=0, key='A', value='x'),
                DatasetAnnotation(column_id=1, key='A', value='x')
            ],
            rows=[
                DatasetAnnotation(row_id=0, key='E', value=100)
            ]
        )
    )
    # INSERT row annotations
    store.update_annotation(
        ds.identifier,
        key='D',
        row_id=0,
        new_value=200
    )
    annos = store.get_annotations(ds.identifier, row_id=0)
    self.assertEqual(len(annos.rows), 2)
    for key in ['D', 'E']:
        self.assertTrue(key in [a.key for a in annos.rows])
    for val in [100, 200]:
        self.assertTrue(val in [a.value for a in annos.rows])
    # UPDATE column annotation
    store.update_annotation(
        ds.identifier,
        key='A',
        column_id=1,
        old_value='x',
        new_value='y'
    )
    annos = store.get_annotations(ds.identifier, column_id=1)
    self.assertEqual(annos.columns[0].key, 'A')
    self.assertEqual(annos.columns[0].value, 'y')
    # DELETE cell annotation
    store.update_annotation(
        ds.identifier,
        key='X',
        column_id=0,
        row_id=0,
        old_value=2,
    )
    annos = store.get_annotations(ds.identifier, column_id=0, row_id=0)
    self.assertEqual(len(annos.cells), 2)
    for a in annos.cells:
        self.assertNotEqual(a.value, 2)
    result = store.update_annotation(
        ds.identifier,
        key='X',
        column_id=1,
        row_id=0,
        old_value=3,
    )
    self.assertTrue(result)
    annos = store.get_annotations(ds.identifier, column_id=1, row_id=0)
    self.assertEqual(len(annos.cells), 0)
def load_module(identifier, module_path, prev_state=None, object_store=None): """Load module from given object store. Parameters ---------- identifier: string Unique module identifier module_path: string Resource path for module object prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor) Dataset descriptors keyed by the user-provided name that exist in the database state of the previous moudle (in sequence of occurrence in the workflow) object_store: vizier.core.io.base.ObjectStore, optional Object store implementation to access and maintain resources Returns ------- vizier.viztrail.objectstore.module.OSModuleHandle """ # Make sure the object store is not None if object_store is None: object_store = DefaultObjectStore() # Read object from store. This may raise a ValueError to indicate that # the module does not exists (in a system error condtion). In this # case we return a new module that is in error state. try: obj = object_store.read_object(object_path=module_path) except ValueError: return OSModuleHandle( identifier=identifier, command=ModuleCommand(package_id=UNKNOWN_ID, command_id=UNKNOWN_ID), external_form='fatal error: object not found', module_path=module_path, state=mstate.MODULE_ERROR, object_store=object_store) # Create module command command = ModuleCommand(package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID], command_id=obj[KEY_COMMAND][KEY_COMMAND_ID], arguments=obj[KEY_COMMAND][KEY_ARGUMENTS]) # Create module timestamps created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT]) if KEY_STARTED_AT in obj[KEY_TIMESTAMP]: started_at = to_datetime(obj[KEY_TIMESTAMP][KEY_STARTED_AT]) else: started_at = None if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]: finished_at = to_datetime(obj[KEY_TIMESTAMP][KEY_FINISHED_AT]) else: finished_at = None timestamp = ModuleTimestamp(created_at=created_at, started_at=started_at, finished_at=finished_at) # Create module output streams. outputs = ModuleOutputs( stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]), stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR])) # Create module provenance information read_prov = None if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]: read_prov = dict() for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]: read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID] write_prov = None if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]: write_prov = dict() for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]: descriptor = DatasetDescriptor( identifier=ds[KEY_DATASET_ID], columns=[ DatasetColumn(identifier=col[KEY_COLUMN_ID], name=col[KEY_COLUMN_NAME], data_type=col[KEY_COLUMN_TYPE]) for col in ds[KEY_DATASET_COLUMNS] ], row_count=ds[KEY_DATASET_ROWCOUNT]) write_prov[ds[KEY_DATASET_NAME]] = descriptor delete_prov = None if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]: delete_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE] res_prov = None if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]: res_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES] charts_prov = None if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]: charts_prov = [ ChartViewHandle.from_dict(c) for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS] ] provenance = ModuleProvenance(read=read_prov, write=write_prov, delete=delete_prov, resources=res_prov, charts=charts_prov) # Create dictionary of dataset descriptors only if previous state is # given and the module is in SUCCESS state. Otherwise, the database # state is empty. 
if obj[KEY_STATE] == mstate.MODULE_SUCCESS and not prev_state is None: datasets = provenance.get_database_state(prev_state) else: datasets = dict() # Return module handle return OSModuleHandle(identifier=identifier, command=command, external_form=obj[KEY_EXTERNAL_FORM], module_path=module_path, state=obj[KEY_STATE], timestamp=timestamp, datasets=datasets, outputs=outputs, provenance=provenance, object_store=object_store)
def load_module( identifier: str, module_path: str, prev_state: Optional[Dict[str, ArtifactDescriptor]] = None, object_store: ObjectStore = DefaultObjectStore() ) -> "OSModuleHandle": """Load module from given object store. Parameters ---------- identifier: string Unique module identifier module_path: string Resource path for module object prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor) Dataset descriptors keyed by the user-provided name that exist in the database state of the previous moudle (in sequence of occurrence in the workflow) object_store: vizier.core.io.base.ObjectStore, optional Object store implementation to access and maintain resources Returns ------- vizier.viztrail.objectstore.module.OSModuleHandle """ # Make sure the object store is not None # Read object from store. This may raise a ValueError to indicate that # the module does not exists (in a system error condtion). In this # case we return a new module that is in error state. try: obj = cast(Dict[str, Any], object_store.read_object(object_path=module_path)) except ValueError: return OSModuleHandle( identifier=identifier, command=ModuleCommand( package_id=UNKNOWN_ID, command_id=UNKNOWN_ID, arguments=list(), packages=None ), external_form='fatal error: object not found', module_path=module_path, state=mstate.MODULE_ERROR, object_store=object_store ) # Create module command command = ModuleCommand( package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID], command_id=obj[KEY_COMMAND][KEY_COMMAND_ID], arguments=obj[KEY_COMMAND][KEY_ARGUMENTS], packages=None ) # Create module timestamps created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT]) if KEY_STARTED_AT in obj[KEY_TIMESTAMP]: started_at: Optional[datetime] = to_datetime(obj[KEY_TIMESTAMP][KEY_STARTED_AT]) else: started_at = None if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]: finished_at: Optional[datetime] = to_datetime(obj[KEY_TIMESTAMP][KEY_FINISHED_AT]) else: finished_at = None timestamp = ModuleTimestamp( created_at=created_at, started_at=started_at, finished_at=finished_at ) # Create module output streams. 
outputs = ModuleOutputs( stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]), stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR]) ) # Create module provenance information read_prov = None if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]: read_prov = dict() for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]: read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID] write_prov = None if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]: write_prov = dict() for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]: if KEY_DATAOBJECT_TYPE in ds: descriptor = ArtifactDescriptor( identifier=ds[KEY_DATAOBJECT_ID], name=ds[KEY_DATAOBJECT_NAME], artifact_type=ds[KEY_DATAOBJECT_TYPE]) else: descriptor = DatasetDescriptor( identifier=ds[KEY_DATASET_ID], name=ds[KEY_DATASET_NAME], columns=[ DatasetColumn( identifier=col[KEY_COLUMN_ID], name=col[KEY_COLUMN_NAME], data_type=col[KEY_COLUMN_TYPE] ) for col in ds[KEY_DATASET_COLUMNS] ] ) write_prov[ds[KEY_DATASET_NAME]] = descriptor if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]: delete_prov = set(obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE]) else: delete_prov = set() if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]: res_prov = cast(Dict[str, Any], obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES]) else: res_prov = dict() if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]: charts_prov = [ ( c[0], ChartViewHandle.from_dict(c[1]) # type: ignore[no-untyped-call] ) if isinstance(c, list) else ( "Chart", ChartViewHandle.from_dict(c) ) for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS] ] else: charts_prov = list() provenance = ModuleProvenance( read=read_prov, write=write_prov, delete=delete_prov, resources=res_prov, charts=charts_prov ) # Return module handle return OSModuleHandle( identifier=identifier, command=command, external_form=obj[KEY_EXTERNAL_FORM], module_path=module_path, state=obj[KEY_STATE], timestamp=timestamp, outputs=outputs, provenance=provenance, object_store=object_store, )
from vizier.viztrail.module.output import ModuleOutputs, OutputObject, TextOutput from vizier.viztrail.module.provenance import ModuleProvenance from vizier.viztrail.module.timestamp import ModuleTimestamp from vizier.engine.packages.plot.command import create_plot from vizier.engine.packages.pycell.command import python_cell MODULE_DIR = './.temp' DATASETS = { 'DS1': DatasetDescriptor(identifier='ID1'), 'DS2': DatasetDescriptor(identifier='ID2', columns=[ DatasetColumn(identifier=0, name='ABC', data_type='int'), DatasetColumn(identifier=1, name='xyz', data_type='real') ], row_count=100) } class TestOSModuleIO(unittest.TestCase): def setUp(self): """Create an empty directory.""" if os.path.isdir(MODULE_DIR): shutil.rmtree(MODULE_DIR) os.makedirs(MODULE_DIR)
from atexit import register as at_exit URLS = UrlFactory(base_url='http://localhost:5000/vizier-db/api/v1') api = VizierApiClient(URLS) PROJECT_ID = api.create_project({"name": "Test Client Datastore"}).identifier at_exit(api.delete_project, PROJECT_ID) # We're just doing some unit testing on the fields specific to DatastoreClient, so # ignore complaints about instantiating an abstract class store = DatastoreClient( # type: ignore[abstract] urls=DatastoreClientUrlFactory(urls=URLS, project_id=PROJECT_ID)) ds = store.create_dataset(columns=[ DatasetColumn(identifier=0, name='Name'), DatasetColumn(identifier=1, name='Age', data_type="int") ], rows=[ DatasetRow(identifier=0, values=['Alice', 32]), DatasetRow(identifier=1, values=['Bob', 23]) ], properties={"example_property": "foo"}) # print(ds) # print([col.identifier for col in ds.columns]) # print([col.name for col in ds.columns]) dh = store.get_dataset(ds.identifier) assert dh is not None for row in dh.fetch_rows():
from vizier.engine.packages.mimir.command import mimir_geocode from vizier.engine.packages.mimir.command import mimir_key_repair, mimir_missing_key from vizier.engine.packages.mimir.command import mimir_missing_value, mimir_picker from vizier.datastore.dataset import DatasetColumn, DatasetDescriptor import vizier.engine.packages.base as pckg import vizier.engine.packages.mimir.base as mimir import vizier.viztrail.command as md DATASETS = { 'ds': DatasetDescriptor(identifier='0000', name='ds', columns=[ DatasetColumn(identifier=2, name='Some Name'), DatasetColumn(identifier=1, name='Street') ]) } PACKAGE = pckg.PackageIndex(mimir.MIMIR_LENSES) class TestValidateMimir(unittest.TestCase): def test_mimir_geocode(self): """Test validation of Mimir geocode lens.""" cmd = mimir_geocode(dataset_name='ds', geocoder='GOOGLE', street=1, city=2, materialize_input=False, validate=True).to_external_form(
line += str(anno.row_id) line += ']: ' + anno.key + ' = ' + str(anno.value) print(line) store = DatastoreClient( urls=DatastoreClientUrlFactory( urls=UrlFactory( base_url='http://localhost:5000/vizier-db/api/v1' ), project_id=PROJECT_ID ) ) ds = store.create_dataset( columns=[DatasetColumn(identifier=0, name='Name'), DatasetColumn(identifier=1, name='Age')], rows=[DatasetRow(identifier=0, values=['Alice', 32]), DatasetRow(identifier=1, values=['Bob', 23])], annotations=DatasetMetadata(rows=[DatasetAnnotation(row_id=1, key='user:comment', value='Needs cleaning')]) ) print(ds) print([col.identifier for col in ds.columns]) print([col.name for col in ds.columns]) dh = store.get_dataset(ds.identifier) for row in dh.fetch_rows(): print([row.identifier] + row.values) annotations = dh.get_annotations() print_annotations(annotations)
def test_column_index(self): """Test access to columns based on identifier and name.""" ds = DatasetDescriptor(identifier='0', columns=[ DatasetColumn(identifier=0, name='ABC'), DatasetColumn(identifier=1, name='A'), DatasetColumn(identifier=2, name='ABC'), DatasetColumn(identifier=3, name='DEF'), DatasetColumn(identifier=4, name='xyz'), ]) # Get column by identifier self.assertEqual(ds.column_by_id(0).name, 'ABC') self.assertEqual(ds.column_by_id(1).name, 'A') self.assertEqual(ds.column_by_id(2).name, 'ABC') self.assertEqual(ds.column_by_id(3).name, 'DEF') self.assertEqual(ds.column_by_id(4).name, 'xyz') with self.assertRaises(ValueError): ds.column_by_id(6) with self.assertRaises(ValueError): ds.column_by_id(-1) # Get column by name self.assertEqual(ds.column_by_name('ABC').identifier, 0) self.assertEqual(ds.column_by_name('A').identifier, 1) self.assertEqual( ds.column_by_name('abc', ignore_case=True).identifier, 0) self.assertEqual( ds.column_by_name('XYZ', ignore_case=True).identifier, 4) self.assertIsNone(ds.column_by_name('4')) # Get column index self.assertEqual(ds.column_index(0), 0) self.assertEqual(ds.column_index(1), 1) self.assertEqual(ds.column_index('DEF'), 3) self.assertEqual(ds.column_index('XYZ'), 4) self.assertEqual(ds.column_index('A'), 1) self.assertEqual(ds.column_index('B'), 1) self.assertEqual(ds.column_index('C'), 2) self.assertEqual(ds.column_index('D'), 3) self.assertEqual(ds.column_index('E'), 4) for i in range(len(ds.columns)): self.assertEqual(ds.get_index(i), i) with self.assertRaises(ValueError): ds.column_index('ABC') with self.assertRaises(ValueError): ds.column_index('abc') # Create a descriptor when column identifier does not match the index # position in the schema ds = DatasetDescriptor(identifier='0', columns=[ DatasetColumn(identifier=4, name='ABC'), DatasetColumn(identifier=2, name='A'), DatasetColumn(identifier=3, name='ABC'), DatasetColumn(identifier=0, name='DEF'), DatasetColumn(identifier=1, name='xyz'), ]) self.assertEqual(ds.column_by_id(0).name, 'DEF') self.assertEqual(ds.column_by_id(1).name, 'xyz') self.assertEqual(ds.column_by_id(2).name, 'A') self.assertEqual(ds.column_by_id(3).name, 'ABC') self.assertEqual(ds.column_by_id(4).name, 'ABC') self.assertEqual(ds.column_index(0), 0) self.assertEqual(ds.column_index(1), 1) self.assertEqual(ds.column_index('DEF'), 3) self.assertEqual(ds.column_index('XYZ'), 4) self.assertEqual(ds.column_index('A'), 1) self.assertEqual(ds.column_index('B'), 1) self.assertEqual(ds.column_index('C'), 2) self.assertEqual(ds.column_index('D'), 3) self.assertEqual(ds.column_index('E'), 4) self.assertEqual(ds.get_index(0), 3) self.assertEqual(ds.get_index(1), 4) self.assertEqual(ds.get_index(2), 1) self.assertEqual(ds.get_index(3), 2) self.assertEqual(ds.get_index(4), 0)