def test_default_json_reader(self):
    """Test functionality of the JSON dataset reader.

    Verifies that (i) iterating a reader outside of an open() context
    raises StopIteration, (ii) rows read from JSON_FILE have the expected
    value count and sequential identifiers, and (iii) rows written via
    write() can be read back unchanged from a temporary file.
    """
    reader = DefaultJsonDatasetReader(JSON_FILE)
    # The reader must not yield rows before open() is called
    with self.assertRaises(StopIteration):
        next(reader)
    count = 0
    with reader.open() as r:
        for row in r:
            self.assertEqual(len(row.values), 3)
            self.assertEqual(row.identifier, count)
            count += 1
    self.assertEqual(count, 2)
    # After the context exits the reader is exhausted/closed again
    with self.assertRaises(StopIteration):
        next(reader)
    # Create a new dataset and read it back.
    # mkstemp() returns (fd, path); close the OS-level descriptor to
    # avoid leaking it (the original code discarded it unclosed).
    fd, tmp_file = tempfile.mkstemp()
    os.close(fd)
    try:
        reader = DefaultJsonDatasetReader(tmp_file)
        values = ['A', 'B', 1, 2]
        rows = [
            DatasetRow(0, values),
            DatasetRow(1, values),
            DatasetRow(2, values)
        ]
        reader.write(rows)
        count = 0
        # Use a distinct name for the opened reader instead of shadowing
        # the outer 'reader' variable
        with reader.open() as r:
            for row in r:
                self.assertEqual(len(row.values), 4)
                self.assertEqual(row.identifier, count)
                count += 1
        self.assertEqual(count, len(rows))
    finally:
        # Always remove the temporary file, even if an assertion fails
        os.remove(tmp_file)
def load_dataset(
        self,
        f_handle: FileHandle,
        proposed_schema: Optional[List[Tuple[str, str]]] = None
) -> FileSystemDatasetHandle:
    """Create a new dataset from a given file.

    Raises ValueError if the given file could not be loaded as a dataset.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle
        Handle for an uploaded file
    proposed_schema: list((string, string)), optional
        Optional list of (column name, type) pairs proposing a schema
        for the dataset. Currently unused by this loader.

    Returns
    -------
    vizier.datastore.fs.dataset.FileSystemDatasetHandle
    """
    # Avoid the shared mutable-default-argument pitfall: normalize the
    # default to a fresh list per call.
    proposed_schema = [] if proposed_schema is None else proposed_schema
    # The file handle might be None in which case an exception is raised
    if f_handle is None:
        raise ValueError('unknown file')
    # Expects a file in a supported tabular data format.
    if not f_handle.is_tabular:
        raise ValueError('cannot create dataset from file \'' + f_handle.name + '\'')
    # Open the file as a csv file. Expects that the first row contains the
    # column names. Read dataset schema and dataset rows into two separate
    # lists.
    columns: List[DatasetColumn] = []
    rows: List[DatasetRow] = []
    with f_handle.open() as csvfile:
        reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
        for col_name in next(reader):
            columns.append(
                DatasetColumn(
                    identifier=len(columns),
                    name=col_name.strip()))
        for row in reader:
            values = [cast(v.strip()) for v in row]
            rows.append(
                DatasetRow(identifier=str(len(rows)), values=values))
    # Get unique identifier and create subfolder for the new dataset
    identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    data_file = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(data_file).write(rows)
    # Create dataset and write descriptor to file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        data_file=data_file,
        row_count=len(rows),
        max_row_id=len(rows) - 1)
    dataset.to_file(
        descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
    return dataset
def create_dataset(self, columns, rows, annotations=None):
    """Create a new dataset in the datastore. Expects at least the list of
    columns and the rows for the dataset.

    Raises ValueError if (1) the column identifiers are not unique, (2) the
    row identifiers are not unique, (3) the number of columns and values in
    a row do not match, (4) any of the column or row identifiers have a
    negative value, or (5) if the given column or row counter have value
    lower or equal to any of the column or row identifiers.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    # Validate (i) that each column has a unique identifier, (ii) each row
    # has a unique identifier, and (iii) that every row has exactly one
    # value per column.
    _, max_row_id = validate_dataset(columns=columns, rows=rows)
    # Get new identifier and create directory for new dataset
    identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    data_file = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(data_file).write(rows)
    # Filter annotations for non-existing resources
    if annotations is not None:
        annotations = annotations.filter(
            columns=[c.identifier for c in columns],
            rows=[r.identifier for r in rows])
    # Create dataset and write dataset file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        data_file=data_file,
        row_count=len(rows),
        max_row_id=max_row_id,
        annotations=annotations)
    dataset.to_file(
        descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
    # Write metadata file if annotations are given
    if annotations is not None:
        dataset.annotations.to_file(self.get_metadata_filename(identifier))
    # Return handle for new dataset
    return DatasetDescriptor(
        identifier=dataset.identifier,
        columns=dataset.columns,
        row_count=dataset.row_count)
def reader(self, offset=0, limit=-1):
    """Return a reader over the rows of this dataset.

    The optional offset and limit parameters restrict the reader to a
    subset of the dataset rows.

    Parameters
    ----------
    offset: int, optional
        Number of rows at the beginning of the list that are skipped.
    limit: int, optional
        Limits the number of rows that are returned.

    Returns
    -------
    vizier.datastore.reader.DefaultJsonDatasetReader
    """
    return DefaultJsonDatasetReader(
        self.data_file,
        columns=self.columns,
        offset=offset,
        limit=limit
    )
def create_dataset(
        self,
        columns: List[DatasetColumn],
        rows: List[DatasetRow],
        properties: Optional[Dict[str, Any]] = None,
        human_readable_name: str = "Untitled Dataset",
        backend_options: Optional[List[Tuple[str, str]]] = None,
        dependencies: Optional[List[str]] = None) -> DatasetDescriptor:
    """Create a new dataset in the datastore. Expects at least the list of
    columns and the rows for the dataset.

    Raises ValueError if (1) the column identifiers are not unique, (2) the
    row identifiers are not unique, (3) the number of columns and values in
    a row do not match, (4) any of the column or row identifiers have a
    negative value, or (5) if the given column or row counter have value
    lower or equal to any of the column or row identifiers.

    Parameters
    ----------
    columns: list(vizier.datastore.dataset.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.dataset.DatasetRow)
        List of dataset rows.
    properties: dict(string, ANY), optional
        Properties for dataset components
    human_readable_name: string, optional
        Display name for the dataset descriptor.
    backend_options: list((string, string)), optional
        Backend-specific options; unused by this datastore.
    dependencies: list(string), optional
        Identifiers of datasets this one depends on; unused here.

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    properties = {} if properties is None else properties
    dependencies = [] if dependencies is None else dependencies
    # Assign fresh identifiers to rows that do not have a valid one.
    # Fresh ids start strictly above the largest existing id so they can
    # never collide with an id that a later row already carries. (The
    # previous implementation used str(idx + max_row_id), which could
    # reuse an existing identifier when a row without an id preceded a
    # row whose id equalled the maximum.)
    max_existing_id = max(
        (int(row.identifier) for row in rows
         if row.identifier is not None and int(row.identifier) >= 0),
        default=-1)
    next_row_id = max_existing_id + 1
    fixed_rows: List[DatasetRow] = []
    for row in rows:
        if row.identifier is not None and int(row.identifier) >= 0:
            fixed_rows.append(row)
        else:
            fixed_rows.append(
                DatasetRow(
                    identifier=str(next_row_id),
                    values=row.values,
                    caveats=row.caveats))
            next_row_id += 1
    rows = fixed_rows
    # Validate (i) that each column has a unique identifier, (ii) each row
    # has a unique identifier, and (iii) that every row has exactly one
    # value per column.
    _, max_row_id = validate_dataset(columns=columns, rows=rows)
    # Get new identifier and create directory for new dataset
    identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    data_file = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(data_file).write(rows)
    # Create dataset and write dataset file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        data_file=data_file,
        row_count=len(rows),
        max_row_id=max_row_id,
        properties=properties)
    dataset.to_file(
        descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
    # Write the properties file. Note that 'properties' was normalized to
    # a dict above, so the previous 'if properties is not None' guard was
    # always true and has been removed.
    dataset.write_properties_to_file(
        self.get_properties_filename(identifier))
    # Return handle for new dataset
    return DatasetDescriptor(
        identifier=dataset.identifier,
        name=human_readable_name,
        columns=dataset.columns)
def create_dataset(self, identifier=None, columns=None, rows=None,
                   column_counter=None, row_counter=None, annotations=None):
    """Create a new dataset in the data store for the given data.

    Raises ValueError if (1) any of the column or row identifiers have a
    negative value, or (2) if the given column or row counter have value
    lower or equal to any of the column or row identifiers.

    Parameters
    ----------
    identifier: string, optional
        Unique dataset identifier. If None a new unique identifier is
        generated.
    columns: list(vizier.datastore.base.DatasetColumn)
        List of columns. It is expected that each column has a unique
        identifier.
    rows: list(vizier.datastore.base.DatasetRow)
        List of dataset rows.
    column_counter: int, optional
        Counter to generate unique column identifier
    row_counter: int, optional
        Counter to generate unique row identifier
    annotations: vizier.datastore.metadata.DatasetMetadata, optional
        Annotations for dataset components

    Returns
    -------
    vizier.datastore.fs.FileSystemDatasetHandle
    """
    # Set columns and rows if not given
    if columns is None:
        columns = list()
    if rows is None:
        rows = list()
    else:
        # Validate the number of values in the given rows
        validate_schema(columns, rows)
    if column_counter is not None:
        # Validate that all column identifiers are smaller than the given
        # column counter
        for col in columns:
            if col.identifier >= column_counter:
                raise ValueError('invalid column counter')
    else:
        # Set column counter to max. column identifier + 1
        column_counter = max(
            (col.identifier for col in columns), default=-1) + 1
    # Validate that all row ids are non-negative, unique, and lower than
    # the given row_counter
    max_rowid = -1
    row_ids = set()
    for row in rows:
        if row.identifier < 0:
            raise ValueError('invalid row identifier \'' + str(row.identifier) + '\'')
        elif row_counter is not None and row.identifier >= row_counter:
            raise ValueError('invalid row counter')
        elif row.identifier in row_ids:
            raise ValueError('duplicate row identifier \'' + str(row.identifier) + '\'')
        row_ids.add(row.identifier)
        if row_counter is None and row.identifier > max_rowid:
            max_rowid = row.identifier
    if row_counter is None:
        row_counter = max_rowid + 1
    # Honor a caller-supplied identifier; only generate a new one if none
    # was given. (Previously the parameter was silently overwritten.)
    if identifier is None:
        identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    datafile = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(datafile).write(rows)
    # Create dataset and write dataset file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        row_count=len(rows),
        datafile=datafile,
        column_counter=column_counter,
        row_counter=row_counter,
        annotations=annotations)
    dataset.to_file(os.path.join(dataset_dir, HANDLE_FILE))
    # Write metadata file
    dataset.annotations.to_file(os.path.join(dataset_dir, METADATA_FILE))
    # Return handle for new dataset
    return dataset