def test_default_json_reader(self):
    """Test functionality of the JSON dataset reader."""
    reader = DefaultJsonDatasetReader(JSON_FILE)
    # The reader is not iterable until it has been opened.
    with self.assertRaises(StopIteration):
        next(reader)
    count = 0
    with reader.open() as r:
        for row in r:
            self.assertEqual(len(row.values), 3)
            self.assertEqual(row.identifier, count)
            count += 1
    self.assertEqual(count, 2)
    # Once closed, the reader raises StopIteration again.
    with self.assertRaises(StopIteration):
        next(reader)
    # Create a new dataset and read it back.
    tmp_file = tempfile.mkstemp()[1]
    reader = DefaultJsonDatasetReader(tmp_file)
    values = ['A', 'B', 1, 2]
    rows = [
        DatasetRow(0, values),
        DatasetRow(1, values),
        DatasetRow(2, values)
    ]
    reader.write(rows)
    count = 0
    with reader.open() as r:
        for row in r:
            self.assertEqual(len(row.values), 4)
            self.assertEqual(row.identifier, count)
            count += 1
    self.assertEqual(count, len(rows))
    os.remove(tmp_file)
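
Usage sketch (not part of the source): the same write/read round trip as a
standalone script. The import paths are assumptions based on the module paths
named in the docstrings of the examples below.

import os
import tempfile

from vizier.datastore.dataset import DatasetRow
from vizier.datastore.reader import DefaultJsonDatasetReader

# Write two rows to a fresh temporary file, then read them back.
tmp_file = tempfile.mkstemp()[1]
reader = DefaultJsonDatasetReader(tmp_file)
reader.write([DatasetRow(0, ['A', 'B']), DatasetRow(1, ['C', 'D'])])
with reader.open() as r:
    for row in r:
        print(row.identifier, row.values)
os.remove(tmp_file)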
Example 2
    def load_dataset(
            self,
            f_handle: FileHandle,
            proposed_schema: List[Tuple[str, str]] = []
    ) -> FileSystemDatasetHandle:
        """Create a new dataset from a given file.

        Raises ValueError if the given file could not be loaded as a dataset.

        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle
            Handle for an uploaded file
        proposed_schema : list((string, string)), optional
            Proposed schema for the new dataset, given as a list of string
            pairs (column name and type)

        Returns
        -------
        vizier.datastore.fs.dataset.FileSystemDatasetHandle
        """
        # The file handle might be None in which case an exception is raised
        if f_handle is None:
            raise ValueError('unknown file')
        # Expects a file in a supported tabular data format.
        if not f_handle.is_tabular:
            raise ValueError('cannot create dataset from file \'' +
                             f_handle.name + '\'')
        # Open the file as a csv file. Expects that the first row contains the
        # column names. Read dataset schema and dataset rows into two separate
        # lists.
        columns: List[DatasetColumn] = []
        rows: List[DatasetRow] = []
        with f_handle.open() as csvfile:
            reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
            for col_name in next(reader):
                columns.append(
                    DatasetColumn(identifier=len(columns),
                                  name=col_name.strip()))
            for row in reader:
                values = [cast(v.strip()) for v in row]
                rows.append(
                    DatasetRow(identifier=str(len(rows)), values=values))
        # Get unique identifier and create subfolder for the new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Create dataset and write the descriptor to file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=len(rows) - 1)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        return dataset
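
Usage sketch (hypothetical, not part of the source): loading an uploaded CSV
file as a dataset. Here `filestore` and `datastore` stand in for already
configured store instances, and `upload_file` is a placeholder for however
the file handle is obtained.

# `people.csv` is a tabular file whose first row holds the column names.
f_handle = filestore.upload_file('people.csv')
dataset = datastore.load_dataset(f_handle=f_handle)
print(dataset.identifier, dataset.row_count)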
Example 3
    def create_dataset(self, columns, rows, annotations=None):
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Raises ValueError if (1) the column identifiers are not unique, (2)
        the row identifiers are not unique, (3) the number of columns and
        values in a row do not match, (4) any of the column or row
        identifiers have a negative value, or (5) the given column or row
        counter has a value lower than or equal to any of the column or row
        identifiers.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Validate (i) that each column has a unique identifier, (ii) each row
        # has a unique identifier, and (iii) that every row has exactly one
        # value per column.
        _, max_row_id = validate_dataset(columns=columns, rows=rows)
        # Get new identifier and create directory for new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Filter annotations for non-existing resources
        if annotations is not None:
            annotations = annotations.filter(
                columns=[c.identifier for c in columns],
                rows=[r.identifier for r in rows])
        # Create dataset and write the dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=max_row_id,
                                          annotations=annotations)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        # Write metadata file if annotations are given
        if annotations is not None:
            dataset.annotations.to_file(self.get_metadata_filename(identifier))
        # Return handle for new dataset
        return DatasetDescriptor(identifier=dataset.identifier,
                                 columns=dataset.columns,
                                 row_count=dataset.row_count)
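
Usage sketch (not part of the source): creating a small dataset directly from
columns and rows. `datastore` is an assumed, pre-configured instance of the
file-system datastore; the constructor calls follow the keyword usage seen in
the examples above.

from vizier.datastore.dataset import DatasetColumn, DatasetRow

columns = [
    DatasetColumn(identifier=0, name='Name'),
    DatasetColumn(identifier=1, name='Age')
]
rows = [
    DatasetRow(identifier=0, values=['Alice', 23]),
    DatasetRow(identifier=1, values=['Bob', 32])
]
descriptor = datastore.create_dataset(columns=columns, rows=rows)
print(descriptor.identifier, descriptor.row_count)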
Example 4
    def reader(self, offset=0, limit=-1):
        """Get reader for the dataset to access the dataset rows. The optional
        offset amd limit parameters are used to retrieve only a subset of
        rows.

        Parameters
        ----------
        offset: int, optional
            Number of rows at the beginning of the list that are skipped.
        limit: int, optional
            Limits the number of rows that are returned.

        Returns
        -------
        vizier.datastore.reader.DefaultJsonDatasetReader
        """
        return DefaultJsonDatasetReader(self.data_file,
                                        columns=self.columns,
                                        offset=offset,
                                        limit=limit)
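
Usage sketch (not part of the source): paging through a dataset ten rows at a
time with the reader above. `dataset` is assumed to be a
FileSystemDatasetHandle, e.g. as returned by create_dataset.

offset = 0
while True:
    with dataset.reader(offset=offset, limit=10).open() as r:
        page = list(r)
    if not page:
        break
    for row in page:
        print(row.identifier, row.values)
    offset += len(page)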
Example 5
    def create_dataset(
            self,
            columns: List[DatasetColumn],
            rows: List[DatasetRow],
            properties: Optional[Dict[str, Any]] = None,
            human_readable_name: str = "Untitled Dataset",
            backend_options: Optional[List[Tuple[str, str]]] = None,
            dependencies: Optional[List[str]] = None) -> DatasetDescriptor:
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Raises ValueError if (1) the column identifiers are not unique, (2)
        the row identifiers are not unique, (3) the number of columns and
        values in a row do not match, (4) any of the column or row
        identifiers have a negative value, or (5) the given column or row
        counter has a value lower than or equal to any of the column or row
        identifiers.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        properties: dict(string, ANY), optional
            Properties for dataset components
        human_readable_name: string, optional
            Display name for the new dataset
        backend_options: list((string, string)), optional
            Backend-specific options (not used by this implementation)
        dependencies: list(string), optional
            Identifiers of datasets the new dataset depends on (not used by
            this implementation)

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        properties = {} if properties is None else properties
        dependencies = [] if dependencies is None else dependencies
        # Assign fresh identifiers to rows that do not have a valid
        # (non-negative) one. Fresh identifiers start above the largest
        # existing row identifier so that they cannot collide with it.
        identifiers = set(
            int(row.identifier) for row in rows
            if row.identifier is not None and int(row.identifier) >= 0)
        identifiers.add(0)
        max_row_id = max(identifiers)
        rows = [
            DatasetRow(identifier=row.identifier
                       if row.identifier is not None
                       and int(row.identifier) >= 0
                       else str(idx + max_row_id + 1),
                       values=row.values,
                       caveats=row.caveats) for idx, row in enumerate(rows)
        ]
        # Validate (i) that each column has a unique identifier, (ii) each row
        # has a unique identifier, and (iii) that every row has exactly one
        # value per column.
        _, max_row_id = validate_dataset(columns=columns, rows=rows)
        # Get new identifier and create directory for new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Create dataset and write the dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=max_row_id,
                                          properties=properties)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        # Write the properties file. Note that properties can no longer be
        # None at this point since it was defaulted to an empty dict above.
        if properties is not None:
            dataset.write_properties_to_file(
                self.get_properties_filename(identifier))
        # Return handle for new dataset
        return DatasetDescriptor(identifier=dataset.identifier,
                                 name=human_readable_name,
                                 columns=dataset.columns)
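
Usage sketch (not part of the source): the typed variant with per-dataset
properties. `datastore` is an assumed instance of the store that defines the
method above; row identifiers are strings here, matching the method body, and
DatasetRow is assumed to default its caveats when none are given.

descriptor = datastore.create_dataset(
    columns=[DatasetColumn(identifier=0, name='City')],
    rows=[DatasetRow(identifier='0', values=['Berlin'])],
    properties={'source': 'manual entry'},
    human_readable_name='Cities')
print(descriptor.identifier)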
Example 6
    def create_dataset(self,
                       identifier=None,
                       columns=None,
                       rows=None,
                       column_counter=None,
                       row_counter=None,
                       annotations=None):
        """Create a new dataset in the data store for the given data.

        Raises ValueError if (1) any of the column or row identifiers have a
        negative value, or (2) the given column or row counter has a value
        lower than or equal to any of the column or row identifiers.

        Parameters
        ----------
        identifier: string, optional
            Unique dataset identifier
        columns: list(vizier.datastore.base.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.base.DatasetRow)
            List of dataset rows.
        column_counter: int, optional
            Counter to generate unique column identifier
        row_counter: int, optional
            Counter to generate unique row identifier
        annotations: vizier.datastore.metadata.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.fs.FileSystemDatasetHandle
        """
        # Set columns and rows if not given
        if columns is None:
            columns = list()
        if rows is None:
            rows = list()
        else:
            # Validate the number of values in the given rows
            validate_schema(columns, rows)
        # Validate that all column identifier are smaller that the given
        # column counter
        if column_counter is not None:
            for col in columns:
                if col.identifier >= column_counter:
                    raise ValueError('invalid column counter')
        else:
            # Set column counter to max. column identifier + 1
            column_counter = -1
            for col in columns:
                if col.identifier > column_counter:
                    column_counter = col.identifier
            column_counter += 1
        # Validate that all row ids are non-negative, unique, and lower than
        # the given row_counter
        max_rowid = -1
        row_ids = set()
        for row in rows:
            if row.identifier < 0:
                raise ValueError('invalid row identifier \'' +
                                 str(row.identifier) + '\'')
            elif row_counter is not None and row.identifier >= row_counter:
                raise ValueError('invalid row counter')
            elif row.identifier in row_ids:
                raise ValueError('duplicate row identifier \'' +
                                 str(row.identifier) + '\'')
            row_ids.add(row.identifier)
            if row_counter is None and row.identifier > max_rowid:
                max_rowid = row.identifier
        if row_counter is None:
            row_counter = max_rowid + 1
        # Use the given identifier if one was provided; otherwise create a
        # new unique identifier. Then create the directory for the new dataset
        if identifier is None:
            identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        datafile = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(datafile).write(rows)
        # Create dataset and write the dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          row_count=len(rows),
                                          datafile=datafile,
                                          column_counter=column_counter,
                                          row_counter=row_counter,
                                          annotations=annotations)
        dataset.to_file(os.path.join(dataset_dir, HANDLE_FILE))
        # Write metadata file
        dataset.annotations.to_file(os.path.join(dataset_dir, METADATA_FILE))
        # Return handle for new dataset
        return dataset
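
Usage sketch (not part of the source): the legacy signature with explicit
counters. The counters must be strictly greater than every existing column
and row identifier, or the method raises ValueError. `store` is an assumed
instance of this datastore.

dataset = store.create_dataset(
    columns=[DatasetColumn(identifier=0, name='Name')],
    rows=[DatasetRow(identifier=0, values=['Alice'])],
    column_counter=1,
    row_counter=1)
print(dataset.identifier, dataset.row_count)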