def next(self):
    """Return the next row in the dataset iterator. Raises StopIteration if
    end of file is reached or file has been closed.

    Automatically closes any open file when end of iteration is reached for
    the first time.

    Returns
    -------
    vizier.datastore.base.DatasetRow
    """
    if self.is_open:
        # Catch exception to close any open file
        try:
            row = self.reader.next()
            if self.has_row_ids:
                row = DatasetRow(int(row[0]), row[1:])
            else:
                row = DatasetRow(self.line_count, row)
            self.line_count += 1
            return row
        except StopIteration as ex:
            self.close()
            raise ex
    raise StopIteration
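# Hedged usage sketch for the iterator above. The reader class name and its
# constructor arguments are assumptions for illustration; open() is expected
# to return the reader itself so it can serve as a context manager, and
# iteration ends (closing the file) once StopIteration is raised.
reader = DefaultCsvDatasetReader('dataset.csv')  # hypothetical class and args
with reader.open() as r:
    for row in r:  # each row is a vizier.datastore.base.DatasetRow
        print(row.identifier, row.values)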
def test_default_json_reader(self):
    """Test functionality of Json dataset reader."""
    reader = DefaultJsonDatasetReader(JSON_FILE)
    with self.assertRaises(StopIteration):
        reader.next()
    count = 0
    with reader.open() as r:
        for row in r:
            self.assertEquals(len(row.values), 3)
            self.assertEquals(row.identifier, count)
            count += 1
    self.assertEquals(count, 2)
    with self.assertRaises(StopIteration):
        reader.next()
    # Create a new dataset and read it
    tmp_file = tempfile.mkstemp()[1]
    reader = DefaultJsonDatasetReader(tmp_file)
    values = ['A', 'B', 1, 2]
    rows = [
        DatasetRow(0, values),
        DatasetRow(1, values),
        DatasetRow(2, values)
    ]
    reader.write(rows)
    count = 0
    with reader.open() as reader:
        for row in reader:
            self.assertEquals(len(row.values), 4)
            self.assertEquals(row.identifier, count)
            count += 1
    self.assertEquals(count, len(rows))
    os.remove(tmp_file)
def next(self):
    """Return the next row in the dataset iterator. Raises StopIteration if
    end of file is reached or file has been closed.

    Automatically closes any open file when end of iteration is reached for
    the first time.

    Returns
    -------
    vizier.datastore.base.DatasetRow
    """
    if self.is_open:
        if self.read_index < len(self.rows):
            row = DatasetRow.from_dict(self.rows[self.read_index])
            # Set the annotation flags in the dataset row
            if self.annotations is not None:
                for i in range(len(self.columns)):
                    col = self.columns[i]
                    has_anno = self.annotations.has_cell_annotation(
                        col.identifier, row.identifier
                    )
                    if has_anno:
                        row.cell_annotations[i] = True
            self.read_index += 1
            return row
    raise StopIteration
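# Sketch of consuming the cell annotation flags that next() sets above,
# assuming rows expose cell_annotations as a list of booleans aligned with
# the reader's columns (variable names here are illustrative, not the
# confirmed API).
with reader.open() as r:
    for row in r:
        flagged = [i for i, a in enumerate(row.cell_annotations) if a]
        if flagged:
            print('row', row.identifier, 'annotated in columns', flagged)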
def from_file(f_handle):
    """Read dataset from file. Expects a verified CSV/TSV file in which the
    first row contains the column names.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle
        Handle for an uploaded file on a file server

    Returns
    -------
    vizier.datastore.base.Dataset
    """
    # Expects a CSV/TSV file. The first row contains the column names.
    if not f_handle.is_verified_csv:
        raise ValueError(
            'failed to create dataset from file \'' + f_handle.name + '\''
        )
    # Read all information and return an InMemDatasetHandle
    columns = []
    rows = []
    with f_handle.open() as csvfile:
        reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
        for col_name in reader.next():
            columns.append(DatasetColumn(len(columns), col_name.strip()))
        for row in reader:
            values = [cast(v.strip()) for v in row]
            rows.append(DatasetRow(len(rows), values))
    # Return InMemDatasetHandle
    return InMemDatasetHandle(
        identifier=get_unique_identifier(),
        columns=columns,
        rows=rows,
        column_counter=len(columns),
        row_counter=len(rows)
    )
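# Hedged example of creating an in-memory dataset from an uploaded file via
# from_file(). How the FileHandle is obtained is an assumption; any object
# satisfying vizier.filestore.base.FileHandle should work.
f_handle = filestore.get_file(file_id)  # hypothetical file server lookup
dataset = InMemDatasetHandle.from_file(f_handle)
print(dataset.identifier, [col.name for col in dataset.columns])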
def get_dataset(self, identifier):
    """Read a full dataset from the data store. Returns None if no dataset
    with the given identifier exists.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier

    Returns
    -------
    vizier.datastore.base.DatasetHandle
    """
    if identifier in self.datasets:
        dataset = self.datasets[identifier]
        return InMemDatasetHandle(
            identifier=identifier,
            columns=[
                DatasetColumn(col.identifier, col.name)
                for col in dataset.columns
            ],
            rows=[
                DatasetRow(row.identifier, list(row.values))
                for row in dataset.fetch_rows()
            ],
            column_counter=dataset.column_counter,
            row_counter=dataset.row_counter,
            annotations=dataset.annotations.copy_metadata()
        )
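# Note that get_dataset() hands out a copy: columns, rows, and annotation
# metadata are all duplicated, so callers can mutate the returned handle
# without touching the stored dataset. A minimal sketch (datastore and ds_id
# are assumed to exist):
ds = datastore.get_dataset(ds_id)
if ds is not None:
    local_rows = ds.fetch_rows()
    local_rows[0].values[0] = 'changed locally'  # stored dataset unaffected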
def filter_columns(self, identifier, columns, names):
    """Dataset projection operator. Returns a copy of the dataset with the
    given identifier that contains only those columns listed in columns.
    The list of names contains optional new names for the filtered columns.
    A value of None in names indicates that the name of the corresponding
    column is not changed.

    Returns the number of rows in the dataset and the identifier of the
    projected dataset.

    Raises ValueError if no dataset with given identifier exists or if any
    of the filter columns are unknown.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier
    columns: list(int)
        List of column identifiers for columns in the result.
    names: list(string)
        Optional new names for filtered columns.

    Returns
    -------
    int, string
    """
    # Get dataset. Raise exception if dataset is unknown
    dataset = self.datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # The schema of the new dataset only contains the columns in the given
    # list. Keep track of their index positions to filter values.
    schema = list()
    val_filter = list()
    for i in range(len(columns)):
        col_idx = get_index_for_column(dataset, columns[i])
        col = dataset.columns[col_idx]
        if names[i] is not None:
            schema.append(
                DatasetColumn(identifier=col.identifier, name=names[i])
            )
        else:
            schema.append(col)
        val_filter.append(col_idx)
    # Create a list of projected rows
    rows = list()
    for row in dataset.fetch_rows():
        values = list()
        for v_idx in val_filter:
            values.append(row.values[v_idx])
        rows.append(DatasetRow(identifier=row.identifier, values=values))
    # Store updated dataset to get new identifier
    ds = self.datastore.create_dataset(
        columns=schema,
        rows=rows,
        column_counter=dataset.column_counter,
        row_counter=dataset.row_counter,
        annotations=dataset.annotations.filter_columns(columns)
    )
    return len(rows), ds.identifier
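# Illustrative call to the projection operator above: keep two columns,
# rename the second, and receive the row count together with the identifier
# of the new dataset version (engine and ds_id are assumptions).
row_count, new_id = engine.filter_columns(
    ds_id, columns=[0, 2], names=[None, 'price']
)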
def open(self):
    """Setup the reader by querying the database and creating an in-memory
    copy of the dataset rows.

    Returns
    -------
    vizier.datastore.reader.MimirDatasetReader
    """
    # Query the database to retrieve dataset rows if reader is not already
    # open
    if not self.is_open:
        # Query the database to get the list of rows. Sort rows according
        # to order in row_ids and return a InMemReader
        sql = get_select_query(self.table_name, columns=self.columns)
        if self.rowid is not None:
            sql += ' WHERE ROWID() = ' + str(self.rowid)
        if self.is_range_query:
            sql += ' LIMIT ' + str(self.limit) + ' OFFSET ' + str(self.offset)
        rs = json.loads(
            mimir._mimir.vistrailsQueryMimirJson(sql, True, False)
        )
        self.row_ids = rs['prov']
        # Initialize mapping of column rdb names to index positions in
        # dataset rows
        self.col_map = dict()
        for i in range(len(rs['schema'])):
            col = rs['schema'][i]
            self.col_map[col['name']] = i
        # Initialize rows (make sure to sort them according to order in
        # row_ids list), read index and open flag
        rowid_idx = self.col_map[ROW_ID]
        # Filter rows if this is a range query (needed until IN works)
        rs_rows = rs['data']
        self.rows = list()
        for row_index in range(len(rs_rows)):
            row = rs_rows[row_index]
            row_id = str(row[self.col_map[ROW_ID]])
            values = [None] * len(self.columns)
            row_annos = [False] * len(values)
            for i in range(len(self.columns)):
                col = self.columns[i]
                col_index = self.col_map[col.name_in_rdb]
                values[i] = row[col_index]
                has_anno = self.annotations.has_cell_annotation(
                    col.identifier, row_id
                )
                if not has_anno:
                    # Check if the cell taint is true
                    has_anno = not rs['col_taint'][row_index][col_index]
                row_annos[i] = has_anno
            self.rows.append(
                DatasetRow(row_id, values, annotations=row_annos)
            )
        self.rows.sort(key=lambda row: self.sortbyrowid(row.identifier))
        self.read_index = 0
        self.is_open = True
    return self
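# Hedged usage sketch for the Mimir-backed reader. The constructor arguments
# below are assumptions; the point is that open() runs the SELECT once,
# materializes and sorts the rows, and subsequent iteration is served from
# the in-memory copy.
reader = MimirDatasetReader(table_name, columns, annotations)  # args assumed
with reader.open() as r:
    for row in r:
        print(row.identifier, row.values)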
def update_cell(self, identifier, column, row, value):
    """Update a cell in a given dataset.

    Raises ValueError if no dataset with given identifier exists or if the
    specified cell is outside of the current dataset ranges.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier
    column : int
        Unique column identifier
    row : int
        Row index for updated cell (starting at 0)
    value : string
        New cell value

    Returns
    -------
    int, string
        Number of updated rows (i.e., 1) and identifier of resulting dataset
    """
    # Get dataset. Raise exception if dataset is unknown
    dataset = self.datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # Get column index first in case it raises an exception
    col_idx = get_index_for_column(dataset, column)
    # Make sure that row refers to a valid row in the dataset
    rows = dataset.fetch_rows()
    if row < 0 or row >= len(rows):
        raise ValueError(
            'invalid cell [' + str(column) + ', ' + str(row) + ']'
        )
    # Update the specified cell in the given data array
    r = rows[row]
    values = list(r.values)
    values[col_idx] = value
    rows[row] = DatasetRow(r.identifier, values)
    # Store updated dataset to get new identifier
    ds = self.datastore.create_dataset(
        columns=dataset.columns,
        rows=rows,
        column_counter=dataset.column_counter,
        row_counter=dataset.row_counter,
        annotations=dataset.annotations
    )
    return 1, ds.identifier
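# Example invocation of update_cell(): overwrite one cell and receive the
# update count (always 1) plus the identifier of the resulting dataset
# version (engine and ds_id are assumptions).
count, new_id = engine.update_cell(ds_id, column=1, row=0, value='new value')
assert count == 1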
def insert_row(self, identifier, position):
    """Insert row at given position in a dataset.

    Raises ValueError if no dataset with given identifier exists or if the
    specified row position is outside the dataset bounds.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier
    position : int
        Index position at which the row will be inserted

    Returns
    -------
    int, string
        Number of inserted rows (i.e., 1) and identifier of resulting dataset
    """
    # Get dataset. Raise exception if dataset is unknown
    dataset = self.datastore.get_dataset(identifier)
    if dataset is None:
        raise ValueError('unknown dataset \'' + identifier + '\'')
    # Make sure that position is a valid row index in the new dataset
    rows = dataset.fetch_rows()
    if position < 0 or position > len(rows):
        raise ValueError('invalid row index \'' + str(position) + '\'')
    # Create empty set of values
    row = DatasetRow(dataset.row_counter, [None] * len(dataset.columns))
    rows.insert(position, row)
    # Store updated dataset to get new identifier
    ds = self.datastore.create_dataset(
        columns=dataset.columns,
        rows=rows,
        column_counter=dataset.column_counter,
        row_counter=dataset.row_counter + 1,
        annotations=dataset.annotations
    )
    return 1, ds.identifier
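# Example for insert_row(): positions 0 through len(rows) are valid, so the
# call below prepends an all-None row, while passing len(rows) would append
# instead (engine and ds_id are assumptions).
count, new_id = engine.insert_row(ds_id, position=0)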