def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    from ray.data.extensions import TensorArray

    table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    return (table,
            BlockAccessor.for_block(table).get_metadata(
                input_files=None, exec_stats=stats.build()))
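# A minimal standalone sketch (not part of the function above) of the same
# conversion it performs: a numpy ndarray becomes a single-column Arrow table
# through TensorArray, before the block metadata gets attached.
import numpy as np
import pyarrow as pa
from ray.data.extensions import TensorArray

arr = np.zeros((4, 2, 2), dtype=np.float32)
table = pa.Table.from_pydict({"value": TensorArray(arr)})
print(table.schema)  # one "value" column backed by the tensor extension type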
def to_dataframe(pairs):
    from ray.data.extensions import TensorArray

    def to_numpy(d):
        return {k: v.detach().cpu().numpy() for (k, v) in d.items()}

    def box2dict(boxes):
        return {
            "x1": boxes[:, 0],
            "y1": boxes[:, 1],
            "x2": boxes[:, 2],
            "y2": boxes[:, 3],
        }

    dfs = []
    for (filename, d) in pairs:
        d2 = to_numpy(d)
        rdf = pd.DataFrame.from_dict(
            {
                "filename": filename,
                **box2dict(d2["boxes"]),
                "object_score": d2["scores"],
                "features": TensorArray(d2["features"]),
            }
        )
        dfs.append(rdf)

    return pd.concat(dfs, ignore_index=True)
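# Hedged usage sketch for to_dataframe: `pairs` is assumed to be a list of
# (filename, detections) tuples whose values are torch tensors, since the
# helper calls .detach().cpu().numpy() on them. Shapes below are illustrative.
import torch

pairs = [(
    "img_0.jpg",
    {
        "boxes": torch.tensor([[0.0, 0.0, 10.0, 10.0]]),
        "scores": torch.tensor([0.9]),
        "features": torch.zeros(1, 4),
    },
)]
df = to_dataframe(pairs)
print(df.columns.tolist())
# ['filename', 'x1', 'y1', 'x2', 'y2', 'object_score', 'features']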
def test_convert_image_df_to_tensor():
    images = np.zeros([4, 3, 32, 32])
    df = pd.DataFrame({"image": TensorArray(images)})
    actual_tensor = convert_pandas_to_tf_tensor(df)

    expected_tensor = tf.zeros([4, 3, 32, 32], dtype=images.dtype)
    tf.debugging.assert_equal(actual_tensor, expected_tensor)
def infer_coarse_embedding(pdtab):
    # max_zoom_out = pdtab.groupby('file_path').zoom_level.max().rename('max_zoom_level')
    # wmax = pd.merge(pdtab, max_zoom_out, left_on='file_path', right_index=True)
    wmax = pdtab
    lev1 = wmax[wmax.zoom_level == wmax.max_zoom_level]
    ser = lev1.groupby("dbidx").vectors.mean().reset_index()
    res = ser["vectors"].values.to_numpy()
    normres = res / np.maximum(np.linalg.norm(res, axis=1, keepdims=True), 1e-6)
    return ser.assign(vectors=TensorArray(normres))
def extract_meta(self, ray_dataset, pyramid_factor, part_id):
    # dataset = Subset(dataset, indices=indices)
    # txds = TxDataset(dataset, tx=preprocess)
    meta_dict = self.meta_dict

    def fix_meta(ray_tup):
        fullpath, binary = ray_tup
        p = os.path.realpath(fullpath)
        file_path, dbidx = meta_dict[p]
        return {"file_path": file_path, "dbidx": dbidx, "binary": binary}

    def full_preproc(tup):
        ray_tup = fix_meta(tup)
        try:
            image = PIL.Image.open(io.BytesIO(ray_tup["binary"]))
        except PIL.UnidentifiedImageError:
            print(f'error parsing binary {ray_tup["file_path"]}')
            ## some images are corrupted / not copied properly
            ## it is easier to handle that softly
            image = None

        del ray_tup["binary"]
        if image is None:
            return []  # empty list ok?
        else:
            ray_tup["image"] = image
            return preprocess(ray_tup, factor=pyramid_factor)

    def preproc_batch(b):
        return [full_preproc(tup) for tup in b]

    dl = ray_dataset.window(blocks_per_window=20).map_batches(
        preproc_batch, batch_size=20)

    res = []
    for batch in dl.iter_rows():
        batch_res = self.bim(batch)
        res.extend(batch_res)

    # dl = DataLoader(txds, num_workers=1, shuffle=False,
    #                 batch_size=1, collate_fn=iden)
    # res = []
    # for batch in dl:
    #     flat_batch = sum(batch, [])
    #     batch_res = self.bim(flat_batch)
    #     res.append(batch_res)

    merged_res = pd.concat(res, ignore_index=True)
    ofile = f"{self.output_dir}/part_{part_id:04d}.parquet"

    ### TMP: parquet does not allow half prec.
    x = merged_res
    x = x.assign(
        vectors=TensorArray(x["vectors"].to_numpy().astype("single")))
    x.to_parquet(ofile)
    return ofile
def prepare_read(self,
                 parallelism: int,
                 n: int,
                 block_format: str = "list",
                 tensor_shape: Tuple = (1, )) -> List[ReadTask]:
    read_tasks: List[ReadTask] = []
    block_size = max(1, n // parallelism)

    # Example of a read task. In a real datasource, this would pull data
    # from an external system instead of generating dummy data.
    def make_block(start: int, count: int) -> Block:
        if block_format == "arrow":
            return pyarrow.Table.from_arrays(
                [np.arange(start, start + count)], names=["value"])
        elif block_format == "tensor":
            tensor = TensorArray(
                np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                    np.arange(start, start + count),
                    tuple(range(1, 1 + len(tensor_shape)))))
            return pyarrow.Table.from_pydict({"value": tensor})
        else:
            return list(builtins.range(start, start + count))

    i = 0
    while i < n:
        count = min(block_size, n - i)
        if block_format == "arrow":
            _check_pyarrow_version()
            import pyarrow
            schema = pyarrow.Table.from_pydict({"value": [0]}).schema
        elif block_format == "tensor":
            _check_pyarrow_version()
            from ray.data.extensions import TensorArray
            import pyarrow
            tensor = TensorArray(
                np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                    np.arange(0, 10),
                    tuple(range(1, 1 + len(tensor_shape)))))
            schema = pyarrow.Table.from_pydict({"value": tensor}).schema
        elif block_format == "list":
            schema = int
        else:
            raise ValueError("Unsupported block type", block_format)
        meta = BlockMetadata(
            num_rows=count,
            size_bytes=8 * count,
            schema=schema,
            input_files=None,
            exec_stats=None)
        read_tasks.append(
            ReadTask(
                lambda i=i, count=count: [make_block(i, count)], meta))
        i += block_size

    return read_tasks
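# Sketch of how a range-style datasource like the one above is typically
# consumed. The exact entry point is an assumption here; in Ray,
# ray.data.range_tensor produces a tensor-column dataset over an integer range.
import ray

ds = ray.data.range_tensor(8, shape=(2, 2))
print(ds.take(2))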
def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
    from ray.data.extensions import TensorArray
    import pyarrow as pa
    # TODO(ekl) Ideally numpy can read directly from the file, but it
    # seems like it requires the file to be seekable.
    buf = BytesIO()
    data = f.readall()
    buf.write(data)
    buf.seek(0)
    return pa.Table.from_pydict(
        {"value": TensorArray(np.load(buf, allow_pickle=True))})
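# Hedged usage sketch: a reader like the one above is what ray.data.read_numpy
# exercises, yielding a "value" tensor column per .npy file. The path below is
# illustrative only.
import numpy as np
import ray

np.save("/tmp/example.npy", np.arange(12).reshape(3, 2, 2))
ds = ray.data.read_numpy("/tmp/example.npy")
print(ds.schema())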
def postprocess_results(acc):
    flat_acc = {
        "iis": [],
        "jjs": [],
        "dbidx": [],
        "vecs": [],
        "zoom_factor": [],
        "zoom_level": [],
        "file_path": [],
    }
    flat_vecs = []

    # {'accs':accs, 'sf':sf, 'dbidx':dbidx, 'zoom_level':zoom_level}
    for item in acc:
        acc0, sf, dbidx, zl, fp = itemgetter(
            "accs", "scale_factor", "dbidx", "zoom_level", "file_path")(item)
        acc0 = acc0.squeeze(0)
        acc0 = acc0.transpose((1, 2, 0))

        iis, jjs = np.meshgrid(
            np.arange(acc0.shape[0], dtype=np.int16),
            np.arange(acc0.shape[1], dtype=np.int16),
            indexing="ij",
        )
        # iis = iis.reshape(-1, acc0)
        iis = iis.reshape(-1)
        jjs = jjs.reshape(-1)
        acc0 = acc0.reshape(-1, acc0.shape[-1])
        imids = np.ones_like(iis) * dbidx
        zf = np.ones_like(iis) * (1.0 / sf)
        zl = np.ones_like(iis) * zl

        flat_acc["iis"].append(iis)
        flat_acc["jjs"].append(jjs)
        flat_acc["dbidx"].append(imids)
        flat_acc["vecs"].append(acc0)
        flat_acc["zoom_factor"].append(zf.astype("float32"))
        flat_acc["zoom_level"].append(zl.astype("int16"))
        flat_acc["file_path"].append([fp] * iis.shape[0])

    flat = {}
    for k, v in flat_acc.items():
        flat[k] = np.concatenate(v)

    vecs = flat["vecs"]
    del flat["vecs"]

    vec_meta = pd.DataFrame(flat)
    # vecs = vecs.astype('float32')
    # vecs = vecs/(np.linalg.norm(vecs, axis=-1, keepdims=True) + 1e-6)
    vec_meta = vec_meta.assign(**get_boxes(vec_meta), vectors=TensorArray(vecs))
    return vec_meta.drop(["iis", "jjs"], axis=1)
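# Small sketch of the meshgrid/reshape pattern used above: with indexing="ij"
# it enumerates the (row, col) coordinate of every cell in the activation map,
# which is then flattened alongside the corresponding feature vectors.
import numpy as np

iis, jjs = np.meshgrid(np.arange(2), np.arange(3), indexing="ij")
print(list(zip(iis.reshape(-1).tolist(), jjs.reshape(-1).tolist())))
# [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]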
def _transform_pandas(self, df: pd.DataFrame):
    self._validate(df)

    included_columns = set(df)
    if self.included_columns:  # subset of included columns
        included_columns = set(self.included_columns)

    columns_to_concat = list(included_columns - set(self.excluded_columns))
    concatenated = df[columns_to_concat].to_numpy(dtype=self.dtype)
    df = df.drop(columns=columns_to_concat)
    df[self.output_column_name] = TensorArray(concatenated)
    return df
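# Standalone sketch of the concatenation the transform above performs: selected
# numeric columns are packed into one tensor column. The column names and the
# "concat_out" output name are illustrative assumptions.
import numpy as np
import pandas as pd
from ray.data.extensions import TensorArray

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0], "id": ["x", "y"]})
cols = ["a", "b"]
out = df.drop(columns=cols)
out["concat_out"] = TensorArray(df[cols].to_numpy(dtype=np.float32))
print(out)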
def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
    import imageio as iio
    import pandas as pd
    from ray.data.extensions import TensorArray

    records = super()._read_file(f, path, include_paths=True)
    assert len(records) == 1
    path, data = records[0]

    image = iio.imread(data)
    label = _get_class_from_path(path, self.root)

    return pd.DataFrame({
        "image": TensorArray([np.array(image)]),
        "label": [label],
    })
def join_vecs2annotations(db: MultiscaleIndex, dbidx, annotations):
    patch_box_df = db.get_data(dbidx)
    roi_box_df = pd.DataFrame.from_records([b.dict() for b in annotations])

    dfvec = add_iou_score(patch_box_df, roi_box_df)
    dfvec = dfvec.assign(
        descriptions=dfvec.best_box_idx.map(
            lambda idx: annotations[idx].description))

    dfbox = add_iou_score(roi_box_df, patch_box_df)
    matched_vecs = np.stack(
        [dfvec.vectors.iloc[i].copy() for i in dfbox.best_box_idx.values])
    dfbox = dfbox.assign(
        descriptions=dfbox.description, vectors=TensorArray(matched_vecs))

    return dfvec, dfbox
def get_data(self, dbidx) -> pd.DataFrame:
    vmeta = self.vector_meta[self.vector_meta.dbidx == dbidx]
    vectors = self.vectors[vmeta.index]
    return vmeta.assign(vectors=TensorArray(vectors))
def convert_batch_to_pandas(batch):
    images = [TensorArray(image) for image, _ in batch]
    # the labels are the images themselves because this is an autoencoder
    df = pd.DataFrame({"image": images, "label": images})
    return df
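# Hedged usage sketch with a synthetic torchvision-style batch of
# (image, label) pairs; shapes and values below are illustrative only.
import numpy as np

batch = [(np.zeros((1, 28, 28), dtype=np.float32), 0) for _ in range(4)]
df = convert_batch_to_pandas(batch)
print(len(df), list(df.columns))  # 4 ['image', 'label']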
def cast_as_tensor_dtype(series: Series) -> Series:
    return TensorArray(series)