Example 1
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    from ray.data.extensions import TensorArray
    table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    return (table, BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build()))
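The snippet above is Ray's own ndarray-to-block conversion. As a minimal standalone sketch of the same idea, assuming a Ray installation where TensorArray supports the Arrow extension protocol (as the example above relies on); the array shape and column name are illustrative only:

import numpy as np
import pyarrow as pa
from ray.data.extensions import TensorArray

# A toy batch of four 2x2 "images"; the first axis indexes rows.
ndarray = np.arange(16, dtype=np.float32).reshape(4, 2, 2)

# TensorArray stores the whole batch as a single tensor-typed column.
table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
print(table.num_rows)  # 4, one row per outer-axis slice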
Example 2
def to_dataframe(pairs):
    from ray.data.extensions import TensorArray

    def to_numpy(d):
        return {k: v.detach().cpu().numpy() for (k, v) in d.items()}

    def box2dict(boxes):
        return {
            "x1": boxes[:, 0],
            "y1": boxes[:, 1],
            "x2": boxes[:, 2],
            "y2": boxes[:, 3],
        }

    dfs = []
    for (filename, d) in pairs:
        d2 = to_numpy(d)
        rdf = pd.DataFrame.from_dict(
            {
                "filename": filename,
                **box2dict(d2["boxes"]),
                "object_score": d2["scores"],
                "features": TensorArray(d2["features"]),
            }
        )
        dfs.append(rdf)

    return pd.concat(dfs, ignore_index=True)
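A hedged usage sketch for the function above, assuming each entry in pairs is a (filename, dict) tuple whose dict holds torch tensors under the keys "boxes", "scores", and "features"; the filenames, detection counts, and feature width below are made up:

import torch

# Hypothetical detector output for two images with 3 and 2 detections.
pairs = [
    ("img_0.jpg", {"boxes": torch.rand(3, 4),
                   "scores": torch.rand(3),
                   "features": torch.rand(3, 128)}),
    ("img_1.jpg", {"boxes": torch.rand(2, 4),
                   "scores": torch.rand(2),
                   "features": torch.rand(2, 128)}),
]

df = to_dataframe(pairs)
print(len(df))              # 5 rows, one per detection
print(df.columns.tolist())  # filename, x1, y1, x2, y2, object_score, features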
Example 3
def test_convert_image_df_to_tensor():
    images = np.zeros([4, 3, 32, 32])
    df = pd.DataFrame({"image": TensorArray(images)})

    actual_tensor = convert_pandas_to_tf_tensor(df)

    expected_tensor = tf.zeros([4, 3, 32, 32], dtype=images.dtype)
    tf.debugging.assert_equal(actual_tensor, expected_tensor)
Example 4
def infer_coarse_embedding(pdtab):
    # max_zoom_out = pdtab.groupby('file_path').zoom_level.max().rename('max_zoom_level')
    # wmax = pd.merge(pdtab, max_zoom_out, left_on='file_path', right_index=True)
    wmax = pdtab
    lev1 = wmax[wmax.zoom_level == wmax.max_zoom_level]
    ser = lev1.groupby("dbidx").vectors.mean().reset_index()
    res = ser["vectors"].values.to_numpy()
    normres = res / np.maximum(np.linalg.norm(res, axis=1, keepdims=True),
                               1e-6)
    return ser.assign(vectors=TensorArray(normres))
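Just the normalize-and-wrap step from the function above, as a self-contained sketch; the vectors and dbidx values are toy data:

import numpy as np
import pandas as pd
from ray.data.extensions import TensorArray

# Three un-normalized embedding vectors.
res = np.array([[3.0, 4.0], [0.0, 0.0], [1.0, 1.0]])

# Same guard as above: np.maximum(..., 1e-6) keeps all-zero rows from
# causing a division by zero.
normres = res / np.maximum(np.linalg.norm(res, axis=1, keepdims=True), 1e-6)

ser = pd.DataFrame({"dbidx": [0, 1, 2]})
print(ser.assign(vectors=TensorArray(normres)).vectors.iloc[0])  # [0.6 0.8]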
Example 5
    def extract_meta(self, ray_dataset, pyramid_factor, part_id):
        # dataset = Subset(dataset, indices=indices)
        # txds = TxDataset(dataset, tx=preprocess)

        meta_dict = self.meta_dict

        def fix_meta(ray_tup):
            fullpath, binary = ray_tup
            p = os.path.realpath(fullpath)
            file_path, dbidx = meta_dict[p]
            return {"file_path": file_path, "dbidx": dbidx, "binary": binary}

        def full_preproc(tup):
            ray_tup = fix_meta(tup)
            try:
                image = PIL.Image.open(io.BytesIO(ray_tup["binary"]))
            except PIL.UnidentifiedImageError:
                print(f'error parsing binary {ray_tup["file_path"]}')
                ## some images are corrupted / not copied properly
                ## it is easier to handle that softly
                image = None

            del ray_tup["binary"]
            if image is None:
                return []  # empty list ok?
            else:
                ray_tup["image"] = image
                return preprocess(ray_tup, factor=pyramid_factor)

        def preproc_batch(b):
            return [full_preproc(tup) for tup in b]

        dl = ray_dataset.window(blocks_per_window=20).map_batches(
            preproc_batch, batch_size=20)
        res = []
        for batch in dl.iter_rows():
            batch_res = self.bim(batch)
            res.extend(batch_res)
        # dl = DataLoader(txds, num_workers=1, shuffle=False,
        #                 batch_size=1, collate_fn=iden)
        # res = []
        # for batch in dl:
        #     flat_batch = sum(batch,[])
        #     batch_res = self.bim(flat_batch)
        #     res.append(batch_res)

        merged_res = pd.concat(res, ignore_index=True)
        ofile = f"{self.output_dir}/part_{part_id:04d}.parquet"

        ### TMP: parquet does not support half-precision floats, so cast to float32.
        x = merged_res
        x = x.assign(
            vectors=TensorArray(x["vectors"].to_numpy().astype("single")))
        x.to_parquet(ofile)
        return ofile
Example 6
    def prepare_read(self,
                     parallelism: int,
                     n: int,
                     block_format: str = "list",
                     tensor_shape: Tuple = (1, )) -> List[ReadTask]:
        read_tasks: List[ReadTask] = []
        block_size = max(1, n // parallelism)

        # Example of a read task. In a real datasource, this would pull data
        # from an external system instead of generating dummy data.
        def make_block(start: int, count: int) -> Block:
            if block_format == "arrow":
                return pyarrow.Table.from_arrays(
                    [np.arange(start, start + count)], names=["value"])
            elif block_format == "tensor":
                tensor = TensorArray(
                    np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                        np.arange(start, start + count),
                        tuple(range(1, 1 + len(tensor_shape)))))
                return pyarrow.Table.from_pydict({"value": tensor})
            else:
                return list(builtins.range(start, start + count))

        i = 0
        while i < n:
            count = min(block_size, n - i)
            if block_format == "arrow":
                _check_pyarrow_version()
                import pyarrow
                schema = pyarrow.Table.from_pydict({"value": [0]}).schema
            elif block_format == "tensor":
                _check_pyarrow_version()
                from ray.data.extensions import TensorArray
                import pyarrow
                tensor = TensorArray(
                    np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                        np.arange(0, 10), tuple(
                            range(1, 1 + len(tensor_shape)))))
                schema = pyarrow.Table.from_pydict({"value": tensor}).schema
            elif block_format == "list":
                schema = int
            else:
                raise ValueError("Unsupported block type", block_format)
            meta = BlockMetadata(
                num_rows=count,
                size_bytes=8 * count,
                schema=schema,
                input_files=None,
                exec_stats=None)
            read_tasks.append(
                ReadTask(
                    lambda i=i, count=count: [make_block(i, count)], meta))
            i += block_size

        return read_tasks
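The "tensor" branch above builds each block with a broadcasting trick that is easy to misread. A small worked sketch with concrete, made-up values for start, count, and tensor_shape:

import numpy as np
from ray.data.extensions import TensorArray

start, count = 5, 3
tensor_shape = (2, 2)

# np.arange yields one scalar per row; expand_dims appends one axis per
# tensor dimension so it broadcasts against np.ones(tensor_shape).
row_values = np.expand_dims(np.arange(start, start + count),
                            tuple(range(1, 1 + len(tensor_shape))))
print(row_values.shape)  # (3, 1, 1)

block = np.ones(tensor_shape, dtype=np.int64) * row_values
print(block.shape)       # (3, 2, 2): row i is a 2x2 tensor filled with 5 + i

column = TensorArray(block)
print(len(column))       # 3 rows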
Example 7
    def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
        from ray.data.extensions import TensorArray
        import pyarrow as pa
        # TODO(ekl) Ideally numpy can read directly from the file, but it
        # seems like it requires the file to be seekable.
        buf = BytesIO()
        data = f.readall()
        buf.write(data)
        buf.seek(0)
        return pa.Table.from_pydict(
            {"value": TensorArray(np.load(buf, allow_pickle=True))})
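This reader is the file-level half of a numpy datasource. A hedged end-to-end sketch, assuming it backs ray.data.read_numpy as in the Ray version the snippet comes from; the path and array shape are illustrative, and the tensor column name can differ across Ray versions:

import numpy as np
import ray

# Each outer-axis slice of the saved array becomes one row of the dataset.
np.save("/tmp/batch.npy", np.random.rand(8, 16, 16))

ds = ray.data.read_numpy("/tmp/batch.npy")
print(ds.count())  # 8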
Example 8
def make_block(start: int, count: int) -> Block:
    if block_format == "arrow":
        return pyarrow.Table.from_arrays(
            [np.arange(start, start + count)], names=["value"])
    elif block_format == "tensor":
        tensor = TensorArray(
            np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                np.arange(start, start + count),
                tuple(range(1, 1 + len(tensor_shape)))))
        return pyarrow.Table.from_pydict({"value": tensor})
    else:
        return list(builtins.range(start, start + count))
Example 9
def postprocess_results(acc):
    flat_acc = {
        "iis": [],
        "jjs": [],
        "dbidx": [],
        "vecs": [],
        "zoom_factor": [],
        "zoom_level": [],
        "file_path": [],
    }
    flat_vecs = []

    # each item: {'accs': ..., 'scale_factor': ..., 'dbidx': ..., 'zoom_level': ..., 'file_path': ...}
    for item in acc:
        acc0, sf, dbidx, zl, fp = itemgetter("accs", "scale_factor", "dbidx",
                                             "zoom_level", "file_path")(item)
        acc0 = acc0.squeeze(0)
        acc0 = acc0.transpose((1, 2, 0))

        iis, jjs = np.meshgrid(
            np.arange(acc0.shape[0], dtype=np.int16),
            np.arange(acc0.shape[1], dtype=np.int16),
            indexing="ij",
        )
        # iis = iis.reshape(-1, acc0)
        iis = iis.reshape(-1)
        jjs = jjs.reshape(-1)
        acc0 = acc0.reshape(-1, acc0.shape[-1])
        imids = np.ones_like(iis) * dbidx
        zf = np.ones_like(iis) * (1.0 / sf)
        zl = np.ones_like(iis) * zl

        flat_acc["iis"].append(iis)
        flat_acc["jjs"].append(jjs)
        flat_acc["dbidx"].append(imids)
        flat_acc["vecs"].append(acc0)
        flat_acc["zoom_factor"].append(zf.astype("float32"))
        flat_acc["zoom_level"].append(zl.astype("int16"))
        flat_acc["file_path"].append([fp] * iis.shape[0])

    flat = {}
    for k, v in flat_acc.items():
        flat[k] = np.concatenate(v)

    vecs = flat["vecs"]
    del flat["vecs"]

    vec_meta = pd.DataFrame(flat)
    # vecs = vecs.astype('float32')
    # vecs = vecs/(np.linalg.norm(vecs, axis=-1, keepdims=True) + 1e-6)
    vec_meta = vec_meta.assign(**get_boxes(vec_meta),
                               vectors=TensorArray(vecs))
    return vec_meta.drop(["iis", "jjs"], axis=1)
Example 10
    def _transform_pandas(self, df: pd.DataFrame):
        self._validate(df)

        included_columns = set(df)
        if self.included_columns:  # restrict to the user-specified columns
            included_columns = set(self.included_columns)

        columns_to_concat = list(included_columns - set(self.excluded_columns))
        concatenated = df[columns_to_concat].to_numpy(dtype=self.dtype)
        df = df.drop(columns=columns_to_concat)
        df[self.output_column_name] = TensorArray(concatenated)
        return df
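The same concatenate-into-one-tensor-column step as above, shown standalone with a made-up DataFrame and output column name:

import numpy as np
import pandas as pd
from ray.data.extensions import TensorArray

df = pd.DataFrame({"f1": [1.0, 2.0], "f2": [3.0, 4.0], "label": [0, 1]})

# Pack the selected feature columns into one fixed-width tensor column and
# drop the originals, mirroring _transform_pandas above.
feature_cols = ["f1", "f2"]
concatenated = df[feature_cols].to_numpy(dtype=np.float32)
df = df.drop(columns=feature_cols)
df["concat_out"] = TensorArray(concatenated)
print(df["concat_out"].iloc[0])  # [1. 3.]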
Example 11
    def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
        import imageio as iio
        import pandas as pd
        from ray.data.extensions import TensorArray

        records = super()._read_file(f, path, include_paths=True)
        assert len(records) == 1
        path, data = records[0]

        image = iio.imread(data)
        label = _get_class_from_path(path, self.root)

        return pd.DataFrame({
            "image": TensorArray([np.array(image)]),
            "label": [label],
        })
Example 12
def join_vecs2annotations(db: MultiscaleIndex, dbidx, annotations):
    patch_box_df = db.get_data(dbidx)
    roi_box_df = pd.DataFrame.from_records([b.dict() for b in annotations])

    dfvec = add_iou_score(patch_box_df, roi_box_df)
    dfvec = dfvec.assign(descriptions=dfvec.best_box_idx.map(
        lambda idx: annotations[idx].description))

    dfbox = add_iou_score(roi_box_df, patch_box_df)

    matched_vecs = np.stack(
        [dfvec.vectors.iloc[i].copy() for i in dfbox.best_box_idx.values])
    dfbox = dfbox.assign(descriptions=dfbox.description,
                         vectors=TensorArray(matched_vecs))

    return dfvec, dfbox
Example 13
    def get_data(self, dbidx) -> pd.DataFrame:
        vmeta = self.vector_meta[self.vector_meta.dbidx == dbidx]
        vectors = self.vectors[vmeta.index]

        return vmeta.assign(vectors=TensorArray(vectors))
Example 14
    def convert_batch_to_pandas(batch):
        images = [TensorArray(image) for image, _ in batch]
        # The label column repeats the image because this feeds an autoencoder.
        df = pd.DataFrame({"image": images, "label": images})
        return df
Example 15
def cast_as_tensor_dtype(series: Series) -> Series:
    return TensorArray(series)