Example #1
0
    def test_any(self):
        """Check ``any`` on the boolean result of comparing a tensor Series."""
        tensor = TensorArray(np.arange(6).reshape(3, 2))
        series = pd.Series(tensor)

        # Element-wise "divisible by 3" mask over the tensor Series
        divisible = (series % 3) == 0
        last_row = divisible[2]
        npt.assert_array_equal(last_row, np.array([False, False]))

        # Aggregating to a single TensorElement, defaults to axis=0
        reduced = divisible.any()
        npt.assert_array_equal(reduced, np.array([True, True]))
Example #2
0
 def test_sort(self):
     """Sort a DataFrame by time, descending, and check the tensor column."""
     tensor = TensorArray(np.arange(6).reshape(3, 2))
     timestamps = pd.date_range('2018-01-01', periods=3, freq='H')
     frame = pd.DataFrame({"time": timestamps, "tensor": tensor})
     frame = frame.sort_values(by="time", ascending=False)

     # The element dtype must be the same after sorting
     self.assertEqual(frame["tensor"].array.numpy_dtype, tensor.numpy_dtype)

     # Descending time order puts the rows in reverse
     reversed_rows = np.array([[4, 5], [2, 3], [0, 1]])
     npt.assert_array_equal(frame["tensor"].array, reversed_rows)
Example #3
0
 def test_concat(self):
     """Concatenate two TensorArrays and compare the ``str`` of the result."""
     first = TensorArray(np.arange(6).reshape((3, 2)))
     second = TensorArray(np.arange(6, 12).reshape((3, 2)))
     combined = TensorArray._concat_same_type((first, second))

     # Rows of the first array followed by rows of the second
     expected_text = textwrap.dedent("""\
             [[ 0  1]
              [ 2  3]
              [ 4  5]
              [ 6  7]
              [ 8  9]
              [10 11]]""")
     self.assertEqual(str(combined), expected_text)
Example #4
0
    def test_repr(self):
        """Check ``repr`` of a TensorArray and of a Series wrapping it."""
        tensor = TensorArray(np.array([[1, 2], [3, 4], [5, 6]]))

        # repr of the array itself
        array_text = textwrap.dedent("""\
        array([[1, 2],
               [3, 4],
               [5, 6]])""")
        self.assertEqual(array_text, tensor.__repr__())

        # repr of a Series holding the array
        series_text = textwrap.dedent("""\
            0    [1, 2]
            1    [3, 4]
            2    [5, 6]
            dtype: TensorDtype""")
        self.assertEqual(series_text, repr(pd.Series(tensor)))
    def test_take(self):
        """Exercise ``take`` with and without ``allow_fill``."""
        tensor = TensorArray(np.array([[1, 2], [3, 4], [5, 6]]))

        # Nothing missing: the dtype stays the same
        wanted = np.array([[1, 2], [5, 6]])
        taken = tensor.take([0, 2], allow_fill=True)
        self.assertEqual(taken.numpy_dtype, wanted.dtype)
        npt.assert_array_equal(taken, wanted)
        taken = tensor.take([0, 2], allow_fill=False)
        npt.assert_array_equal(taken, wanted)

        # A missing row with nan fill gets promoted to float and filled
        wanted = np.array([[3, 4], [np.nan, np.nan]])
        taken = tensor.take([1, -1], allow_fill=True)
        self.assertEqual(taken.numpy_dtype, wanted.dtype)
        npt.assert_array_equal(taken, wanted)
        npt.assert_array_equal(taken.isna(), [False, True])
Example #6
0
    def test_sum(self):
        """
        Check ``groupby("key").aggregate({"value": "sum"})`` on a tensor
        column, first with 1-D tensor elements and then with 2-D ones.

        Each section verifies the dtype and contents of the aggregated array
        and the ``repr`` of the resulting DataFrame.
        """
        keys = ["a", "a", "b", "c", "c", "c"]
        values = np.array([[1, 1]] * len(keys))
        df = pd.DataFrame({"key": keys, "value": TensorArray(values)})
        result_df = df.groupby("key").aggregate({"value": "sum"})

        # Check array gets unwrapped from TensorElements
        arr = result_df["value"].array
        self.assertEqual(arr.numpy_dtype, values.dtype)
        npt.assert_array_equal(arr.to_numpy(), [[2, 2], [1, 1], [3, 3]])

        # Check the resulting DataFrame
        self.assertEqual(
            repr(result_df),
            textwrap.dedent("""\
                      value
                key        
                a    [2, 2]
                b    [1, 1]
                c    [3, 3]"""),
        )

        # 2D values
        values2 = np.array([[[1, 1], [1, 1]]] * len(keys))
        df2 = pd.DataFrame({"key": keys, "value": TensorArray(values2)})
        result2_df = df2.groupby("key").aggregate({"value": "sum"})

        # Check array gets unwrapped from TensorElements.
        # The dtype is compared against values2, the input of this section,
        # rather than the 1-D values used above.
        arr2 = result2_df["value"].array
        self.assertEqual(arr2.numpy_dtype, values2.dtype)
        npt.assert_array_equal(
            arr2.to_numpy(),
            [[[2, 2], [2, 2]], [[1, 1], [1, 1]], [[3, 3], [3, 3]]])

        # Check the resulting DataFrame
        self.assertEqual(
            repr(result2_df),
            textwrap.dedent("""\
                                value
                key                  
                a    [[2, 2], [2, 2]]
                b    [[1, 1], [1, 1]]
                c    [[3, 3], [3, 3]]"""),
        )
    def test_sum(self):
        """Sum a tensor column in full and through a boolean row mask."""
        tensor = TensorArray(np.array([[1, 2], [3, 4], [5, 6]]))
        frame = pd.DataFrame({"s": tensor})

        # All three rows
        total = frame["s"].sum()
        npt.assert_array_equal(total.to_numpy(), [9, 12])

        # First and last rows only
        keep = [True, False, True]
        partial = frame["s"][keep].sum()
        npt.assert_array_equal(partial.to_numpy(), [6, 8])
Example #8
0
    def _embedding_to_int(df: pd.DataFrame, colname: str):
        """
        Replace an embeddings column with a scaled integer version, so that
        test results will be more stable.

        :param df: DataFrame holding the embeddings. MODIFIED IN PLACE.
        :param colname: Name of the column in which the embeddings live
        """
        # Scale by ten, then cast to int
        scaled = df[colname].values.to_numpy() * 10.
        df[colname] = TensorArray(scaled.astype(int))
 def test_create_series(self):
     """Build Series from a TensorArray with and without an explicit dtype."""
     data = np.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]] * 100)
     tensor = TensorArray(data)

     plain = pd.Series(tensor)
     typed = pd.Series(tensor, dtype=TensorDtype())
     copied = pd.Series(tensor, dtype=TensorDtype(), copy=True)

     self.assertEqual(len(data), len(plain))
     npt.assert_array_equal(data, plain.to_numpy())
     # Each construction variant must equal the plain one
     for variant in (typed, copied):
         pdt.assert_series_equal(plain, variant)
    def test_parquet(self):
        """Round-trip a DataFrame with a tensor column through Parquet."""
        data = np.arange(10).reshape(5, 2)
        tensor = TensorArray(data)
        frame = pd.DataFrame({"i": list(range(len(data))), "tensor": tensor})

        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "tensor_array_test.parquet")
            frame.to_parquet(path)
            reloaded = pd.read_parquet(path)
            # What was read back must equal what was written
            pd.testing.assert_frame_equal(frame, reloaded)
Example #11
0
    def test_large_display_numeric(self):
        """Check the truncated ``repr`` of 100-row numeric tensor DataFrames."""

        # Integer elements, uses IntArrayFormatter
        int_frame = pd.DataFrame(
            {"foo": TensorArray(np.array([[1, 2]] * 100))})
        int_text = textwrap.dedent("""\
                       foo
                0   [1, 2]
                1   [1, 2]
                2   [1, 2]
                3   [1, 2]
                4   [1, 2]
                ..     ...
                95  [1, 2]
                96  [1, 2]
                97  [1, 2]
                98  [1, 2]
                99  [1, 2]
                
                [100 rows x 1 columns]""")
        self.assertEqual(repr(int_frame), int_text)

        # Float elements, uses FloatArrayFormatter
        float_frame = pd.DataFrame(
            {"foo": TensorArray(np.array([[1.1, 2.2]] * 100))})
        float_text = textwrap.dedent("""\
                           foo
                0   [1.1, 2.2]
                1   [1.1, 2.2]
                2   [1.1, 2.2]
                3   [1.1, 2.2]
                4   [1.1, 2.2]
                ..         ...
                95  [1.1, 2.2]
                96  [1.1, 2.2]
                97  [1.1, 2.2]
                98  [1.1, 2.2]
                99  [1.1, 2.2]
                
                [100 rows x 1 columns]""")
        self.assertEqual(repr(float_frame), float_text)
Example #12
0
    def test_int_tensor_selection(self):
        """Select rows by integer positions supplied in several forms."""
        tensor = TensorArray([[1, 2], [3, 4], [5, 6]])
        positions = TensorArray([0, 2])
        wanted = np.array([[1, 2], [5, 6]])

        # TensorArray.__getitem__ given a TensorArray
        npt.assert_array_equal(tensor[positions], wanted)

        series = pd.Series(tensor)

        # Series of TensorDtype indexed with a numpy array
        npt.assert_array_equal(series[np.asarray(positions)], wanted)

        # Series of TensorDtype indexed with a TensorArray
        npt.assert_array_equal(series[positions], wanted)

        # Series of TensorDtype indexed by integer location
        npt.assert_array_equal(series.iloc[positions], wanted)
    def test_slice(self):
        """
        Check the result types and values of scalar and slice indexing on a
        TensorArray: an integer index gives a TensorElement, a slice gives a
        TensorArray.
        """
        x = np.array([[1, 2], [3, 4], [5, 6]])
        s = TensorArray(x)

        # Single integer index
        result = s[1]
        # assertIsInstance reports the actual type on failure
        self.assertIsInstance(result, TensorElement)
        expected = np.array([3, 4])
        npt.assert_array_equal(expected, result)

        # Slice index
        result = s[1:3]
        self.assertIsInstance(result, TensorArray)
        expected = np.array([[3, 4], [5, 6]])
        npt.assert_array_equal(expected, result.to_numpy())
Example #14
0
    def test_bool_tensor_selection(self):
        """Select rows with a boolean mask supplied in several forms."""
        tensor = TensorArray([[1, 2], [3, 4], [5, 6]])
        mask = TensorArray([True, False, True])
        wanted = np.array([[1, 2], [5, 6]])

        # TensorArray.__getitem__ given a TensorArray
        npt.assert_array_equal(tensor[mask], wanted)

        # Series of TensorDtype indexed with a numpy array
        series = pd.Series(tensor)
        npt.assert_array_equal(series[np.asarray(mask)], wanted)

        # Series of TensorDtype indexed with a TensorArray
        # Currently fails due to Pandas not recognizing as bool index GH#162
        pandas_rejects_mask = (
            LooseVersion(pd.__version__) >= LooseVersion("1.1.0"))
        if not pandas_rejects_mask:
            npt.assert_array_equal(series[mask], wanted)
        else:
            with self.assertRaises(Exception):
                _ = series[mask]
    def test_create(self):
        """Construct TensorArrays from several inputs and check their length."""
        # A single 3-D ndarray
        from_ndarray = TensorArray(np.ones([5, 2, 3]))
        self.assertEqual(len(from_ndarray), 5)

        # A list of equally shaped ndarrays
        from_list = TensorArray([np.ones([2, 3])] * 5)
        self.assertEqual(len(from_list), 5)

        # Empty inputs
        from_empty_ndarray = TensorArray(np.empty((0, 2)))
        self.assertEqual(len(from_empty_ndarray), 0)
        s = TensorArray([])
        self.assertEqual(len(s), 0)

        # Elements with differing shapes are rejected
        mismatched = [np.ones([2, 3]), np.ones([3, 2])]
        with self.assertRaises(ValueError):
            TensorArray(mismatched)

        # Copy of the empty array has the same length
        duplicate = s.copy()
        self.assertEqual(len(s), len(duplicate))
Example #16
0
 def test_series_to_str(self):
     """Check ``Series.to_string(max_rows=4)`` for a 10-row tensor Series."""
     tensor = TensorArray(np.arange(50).reshape((10, 5)))
     rendered = pd.Series(tensor).to_string(max_rows=4)

     # First two and last two rows around an ellipsis line
     expected_text = textwrap.dedent("""\
             0    [ 0,  1,  2,  3,  4]
             1    [ 5,  6,  7,  8,  9]
                          ...         
             8    [40, 41, 42, 43, 44]
             9    [45, 46, 47, 48, 49]""")
     self.assertEqual(rendered, expected_text)
    def test_isna(self):
        """Check ``isna`` on numeric, object and string TensorArrays."""
        # Only the second row counts as missing in every case below
        expected = np.array([False, True, False, False])

        # Numeric values
        numeric = np.array([[1, 2], [np.nan, np.nan], [3, np.nan], [5, 6]])
        npt.assert_equal(TensorArray(numeric).isna(), expected)

        # Object values
        d = {"a": 1}
        objects = np.array([[d, d], None, [d, None], [d, d]], dtype=object)
        npt.assert_equal(TensorArray(objects).isna(), expected)

        # String values
        strings = np.array(
            [["foo", "foo"], ["", ""], ["bar", ""], ["baz", "baz"]])
        npt.assert_equal(TensorArray(strings).isna(), expected)
 def test_series_to_str(self):
     """Check ``Series.to_string(max_rows=4)`` for a 10-row tensor Series."""
     tensor = TensorArray(np.arange(50).reshape((10, 5)))
     rendered = pd.Series(tensor).to_string(max_rows=4)

     # First two and last two rows around an ellipsis line
     expected_text = textwrap.dedent(
         """\
             0        [0 1 2 3 4]
             1        [5 6 7 8 9]
                       ...       
             8   [40 41 42 43 44]
             9   [45 46 47 48 49]"""
     )
     self.assertEqual(rendered, expected_text)
    def test_feather_auto_chunked(self):
        """Write a table to Feather with a chunk size and read it back."""
        from pyarrow.feather import read_table, write_feather

        tensor = TensorArray(np.arange(2048).reshape(1024, 2))
        frame = pd.DataFrame(
            {"i": list(range(len(tensor))), "tensor": tensor})
        arrow_table = pa.Table.from_pandas(frame)

        # Write the table to feather, then read it back as a DataFrame
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "tensor_array_chunked_test.feather")
            write_feather(arrow_table, path, chunksize=512)

            # The stored tensor column must consist of at least two chunks
            stored = read_table(path)
            self.assertGreaterEqual(stored.column("tensor").num_chunks, 2)

            reloaded = pd.read_feather(path)
            pd.testing.assert_frame_equal(frame, reloaded)
def arrow_to_tensor_array(extension_array: pa.ExtensionArray) -> TensorArray:
    """
    Build a TensorArray from a pyarrow.ExtensionArray whose type is
    ArrowTensorType. A pyarrow.ChunkedArray is also handled.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTensorType
    :return: TensorArray
    """
    # Plain (non-chunked) array: convert directly
    if not isinstance(extension_array, pa.ChunkedArray):
        return TensorArray(extension_array.to_numpy())

    # Chunked array with at most one chunk: convert the first chunk
    if extension_array.num_chunks <= 1:
        return TensorArray(extension_array.chunk(0).to_numpy())

    # Several chunks: convert each one and join them
    # TODO: look into removing concat and constructing from list w/ shape
    pieces = [piece.to_numpy() for piece in extension_array.iterchunks()]
    return TensorArray(np.concatenate(pieces))
    def test_bool_indexing_series(self):
        """
        Index a two-row Series of tensors with every combination of a
        two-element boolean list and check the type, length and contents of
        the result.
        """
        s = pd.Series(TensorArray([[1, 2], [3, 4]]))

        # Nothing selected
        result = s[[False, False]]
        # assertIsInstance reports the actual type on failure
        self.assertIsInstance(result, pd.Series)
        self.assertEqual(len(result), 0)

        # Everything selected
        result = s[[True, True]]
        self.assertIsInstance(result, pd.Series)
        pd.testing.assert_series_equal(result, s)

        # First row only
        result = s[[True, False]]
        self.assertIsInstance(result, pd.Series)
        self.assertEqual(len(result), 1)
        expected = s.iloc[[0]]
        pd.testing.assert_series_equal(result, expected)

        # Second row only
        result = s[[False, True]]
        self.assertIsInstance(result, pd.Series)
        self.assertEqual(len(result), 1)
        expected = s.iloc[[1]]
        pd.testing.assert_series_equal(result, expected)
Example #22
0
    def test_bool_indexing_dataframe(self):
        """
        Index a two-row DataFrame with a tensor column using every
        combination of a two-element boolean list and check the type, length
        and contents of the result.
        """
        s = TensorArray([[1, 2], [3, 4]])
        df = pd.DataFrame({"col1": s})

        # Nothing selected
        result = df[[False, False]]
        # assertIsInstance reports the actual type on failure
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(len(result), 0)

        # Everything selected
        result = df[[True, True]]
        self.assertIsInstance(result, pd.DataFrame)
        pd.testing.assert_frame_equal(result, df)

        # First row only
        result = df[[True, False]]
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(len(result), 1)
        expected = df.iloc[[0]]
        pd.testing.assert_frame_equal(result, expected)

        # Second row only
        result = df[[False, True]]
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(len(result), 1)
        expected = df.iloc[[1]]
        pd.testing.assert_frame_equal(result, expected)
    def test_feather_chunked(self):
        """Write a two-chunk Arrow table to Feather and read it back."""
        from pyarrow.feather import write_feather

        tensor = TensorArray(np.arange(10).reshape(5, 2))
        first_frame = pd.DataFrame(
            {"i": list(range(len(tensor))), "tensor": tensor})

        # The second frame is a copy whose tensor column is scaled by 10
        second_frame = first_frame.copy()
        second_frame["tensor"] = second_frame["tensor"] * 10

        # Concatenating the two tables gives a tensor column with 2 chunks
        first_table = pa.Table.from_pandas(first_frame)
        second_table = pa.Table.from_pandas(second_frame)
        combined = pa.concat_tables([first_table, second_table])
        self.assertEqual(combined.column("tensor").num_chunks, 2)

        # Write the table to feather, then read it back as a DataFrame
        with tempfile.TemporaryDirectory() as tmp_dir:
            path = os.path.join(tmp_dir, "tensor_array_chunked_test.feather")
            write_feather(combined, path)
            reloaded = pd.read_feather(path)
            stacked = pd.concat([first_frame, second_frame])
            wanted = stacked.reset_index(drop=True)
            pd.testing.assert_frame_equal(wanted, reloaded)
    def test_bool_indexing(self):
        """
        Index a two-row TensorArray with every combination of a two-element
        boolean list and check that the result is a TensorArray holding the
        selected rows.
        """
        s = TensorArray([[1, 2], [3, 4]])

        # Everything selected
        result = s[[True, True]]
        # assertIsInstance reports the actual type on failure
        self.assertIsInstance(result, TensorArray)
        expected = np.array([[1, 2], [3, 4]])
        npt.assert_array_equal(result.to_numpy(), expected)

        # First row only
        result = s[[True, False]]
        self.assertIsInstance(result, TensorArray)
        expected = np.array([[1, 2]])
        npt.assert_array_equal(result.to_numpy(), expected)

        # Second row only
        result = s[[False, True]]
        self.assertIsInstance(result, TensorArray)
        expected = np.array([[3, 4]])
        npt.assert_array_equal(result.to_numpy(), expected)

        # Nothing selected: compared against an empty (0, 2) array
        result = s[[False, False]]
        self.assertIsInstance(result, TensorArray)
        expected = np.empty((0, 2))
        npt.assert_array_equal(result.to_numpy(), expected)
    def test_create_from_scalar_list(self):
        """Create tensor arrays from scalars, TensorElements and 1-D tensors."""
        scalars = [1, 2, 3, 4, 5]
        wanted = np.array(scalars)

        # Plain list of Python scalars
        from_scalars = TensorArray(scalars)
        self.assertTupleEqual(from_scalars.numpy_shape, (len(scalars),))
        npt.assert_array_equal(from_scalars.to_numpy(), wanted)

        # Now with TensorElement values
        elements = [TensorElement(np.array(value)) for value in scalars]
        from_elements = pd.array(elements, dtype=TensorDtype())
        npt.assert_array_equal(from_elements.to_numpy(), wanted)

        # Now with list of 1d tensors
        singletons = [np.array([value]) for value in scalars]
        from_singletons = pd.array(singletons, dtype=TensorDtype())
        self.assertTupleEqual(
            from_singletons.to_numpy().shape, (len(singletons), 1))
        column = np.array([[value] for value in wanted])
        npt.assert_array_equal(from_singletons.to_numpy(), column)

        # Pandas will create list of copies of the tensor element for the given indices
        filled = pd.Series(np.nan, index=[0, 1, 2], dtype=TensorDtype())
        self.assertEqual(len(filled), 3)
        self.assertTupleEqual(filled.to_numpy().shape, (3,))
        missing = filled.isna()
        self.assertTrue(np.all(missing.to_numpy()))
def add_embeddings(df: pd.DataFrame,
                   bert: Any,
                   overlap: int = 32,
                   non_overlap: int = 64) -> pd.DataFrame:
    """
    Add BERT embeddings to a DataFrame of BERT tokens.

    :param df: DataFrame containing BERT tokens, as returned by
      :func:`make_bert_tokens`. Must contain a column
      ``input_id`` containing token IDs.
    :param bert: PyTorch-based BERT model from the ``transformers`` library
    :param overlap: (optional) how much overlap there should be between adjacent windows
    :param non_overlap: (optional) how much non-overlapping content between the
     overlapping regions there should be at the middle of each window?
    :returns: A copy of ``df`` with a new column, "embedding", containing
     BERT embeddings as a ``TensorArray``.

    .. note:: PyTorch must be installed to run this function.
    """
    # Import torch inline so that the rest of this library will function without it.
    # noinspection PyPackageRequirements
    import torch

    flat_input_ids = df["input_id"].values
    # Split the flat token sequence into windows for the model
    windows = seq_to_windows(flat_input_ids, overlap, non_overlap)

    # The output is only converted to numpy, never back-propagated through,
    # so run the forward pass without recording an autograd graph.
    with torch.no_grad():
        bert_result = bert(
            input_ids=torch.tensor(windows["input_ids"]),
            attention_mask=torch.tensor(windows["attention_masks"]),
        )

    # Map the first model output back from windows onto the flat sequence
    hidden_states = windows_to_seq(flat_input_ids,
                                   bert_result[0].detach().numpy(), overlap,
                                   non_overlap)
    embeddings = TensorArray(hidden_states)

    # Leave the caller's DataFrame untouched
    ret = df.copy()
    ret["embedding"] = embeddings
    return ret
    def test_large_display_string(self):
        """Check the truncated ``repr`` of a 100-row string tensor DataFrame."""

        # Uses the GenericArrayFormatter, doesn't work for Pandas 1.0.x but fixed in later versions
        words = np.array([["Hello", "world"]] * 100)
        frame = pd.DataFrame({"foo": TensorArray(words)})
        expected_text = textwrap.dedent(
            """\
                                  foo
                0   ['Hello' 'world']
                1   ['Hello' 'world']
                2   ['Hello' 'world']
                3   ['Hello' 'world']
                4   ['Hello' 'world']
                ..                ...
                95  ['Hello' 'world']
                96  ['Hello' 'world']
                97  ['Hello' 'world']
                98  ['Hello' 'world']
                99  ['Hello' 'world']
                
                [100 rows x 1 columns]"""
        )
        self.assertEqual(repr(frame), expected_text)
Example #28
0
    def test_create(self):
        """Construct TensorArrays from valid inputs and reject invalid ones."""
        # A single 3-D ndarray
        from_ndarray = TensorArray(np.ones([5, 2, 3]))
        self.assertEqual(len(from_ndarray), 5)

        # A list of equally shaped ndarrays
        from_list = TensorArray([np.ones([2, 3])] * 5)
        self.assertEqual(len(from_list), 5)

        # Empty inputs
        from_empty_ndarray = TensorArray(np.empty((0, 2)))
        self.assertEqual(len(from_empty_ndarray), 0)
        from_empty_list = TensorArray([])
        self.assertEqual(len(from_empty_list), 0)

        # Elements with differing shapes are rejected
        mismatched = [np.ones([2, 3]), np.ones([3, 2])]
        with self.assertRaises(ValueError):
            TensorArray(mismatched)

        # A bare integer is rejected
        with self.assertRaises(TypeError):
            TensorArray(2112)
 def test_create(self):
     """Build a DataFrame with a tensor column and check its row count."""
     data = np.array([[1, 2], [3, 4], [5, 6]])
     tensor = TensorArray(data)
     row_ids = list(range(len(data)))
     frame = pd.DataFrame({"i": row_ids, "tensor": tensor})
     self.assertEqual(len(frame), len(data))
 def test_numpy_properties(self):
     """Check the ``numpy_*`` properties against the wrapped ndarray."""
     source = np.arange(6).reshape(3, 2)
     tensor = TensorArray(source)
     # Each property must mirror the ndarray attribute of the same name
     self.assertEqual(tensor.numpy_ndim, source.ndim)
     self.assertEqual(tensor.numpy_shape, source.shape)
     self.assertEqual(tensor.numpy_dtype, source.dtype)