def test_any(self):
    # Element-wise comparison on a tensor Series followed by an ``any()``
    # aggregation.
    tensor = TensorArray(np.arange(6).reshape(3, 2))
    series = pd.Series(tensor)

    # Test as agg to TensorElement, defaults to axis=0
    divisible_by_three = (series % 3) == 0
    # Last row is [4, 5]: neither value is a multiple of 3.
    npt.assert_array_equal(divisible_by_three[2], np.array([False, False]))
    # Each column contains at least one multiple of 3 (0 and 3).
    npt.assert_array_equal(divisible_by_three.any(), np.array([True, True]))
def test_sort(self):
    # Sorting a DataFrame by a different column must reorder the tensor
    # column along with it and keep the tensor's element dtype.
    tensor = TensorArray(np.arange(6).reshape(3, 2))
    timestamps = pd.date_range('2018-01-01', periods=3, freq='H')
    frame = pd.DataFrame({"time": timestamps, "tensor": tensor})
    frame = frame.sort_values(by="time", ascending=False)

    sorted_tensor = frame["tensor"].array
    self.assertEqual(sorted_tensor.numpy_dtype, tensor.numpy_dtype)
    # Descending time order reverses the three original rows.
    reversed_rows = np.array([[4, 5], [2, 3], [0, 1]])
    npt.assert_array_equal(sorted_tensor, reversed_rows)
def test_concat(self):
    # Concatenating two TensorArrays keeps the rows of the first followed by
    # the rows of the second.
    first = TensorArray(np.arange(6).reshape((3, 2)))
    second = TensorArray(np.arange(6, 12).reshape((3, 2)))
    combined = TensorArray._concat_same_type((first, second))

    # NOTE(review): column alignment below follows numpy's str() layout for a
    # 2-D integer array; confirm against the actual output.
    expected_text = textwrap.dedent("""\
        [[ 0  1]
         [ 2  3]
         [ 4  5]
         [ 6  7]
         [ 8  9]
         [10 11]]""")
    self.assertEqual(str(combined), expected_text)
def test_repr(self):
    # Checks repr() of a bare TensorArray and of a Series wrapping it.
    tensor = TensorArray(np.array([[1, 2], [3, 4], [5, 6]]))

    # NOTE(review): continuation-line alignment follows numpy's repr layout;
    # confirm against the actual output.
    array_text = textwrap.dedent("""\
        array([[1, 2],
               [3, 4],
               [5, 6]])""")
    self.assertEqual(array_text, tensor.__repr__())

    # NOTE(review): index/value spacing follows pandas' Series repr layout;
    # confirm against the actual output.
    series_text = textwrap.dedent("""\
        0    [1, 2]
        1    [3, 4]
        2    [5, 6]
        dtype: TensorDtype""")
    self.assertEqual(series_text, repr(pd.Series(tensor)))
def test_take(self):
    # Exercises TensorArray.take() with and without fill of missing rows.
    tensor = TensorArray(np.array([[1, 2], [3, 4], [5, 6]]))

    # Test no missing gets same dtype
    first_and_last = np.array([[1, 2], [5, 6]])
    taken = tensor.take([0, 2], allow_fill=True)
    self.assertEqual(taken.numpy_dtype, first_and_last.dtype)
    npt.assert_array_equal(taken, first_and_last)

    taken = tensor.take([0, 2], allow_fill=False)
    npt.assert_array_equal(taken, first_and_last)

    # Test missing with nan fill gets promoted to float and filled
    with_missing_row = np.array([[3, 4], [np.nan, np.nan]])
    taken = tensor.take([1, -1], allow_fill=True)
    self.assertEqual(taken.numpy_dtype, with_missing_row.dtype)
    npt.assert_array_equal(taken, with_missing_row)
    npt.assert_array_equal(taken.isna(), [False, True])
def test_sum(self):
    # Group-by "sum" aggregation over a tensor column, for 1-D and 2-D
    # tensor elements. Checks the aggregated array and the DataFrame repr.
    keys = ["a", "a", "b", "c", "c", "c"]
    values = np.array([[1, 1]] * len(keys))
    df = pd.DataFrame({"key": keys, "value": TensorArray(values)})
    result_df = df.groupby("key").aggregate({"value": "sum"})

    # Check array gets unwrapped from TensorElements
    arr = result_df["value"].array
    self.assertEqual(arr.numpy_dtype, values.dtype)
    npt.assert_array_equal(arr.to_numpy(), [[2, 2], [1, 1], [3, 3]])

    # Check the resulting DataFrame
    # The index-name row is padded with trailing spaces to the full table
    # width, so the padding is spelled out explicitly rather than hidden at
    # the end of a line inside a triple-quoted literal.
    # NOTE(review): column widths follow pandas' fixed-width layout; confirm
    # against the actual output.
    self.assertEqual(
        repr(result_df),
        "\n".join((
            " " * 6 + "value",
            "key" + " " * 8,
            "a    [2, 2]",
            "b    [1, 1]",
            "c    [3, 3]",
        )),
    )

    # 2D values
    values2 = np.array([[[1, 1], [1, 1]]] * len(keys))
    df2 = pd.DataFrame({"key": keys, "value": TensorArray(values2)})
    result2_df = df2.groupby("key").aggregate({"value": "sum"})

    # Check array gets unwrapped from TensorElements
    arr2 = result2_df["value"].array
    # Compare against the dtype of the 2-D input that was actually aggregated.
    self.assertEqual(arr2.numpy_dtype, values2.dtype)
    npt.assert_array_equal(
        arr2.to_numpy(),
        [[[2, 2], [2, 2]], [[1, 1], [1, 1]], [[3, 3], [3, 3]]])

    # Check the resulting DataFrame
    self.assertEqual(
        repr(result2_df),
        "\n".join((
            " " * 16 + "value",
            "key" + " " * 18,
            "a    [[2, 2], [2, 2]]",
            "b    [[1, 1], [1, 1]]",
            "c    [[3, 3], [3, 3]]",
        )),
    )
def test_sum(self):
    # Summing a tensor column, first over all rows and then over a
    # boolean-selected subset.
    tensor = TensorArray(np.array([[1, 2], [3, 4], [5, 6]]))
    frame = pd.DataFrame({"s": tensor})

    total = frame["s"].sum()
    npt.assert_array_equal(total.to_numpy(), [9, 12])

    # Rows 0 and 2 only: [1, 2] + [5, 6].
    partial = frame["s"][[True, False, True]].sum()
    npt.assert_array_equal(partial.to_numpy(), [6, 8])
def _embedding_to_int(df: pd.DataFrame, colname: str):
    """
    Replace a column of embeddings with an integer version of itself so that
    test results will be more stable.

    Each value is multiplied by 10 and then converted to ``int``; the result is
    written back to the same column as a ``TensorArray``.

    :param df: DataFrame containing embeddings. MODIFIED IN PLACE.
    :param colname: Name of column where embeddings reside
    """
    scaled = df[colname].values.to_numpy() * 10.
    df[colname] = TensorArray(scaled.astype(int))
def test_create_series(self):
    # A Series can be built from a TensorArray with or without an explicit
    # TensorDtype, and with or without copying; all variants must be equal.
    data = np.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]] * 100)
    tensor = TensorArray(data)

    inferred = pd.Series(tensor)
    explicit = pd.Series(tensor, dtype=TensorDtype())
    copied = pd.Series(tensor, dtype=TensorDtype(), copy=True)

    self.assertEqual(len(data), len(inferred))
    npt.assert_array_equal(data, inferred.to_numpy())
    for other in (explicit, copied):
        pdt.assert_series_equal(inferred, other)
def test_parquet(self):
    # Round-trip a DataFrame with a tensor column through a Parquet file in
    # a temporary directory and expect an identical frame back.
    data = np.arange(10).reshape(5, 2)
    tensor = TensorArray(data)
    frame = pd.DataFrame({"i": list(range(len(data))), "tensor": tensor})

    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "tensor_array_test.parquet")
        frame.to_parquet(path)
        round_tripped = pd.read_parquet(path)
        pd.testing.assert_frame_equal(frame, round_tripped)
def test_large_display_numeric(self):
    # repr() of a 100-row DataFrame is truncated by pandas; check the
    # truncated rendering for integer and float tensor columns.
    # NOTE(review): column padding in both literals follows pandas'
    # fixed-width layout; confirm against the actual output.

    # Test integer, uses IntArrayFormatter
    int_frame = pd.DataFrame({"foo": TensorArray(np.array([[1, 2]] * 100))})
    int_text = textwrap.dedent("""\
               foo
        0   [1, 2]
        1   [1, 2]
        2   [1, 2]
        3   [1, 2]
        4   [1, 2]
        ..     ...
        95  [1, 2]
        96  [1, 2]
        97  [1, 2]
        98  [1, 2]
        99  [1, 2]

        [100 rows x 1 columns]""")
    self.assertEqual(repr(int_frame), int_text)

    # Test float, uses FloatArrayFormatter
    float_frame = pd.DataFrame({"foo": TensorArray(np.array([[1.1, 2.2]] * 100))})
    float_text = textwrap.dedent("""\
                   foo
        0   [1.1, 2.2]
        1   [1.1, 2.2]
        2   [1.1, 2.2]
        3   [1.1, 2.2]
        4   [1.1, 2.2]
        ..         ...
        95  [1.1, 2.2]
        96  [1.1, 2.2]
        97  [1.1, 2.2]
        98  [1.1, 2.2]
        99  [1.1, 2.2]

        [100 rows x 1 columns]""")
    self.assertEqual(repr(float_frame), float_text)
def test_int_tensor_selection(self):
    # Integer positions stored in a TensorArray are used as the indexer for
    # a TensorArray and for a Series backed by one.
    tensor = TensorArray([[1, 2], [3, 4], [5, 6]])
    positions = TensorArray([0, 2])
    picked_rows = np.array([[1, 2], [5, 6]])

    # Test TensorArray.__getitem__ with TensorArray
    npt.assert_array_equal(tensor[positions], picked_rows)

    series = pd.Series(tensor)

    # Test Series of TensorDtype selection with numpy array
    npt.assert_array_equal(series[np.asarray(positions)], picked_rows)

    # Test Series of TensorDtype selection with TensorArray
    npt.assert_array_equal(series[positions], picked_rows)

    # Test Series of TensorDtype selection by integer location
    npt.assert_array_equal(series.iloc[positions], picked_rows)
def test_slice(self):
    # Scalar indexing yields a single TensorElement; slice indexing yields a
    # TensorArray holding the selected rows.
    x = np.array([[1, 2], [3, 4], [5, 6]])
    s = TensorArray(x)

    result = s[1]
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(result, TensorElement)
    expected = np.array([3, 4])
    npt.assert_array_equal(expected, result)

    result = s[1:3]
    self.assertIsInstance(result, TensorArray)
    expected = np.array([[3, 4], [5, 6]])
    npt.assert_array_equal(expected, result.to_numpy())
def test_bool_tensor_selection(self):
    # A boolean mask stored in a TensorArray is used as the indexer for a
    # TensorArray and for a Series backed by one.
    tensor = TensorArray([[1, 2], [3, 4], [5, 6]])
    mask = TensorArray([True, False, True])
    kept_rows = np.array([[1, 2], [5, 6]])

    # Test TensorArray.__getitem__ with TensorArray
    npt.assert_array_equal(tensor[mask], kept_rows)

    # Test Series of TensorDtype selection with numpy array
    series = pd.Series(tensor)
    npt.assert_array_equal(series[np.asarray(mask)], kept_rows)

    # Test Series of TensorDtype selection with TensorArray
    # Currently fails due to Pandas not recognizing as bool index GH#162
    expect_failure = LooseVersion(pd.__version__) >= LooseVersion("1.1.0")
    if expect_failure:
        with self.assertRaises(Exception):
            series[mask]
    else:
        npt.assert_array_equal(series[mask], kept_rows)
def test_create(self):
    # Construction from several kinds of input, then a copy.

    # A single 3-D ndarray: length is the size of the first axis.
    tensor = TensorArray(np.ones([5, 2, 3]))
    self.assertEqual(len(tensor), 5)

    # A list of equally-shaped ndarrays.
    tensor = TensorArray([np.ones([2, 3])] * 5)
    self.assertEqual(len(tensor), 5)

    # Zero-length inputs.
    tensor = TensorArray(np.empty((0, 2)))
    self.assertEqual(len(tensor), 0)

    tensor = TensorArray([])
    self.assertEqual(len(tensor), 0)

    # Elements with mismatched shapes are rejected.
    mismatched = [np.ones([2, 3]), np.ones([3, 2])]
    with self.assertRaises(ValueError):
        TensorArray(mismatched)

    # Copy constructor
    duplicate = tensor.copy()
    self.assertEqual(len(tensor), len(duplicate))
def test_series_to_str(self):
    # Series.to_string() with max_rows=4 on a 10-row tensor Series shows the
    # first two and last two rows around a "..." truncation row.
    tensor = TensorArray(np.arange(50).reshape((10, 5)))
    series = pd.Series(tensor)
    rendered = series.to_string(max_rows=4)

    # NOTE(review): spacing follows pandas' fixed-width Series layout,
    # including the centred and right-padded "..." row; confirm against the
    # actual output.
    expected_text = "\n".join((
        "0    [ 0,  1,  2,  3,  4]",
        "1    [ 5,  6,  7,  8,  9]",
        " " * 13 + "..." + " " * 9,
        "8    [40, 41, 42, 43, 44]",
        "9    [45, 46, 47, 48, 49]",
    ))
    self.assertEqual(rendered, expected_text)
def test_isna(self):
    # isna() is checked for numeric, object and string tensors. In every
    # case only the second row is expected to be flagged; rows that are
    # only partly missing are not.
    only_second_row = np.array([False, True, False, False])

    # Test numeric
    numeric = np.array([[1, 2], [np.nan, np.nan], [3, np.nan], [5, 6]])
    npt.assert_equal(TensorArray(numeric).isna(), only_second_row)

    # Test object
    item = {"a": 1}
    objects = np.array([[item, item], None, [item, None], [item, item]], dtype=object)
    npt.assert_equal(TensorArray(objects).isna(), only_second_row)

    # Test str
    strings = np.array([["foo", "foo"], ["", ""], ["bar", ""], ["baz", "baz"]])
    npt.assert_equal(TensorArray(strings).isna(), only_second_row)
def test_series_to_str(self):
    # Series.to_string() with max_rows=4 on a 10-row tensor Series shows the
    # first two and last two rows around a "..." truncation row.
    tensor = TensorArray(np.arange(50).reshape((10, 5)))
    series = pd.Series(tensor)
    rendered = series.to_string(max_rows=4)

    # NOTE(review): spacing follows pandas' fixed-width Series layout
    # (values right-justified, "..." row centred and right-padded); confirm
    # against the actual output.
    expected_text = "\n".join((
        "0         [0 1 2 3 4]",
        "1         [5 6 7 8 9]",
        " " * 11 + "..." + " " * 7,
        "8    [40 41 42 43 44]",
        "9    [45 46 47 48 49]",
    ))
    self.assertEqual(rendered, expected_text)
def test_feather_auto_chunked(self):
    # Write 1024 rows to Feather with chunksize=512, then check that the
    # file holds at least two chunks and reads back as the same DataFrame.
    from pyarrow.feather import read_table, write_feather

    tensor = TensorArray(np.arange(2048).reshape(1024, 2))
    frame = pd.DataFrame({"i": list(range(len(tensor))), "tensor": tensor})
    arrow_table = pa.Table.from_pandas(frame)

    # Write table to feather and read back as a DataFrame
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "tensor_array_chunked_test.feather")
        write_feather(arrow_table, path, chunksize=512)

        reloaded_table = read_table(path)
        self.assertGreaterEqual(reloaded_table.column("tensor").num_chunks, 2)

        reloaded_frame = pd.read_feather(path)
        pd.testing.assert_frame_equal(frame, reloaded_frame)
def arrow_to_tensor_array(extension_array: pa.ExtensionArray) -> TensorArray:
    """
    Build a TensorArray from Arrow tensor data.

    A ``pyarrow.ChunkedArray`` is also accepted: a single chunk is converted
    directly, while multiple chunks are converted individually and their
    numpy values concatenated.

    :param extension_array: pyarrow.ExtensionArray with type ArrowTensorType
    :return: TensorArray
    """
    # Plain (non-chunked) arrays convert directly.
    if not isinstance(extension_array, pa.ChunkedArray):
        return TensorArray(extension_array.to_numpy())

    if extension_array.num_chunks > 1:
        # TODO: look into removing concat and constructing from list w/ shape
        per_chunk = [piece.to_numpy() for piece in extension_array.iterchunks()]
        values = np.concatenate(per_chunk)
    else:
        values = extension_array.chunk(0).to_numpy()
    return TensorArray(values)
def test_bool_indexing_series(self):
    # Boolean-list indexing on a tensor Series must always return a Series
    # holding exactly the selected rows, for every mask over two rows.
    s = pd.Series(TensorArray([[1, 2], [3, 4]]))

    # assertIsInstance is used throughout: on failure it reports the actual
    # type, unlike assertTrue(isinstance(...)).
    result = s[[False, False]]
    self.assertIsInstance(result, pd.Series)
    self.assertEqual(len(result), 0)

    result = s[[True, True]]
    self.assertIsInstance(result, pd.Series)
    pd.testing.assert_series_equal(result, s)

    result = s[[True, False]]
    self.assertIsInstance(result, pd.Series)
    self.assertEqual(len(result), 1)
    expected = s.iloc[[0]]
    pd.testing.assert_series_equal(result, expected)

    result = s[[False, True]]
    self.assertIsInstance(result, pd.Series)
    self.assertEqual(len(result), 1)
    expected = s.iloc[[1]]
    pd.testing.assert_series_equal(result, expected)
def test_bool_indexing_dataframe(self):
    # Boolean-list indexing on a DataFrame with a tensor column must always
    # return a DataFrame holding exactly the selected rows, for every mask
    # over two rows.
    s = TensorArray([[1, 2], [3, 4]])
    df = pd.DataFrame({"col1": s})

    # assertIsInstance is used throughout: on failure it reports the actual
    # type, unlike assertTrue(isinstance(...)).
    result = df[[False, False]]
    self.assertIsInstance(result, pd.DataFrame)
    self.assertEqual(len(result), 0)

    result = df[[True, True]]
    self.assertIsInstance(result, pd.DataFrame)
    pd.testing.assert_frame_equal(result, df)

    result = df[[True, False]]
    self.assertIsInstance(result, pd.DataFrame)
    self.assertEqual(len(result), 1)
    expected = df.iloc[[0]]
    pd.testing.assert_frame_equal(result, expected)

    result = df[[False, True]]
    self.assertIsInstance(result, pd.DataFrame)
    self.assertEqual(len(result), 1)
    expected = df.iloc[[1]]
    pd.testing.assert_frame_equal(result, expected)
def test_feather_chunked(self):
    # Build an Arrow table whose tensor column has two chunks, write it to
    # Feather, and check it reads back as the concatenation of both frames.
    from pyarrow.feather import write_feather

    tensor = TensorArray(np.arange(10).reshape(5, 2))
    first_frame = pd.DataFrame({"i": list(range(len(tensor))), "tensor": tensor})

    # Create a Table with 2 chunks
    first_table = pa.Table.from_pandas(first_frame)
    second_frame = first_frame.copy()
    second_frame["tensor"] = second_frame["tensor"] * 10
    second_table = pa.Table.from_pandas(second_frame)
    combined_table = pa.concat_tables([first_table, second_table])
    self.assertEqual(combined_table.column("tensor").num_chunks, 2)

    # Write table to feather and read back as a DataFrame
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "tensor_array_chunked_test.feather")
        write_feather(combined_table, path)
        reloaded = pd.read_feather(path)
        both_frames = pd.concat([first_frame, second_frame]).reset_index(drop=True)
        pd.testing.assert_frame_equal(both_frames, reloaded)
def test_bool_indexing(self):
    # Boolean-list indexing on a TensorArray must always return a
    # TensorArray holding exactly the selected rows, for every mask over
    # two rows.
    s = TensorArray([[1, 2], [3, 4]])

    # assertIsInstance is used throughout: on failure it reports the actual
    # type, unlike assertTrue(isinstance(...)).
    result = s[[True, True]]
    self.assertIsInstance(result, TensorArray)
    expected = np.array([[1, 2], [3, 4]])
    npt.assert_array_equal(result.to_numpy(), expected)

    result = s[[True, False]]
    self.assertIsInstance(result, TensorArray)
    expected = np.array([[1, 2]])
    npt.assert_array_equal(result.to_numpy(), expected)

    result = s[[False, True]]
    self.assertIsInstance(result, TensorArray)
    expected = np.array([[3, 4]])
    npt.assert_array_equal(result.to_numpy(), expected)

    # An all-False mask selects nothing but keeps the per-row shape.
    result = s[[False, False]]
    self.assertIsInstance(result, TensorArray)
    expected = np.empty((0, 2))
    npt.assert_array_equal(result.to_numpy(), expected)
def test_create_from_scalar_list(self):
    # Construction from scalars, from TensorElements, from 1-d arrays, and
    # from a broadcast NaN scalar.
    scalars = [1, 2, 3, 4, 5]
    as_ndarray = np.array(scalars)

    tensor = TensorArray(scalars)
    self.assertTupleEqual(tensor.numpy_shape, (len(scalars),))
    npt.assert_array_equal(tensor.to_numpy(), as_ndarray)

    # Now with TensorElement values
    elements = [TensorElement(np.array(value)) for value in scalars]
    tensor = pd.array(elements, dtype=TensorDtype())
    npt.assert_array_equal(tensor.to_numpy(), as_ndarray)

    # Now with list of 1d tensors
    one_d_arrays = [np.array([value]) for value in scalars]
    tensor = pd.array(one_d_arrays, dtype=TensorDtype())
    self.assertTupleEqual(tensor.to_numpy().shape, (len(one_d_arrays), 1))
    column_vector = np.array([[value] for value in as_ndarray])
    npt.assert_array_equal(tensor.to_numpy(), column_vector)

    # Pandas will create list of copies of the tensor element for the given indices
    series = pd.Series(np.nan, index=[0, 1, 2], dtype=TensorDtype())
    self.assertEqual(len(series), 3)
    self.assertTupleEqual(series.to_numpy().shape, (3,))
    missing = series.isna()
    self.assertTrue(np.all(missing.to_numpy()))
def add_embeddings(df: pd.DataFrame, bert: Any, overlap: int = 32,
                   non_overlap: int = 64) -> pd.DataFrame:
    """
    Add BERT embeddings to a DataFrame of BERT tokens.

    The token IDs are split into overlapping windows with
    :func:`seq_to_windows`, passed through the model, and the first element of
    the model output is stitched back into one sequence with
    :func:`windows_to_seq`.

    :param df: DataFrame containing BERT tokens, as returned by
      :func:`make_bert_tokens` Must contain a column ``input_id`` containing
      token IDs.
    :param bert: PyTorch-based BERT model from the ``transformers`` library
    :param overlap: (optional) how much overlap there should be between adjacent
      windows
    :param non_overlap: (optional) how much non-overlapping content between the
      overlapping regions there should be at the middle of each window?

    :returns: A copy of ``df`` with a new column, "embedding", containing
      BERT embeddings as a ``TensorArray``.

    .. note:: PyTorch must be installed to run this function.
    """
    # Import torch inline so that the rest of this library will function without it.
    # noinspection PyPackageRequirements
    import torch

    flat_input_ids = df["input_id"].values
    windows = seq_to_windows(flat_input_ids, overlap, non_overlap)

    # Inference only: the output is detached and converted to numpy right
    # away, so there is no reason to record an autograd graph for the forward
    # pass. Disabling gradient tracking does not change the computed values.
    with torch.no_grad():
        bert_result = bert(
            input_ids=torch.tensor(windows["input_ids"]),
            attention_mask=torch.tensor(windows["attention_masks"]),
        )

    hidden_states = windows_to_seq(
        flat_input_ids, bert_result[0].detach().numpy(), overlap, non_overlap
    )

    # Leave the caller's DataFrame untouched; attach the embeddings to a copy.
    ret = df.copy()
    ret["embedding"] = TensorArray(hidden_states)
    return ret
def test_large_display_string(self):
    # repr() of a 100-row DataFrame with a string tensor column is truncated
    # by pandas; check the truncated rendering.

    # Uses the GenericArrayFormatter, doesn't work for Pandas 1.0.x but fixed in later versions
    frame = pd.DataFrame({"foo": TensorArray(np.array([["Hello", "world"]] * 100))})

    # NOTE(review): column padding follows pandas' fixed-width layout;
    # confirm against the actual output.
    expected_text = textwrap.dedent("""\
                          foo
        0   ['Hello' 'world']
        1   ['Hello' 'world']
        2   ['Hello' 'world']
        3   ['Hello' 'world']
        4   ['Hello' 'world']
        ..                ...
        95  ['Hello' 'world']
        96  ['Hello' 'world']
        97  ['Hello' 'world']
        98  ['Hello' 'world']
        99  ['Hello' 'world']

        [100 rows x 1 columns]""")
    self.assertEqual(repr(frame), expected_text)
def test_create(self):
    # Construction from several kinds of valid input, plus two kinds of
    # invalid input that must be rejected.

    # A single 3-D ndarray: length is the size of the first axis.
    tensor = TensorArray(np.ones([5, 2, 3]))
    self.assertEqual(len(tensor), 5)

    # A list of equally-shaped ndarrays.
    tensor = TensorArray([np.ones([2, 3])] * 5)
    self.assertEqual(len(tensor), 5)

    # Zero-length inputs.
    tensor = TensorArray(np.empty((0, 2)))
    self.assertEqual(len(tensor), 0)

    tensor = TensorArray([])
    self.assertEqual(len(tensor), 0)

    # Elements with mismatched shapes are rejected.
    mismatched = [np.ones([2, 3]), np.ones([3, 2])]
    with self.assertRaises(ValueError):
        TensorArray(mismatched)

    # A bare scalar is rejected.
    with self.assertRaises(TypeError):
        TensorArray(2112)
def test_create(self):
    # A DataFrame built with a tensor column has one row per tensor element.
    data = np.array([[1, 2], [3, 4], [5, 6]])
    columns = {"i": list(range(len(data))), "tensor": TensorArray(data)}
    frame = pd.DataFrame(columns)
    self.assertEqual(len(frame), len(data))
def test_numpy_properties(self):
    # The numpy_* properties of a TensorArray must mirror the corresponding
    # attributes of the ndarray it was built from.
    source = np.arange(6).reshape(3, 2)
    tensor = TensorArray(source)

    pairs = (
        (tensor.numpy_ndim, source.ndim),
        (tensor.numpy_shape, source.shape),
        (tensor.numpy_dtype, source.dtype),
    )
    for actual, wanted in pairs:
        self.assertEqual(actual, wanted)