def testSparseWithoutShape(self):
    # Requesting a sparse tensor without passing ``shape`` is expected to be
    # rejected with ValueError.
    import pandas as pd

    # One coordinate row: index columns 'i' and 'j', value column 'v'.
    frame = pd.DataFrame([[1, 2, 3.1]], columns=['i', 'j', 'v'])
    df = DataFrame(frame)

    # Callable form of assertRaises; note that no ``shape`` keyword is given.
    self.assertRaises(
        ValueError,
        df.to_mars_tensor_via_oss,
        ['i', 'j'], 'v', 15, sparse=True, oss_path='fake'
    )
def testSparseVectorToMars(self):
    # Round trip for a 1-D vector: persist (index, value) pairs as a
    # DataFrame, convert them to a sparse tensor through OSS, store that
    # tensor into a fresh table and compare the stored rows with the input.
    import numpy as np
    import pandas as pd

    shape = (50, )
    values = np.random.rand(*shape)
    kv = [(idx, values[idx]) for idx in range(shape[0])]

    frame = pd.DataFrame(kv, columns=['i', 'v'])
    df = DataFrame(frame).persist(
        tn('test_vector_to_mars'), lifecycle=1, odps=self.odps)

    # self.config.oss unpacks to (access id, secret key, bucket, endpoint).
    access_id, secret_key, bucket, endpoint = self.config.oss
    oss_kwargs = dict(
        oss_access_id=access_id,
        oss_access_key=secret_key,
        oss_bucket_name=bucket,
        oss_endpoint=endpoint,
    )

    t = df.to_mars_tensor_via_oss(
        ['i'], 'v', 15,
        oss_path=tn('test_vector_to_mars'), shape=shape, sparse=True,
        **oss_kwargs)

    # Drop any leftover target table before storing the tensor.
    table_name = tn('test_vector_to_mars_store')
    self.odps.delete_table(table_name, if_exists=True)
    self.odps.persist_tensor_via_oss(
        t, table_name, ['x'], 'y', oss_path=table_name, **oss_kwargs)

    with self.odps.get_table(table_name).open_reader() as reader:
        result = [(rec['x'], rec['y']) for rec in reader]
    # Rows are compared in index order.
    result.sort(key=lambda pair: pair[0])
    self.assertEqual(kv, result)
def testNoPartitionToMars(self):
    # Round trip for a 2-D matrix stored as (i, j, v) rows in a table created
    # by DataFrame.persist: convert it to a tensor through OSS in several
    # modes (dense, reuse of an existing OSS file, dense without chunks,
    # sparse), store each tensor into a table and compare the stored rows
    # with the original triples.
    import numpy as np
    import pandas as pd

    shape = (100, 50)
    data = np.random.rand(*shape)
    # product() yields (i, j) in lexicographic order, which is the same
    # order the read-back rows are sorted into below.
    kv = [(i, j, data[i, j]) for i, j in product(*[range(s) for s in shape])]
    pdf = pd.DataFrame(kv, columns=['i', 'j', 'v'])
    df = DataFrame(pdf).persist(tn('test_no_partition_dense_to_mars'), lifecycle=1, odps=self.odps)

    # self.config.oss unpacks to (access id, secret key, bucket, endpoint);
    # the same four settings are passed to every OSS-backed call.
    oss_access_id, oss_secret_access_key, oss_bucket_name, oss_endpoint = self.config.oss
    oss_kwargs = dict(
        oss_access_id=oss_access_id,
        oss_access_key=oss_secret_access_key,
        oss_bucket_name=oss_bucket_name,
        oss_endpoint=oss_endpoint,
    )

    def to_tensor(*chunk_args, **extra):
        # Convert ``df`` to a tensor via the shared OSS path; positional
        # ``chunk_args`` follow the index/value column arguments unchanged.
        call_kwargs = dict(oss_kwargs, oss_path=tn('test_no_partition_dense_to_mars_oss'))
        call_kwargs.update(extra)
        return df.to_mars_tensor_via_oss(['i', 'j'], 'v', *chunk_args, **call_kwargs)

    def assert_round_trip(tensor):
        # Store ``tensor`` into a freshly dropped table and check that the
        # rows read back, sorted by (x, y), equal ``kv``.
        table_name = tn('test_no_partition_dense_to_mars_store')
        self.odps.delete_table(table_name, if_exists=True)
        self.odps.persist_tensor_via_oss(
            tensor, table_name, ['x', 'y'], 'z', oss_path=table_name, **oss_kwargs)
        with self.odps.get_table(table_name).open_reader() as reader:
            result = sorted([(r['x'], r['y'], r['z']) for r in reader],
                            key=lambda x: (x[0], x[1]))
        self.assertEqual(kv, result)

    # test dense
    # The returned tensor is not checked here; presumably this call leaves
    # the OSS data in place for the next step -- TODO confirm.
    to_tensor(15, shape=shape)

    # test if oss file exist (no ``shape`` is passed in this mode)
    t = to_tensor(15, oss_file_exist=True)
    assert_round_trip(t)

    # test dense without setting chunks
    t = to_tensor(shape=shape)
    assert_round_trip(t)

    # test sparse
    t = to_tensor(15, shape=shape, sparse=True)
    self.assertTrue(t.issparse())
    assert_round_trip(t)