def pandas(self, df, name=None, database=None, persist=False):
    """
    Create a (possibly temp) parquet table from a local pandas DataFrame.

    The frame is dumped as a headerless, index-free CSV file named ``0.csv``
    inside a uniquely named directory under
    ``options.impala.temp_hdfs_path``. That directory is exposed as a
    temporary external delimited table, whose contents are then copied into
    a Parquet table through ``create_table``. The CSV directory is removed
    afterwards, including when table creation fails.

    Parameters
    ----------
    df : pandas.DataFrame
    name : string, optional
      Resolved to a concrete table name by ``_get_concrete_table_path``
    database : string, optional
      Resolved together with ``name``
    persist : boolean, default False
      Forwarded to ``_get_concrete_table_path`` and ``_wrap_new_table``

    Returns
    -------
    The value returned by ``self._wrap_new_table(qualified_name, persist)``
    """
    name, database = self._get_concrete_table_path(name, database,
                                                   persist=persist)
    qualified_name = self._fully_qualified_name(name, database)

    # write df to a temp CSV file on HDFS
    temp_csv_hdfs_dir = pjoin(options.impala.temp_hdfs_path, util.guid())
    buf = BytesIO()
    # The backslash is escaped on purpose: a bare '\N' is a malformed
    # unicode escape (SyntaxError) on Python 3, while '\\N' yields the same
    # two characters on every Python version.
    # Presumably this is the NULL marker the delimited table expects --
    # confirm against the engine's text-format documentation.
    df.to_csv(buf, header=False, index=False, na_rep='\\N')
    # NOTE(review): buf is positioned at end-of-data after to_csv; this
    # relies on hdfs.put rewinding or reading the whole buffer -- confirm.
    self.hdfs.put(pjoin(temp_csv_hdfs_dir, '0.csv'), buf)

    try:
        # define a temporary table using delimited data
        schema = util.pandas_to_ibis_schema(df)
        table = self.delimited_file(
            temp_csv_hdfs_dir, schema,
            name='ibis_tmp_pandas_{0}'.format(util.guid()),
            database=database,
            external=True,
            persist=False)

        # CTAS into Parquet
        self.create_table(name, expr=table, database=database,
                          format='parquet', overwrite=False)
    finally:
        # cleanup: drop the staged CSV even if the steps above raised, so a
        # failed load does not leave data behind under the temp path
        self.hdfs.delete(temp_csv_hdfs_dir, recursive=True)

    return self._wrap_new_table(qualified_name, persist)
def test_dtype_datetime64(self):
    # A column of pandas Timestamps (with and without fractional seconds)
    # is expected to be inferred as an ibis 'timestamp' column.
    raw_stamps = ('2010-11-01 00:01:00',
                  '2010-11-01 00:02:00.1000',
                  '2010-11-01 00:03:00.300000')
    frame = pd.DataFrame({'col': [pd.Timestamp(text) for text in raw_stamps]})

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'timestamp')])
    assert actual == wanted
def test_dtype_timedelta64(self):
    # Timedelta values (positive and negative) are expected to be inferred
    # as an ibis 'int64' column.
    deltas = [
        pd.Timedelta(text)
        for text in ('1 days',
                     '-1 days 2 min 3us',
                     '-2 days +23:57:59.999997')
    ]
    frame = pd.DataFrame({'col': deltas})

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'int64')])
    assert actual == wanted
def test_dtype_timedelta64(self):
    # Schema inference on a timedelta column: the resulting ibis schema
    # should describe 'col' as 'int64'.
    first = pd.Timedelta('1 days')
    second = pd.Timedelta('-1 days 2 min 3us')
    third = pd.Timedelta('-2 days +23:57:59.999997')
    source = pd.DataFrame({'col': [first, second, third]})

    assert pandas_to_ibis_schema(source) == ibis.schema([('col', 'int64')])
def test_dtype_datetime64(self):
    # Schema inference on a Timestamp column: the resulting ibis schema
    # should describe 'col' as 'timestamp'.
    first = pd.Timestamp('2010-11-01 00:01:00')
    second = pd.Timestamp('2010-11-01 00:02:00.1000')
    third = pd.Timestamp('2010-11-01 00:03:00.300000')
    source = pd.DataFrame({'col': [first, second, third]})

    result = pandas_to_ibis_schema(source)
    target = ibis.schema([('col', 'timestamp')])
    assert result == target
def pandas(self, df, name=None, database=None, persist=False):
    """
    Create a (possibly temp) parquet table from a local pandas DataFrame.

    Steps performed:

    1. Serialize ``df`` to CSV (no header, no index) and upload it as
       ``0.csv`` into a fresh directory under
       ``options.impala.temp_hdfs_path``.
    2. Declare a temporary external delimited table over that directory,
       using the schema derived by ``util.pandas_to_ibis_schema``.
    3. Copy that table into a Parquet table via ``create_table``.
    4. Delete the uploaded CSV directory, whether or not steps 2-3 succeeded.

    Parameters
    ----------
    df : pandas.DataFrame
    name : string, optional
      Resolved to a concrete table name by ``_get_concrete_table_path``
    database : string, optional
      Resolved together with ``name``
    persist : boolean, default False
      Forwarded to ``_get_concrete_table_path`` and ``_wrap_new_table``

    Returns
    -------
    The value returned by ``self._wrap_new_table(qualified_name, persist)``
    """
    name, database = self._get_concrete_table_path(name, database,
                                                   persist=persist)
    qualified_name = self._fully_qualified_name(name, database)

    # write df to a temp CSV file on HDFS
    temp_csv_hdfs_dir = pjoin(options.impala.temp_hdfs_path, util.guid())
    buf = BytesIO()
    # '\\N' (escaped) instead of '\N': the unescaped form is rejected by the
    # Python 3 parser as a malformed \N{...} escape; the runtime value is
    # the same backslash + 'N' pair either way.
    # Presumably the NULL marker understood by the delimited table --
    # confirm against the engine's text-format documentation.
    df.to_csv(buf, header=False, index=False, na_rep='\\N')
    # NOTE(review): buf is positioned at end-of-data after to_csv; this
    # relies on hdfs.put rewinding or reading the whole buffer -- confirm.
    self.hdfs.put(pjoin(temp_csv_hdfs_dir, '0.csv'), buf)

    try:
        # define a temporary table using delimited data
        schema = util.pandas_to_ibis_schema(df)
        tmp_table_name = 'ibis_tmp_pandas_{0}'.format(util.guid())
        table = self.delimited_file(temp_csv_hdfs_dir, schema,
                                    name=tmp_table_name,
                                    database=database,
                                    external=True,
                                    persist=False)

        # CTAS into Parquet
        self.create_table(name, expr=table, database=database,
                          format='parquet', overwrite=False)
    finally:
        # cleanup runs on failure too, so an aborted load does not leave
        # the staged CSV behind under the temp path
        self.hdfs.delete(temp_csv_hdfs_dir, recursive=True)

    return self._wrap_new_table(qualified_name, persist)
def test_dtype_bool(self):
    # A column of Python bools is expected to be inferred as ibis 'boolean'.
    flags = [True, False, False]
    frame = pd.DataFrame({'col': flags})

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'boolean')])
    assert actual == wanted
def test_dtype_uint64(self):
    # Schema inference on a uint64 column is expected to raise
    # IbisTypeError rather than return a schema.
    # Presumably because no signed ibis integer type covers the full
    # unsigned 64-bit range -- confirm against pandas_to_ibis_schema.
    df = pd.DataFrame({'col': np.uint64([666, 2, 3])})
    with self.assertRaises(IbisTypeError):
        # The return value is deliberately not bound: only the raised
        # exception matters, and an unused local would trip linters.
        pandas_to_ibis_schema(df)
def test_dtype_float64(self):
    # A float64 column is expected to be inferred as ibis 'double'.
    values = np.float64([-3e43, 43., 10000000.])
    frame = pd.DataFrame({'col': values})

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'double')])
    assert actual == wanted
def test_dtype_uint16(self):
    # An unsigned 16-bit column is expected to be inferred as ibis 'int32'
    # (presumably widened so the full unsigned range fits a signed type).
    values = np.uint16([5569, 1, 33])
    frame = pd.DataFrame({'col': values})

    assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int32')])
def test_dtype_uint32(self):
    # An unsigned 32-bit column is expected to be inferred as ibis 'int64'
    # (presumably widened so the full unsigned range fits a signed type).
    values = np.uint32([100, 0, 6])
    frame = pd.DataFrame({'col': values})

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'int64')])
    assert actual == wanted
def test_dtype_float32(self):
    # A float32 column is expected to be inferred as ibis 'float'.
    values = np.float32([45e-3, -0.4, 99.])
    frame = pd.DataFrame({'col': values})

    assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'float')])
def test_dtype_int32(self):
    # A signed 32-bit column is expected to keep its width: ibis 'int32'.
    values = np.int32([-12, 3, 25000])
    frame = pd.DataFrame({'col': values})

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'int32')])
    assert actual == wanted
def test_dtype_string(self):
    # A column of Python strings is expected to be inferred as ibis 'string'.
    words = ['foo', 'bar', 'hello']
    frame = pd.DataFrame({'col': words})

    assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'string')])
def test_dtype_categorical(self):
    # A frame built with dtype='category' is expected to be inferred as an
    # ibis 'category' column.
    labels = ['a', 'b', 'c', 'a']
    frame = pd.DataFrame({'col': labels}, dtype='category')

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'category')])
    assert actual == wanted
def test_dtype_int16(self):
    # A signed 16-bit column is expected to keep its width: ibis 'int16'.
    values = np.int16([-5, 0, 12])
    frame = pd.DataFrame({'col': values})

    assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int16')])
def test_dtype_int64(self):
    # A signed 64-bit column is expected to be inferred as ibis 'int64'.
    values = np.int64([102, 67228734, -0])
    frame = pd.DataFrame({'col': values})

    actual = pandas_to_ibis_schema(frame)
    wanted = ibis.schema([('col', 'int64')])
    assert actual == wanted