Exemple #1
0
    def pandas(self, df, name=None, database=None, persist=False):
        """
        Create a (possibly temp) parquet table from a local pandas DataFrame.

        The frame is written as a headerless, index-less CSV, uploaded to a
        uniquely named directory under ``options.impala.temp_hdfs_path``,
        exposed as a temporary external delimited table, and then copied
        into a Parquet table with ``create_table(..., expr=...)``. The
        staging directory is removed afterwards, even when defining the
        staging table or the copy fails.

        Parameters
        ----------
        df : pandas.DataFrame
        name : string, optional
          Resolved to a concrete name by ``_get_concrete_table_path``
        database : string, optional
          Resolved to a concrete database by ``_get_concrete_table_path``
        persist : boolean, default False
          Forwarded to ``_get_concrete_table_path`` and ``_wrap_new_table``

        Returns
        -------
        The value returned by ``_wrap_new_table`` for the new table
        """
        name, database = self._get_concrete_table_path(name, database,
                                                       persist=persist)
        qualified_name = self._fully_qualified_name(name, database)

        # write df to a temp CSV file on HDFS. Missing values are written as
        # a backslash followed by N; the backslash has to be escaped because
        # a bare '\N' begins a named-character escape in Python 3 literals
        # and does not compile.
        temp_csv_hdfs_dir = pjoin(options.impala.temp_hdfs_path, util.guid())
        buf = BytesIO()
        df.to_csv(buf, header=False, index=False, na_rep='\\N')
        self.hdfs.put(pjoin(temp_csv_hdfs_dir, '0.csv'), buf)

        try:
            # define a temporary table using delimited data
            schema = util.pandas_to_ibis_schema(df)
            table = self.delimited_file(
                temp_csv_hdfs_dir, schema,
                name='ibis_tmp_pandas_{0}'.format(util.guid()),
                database=database,
                external=True, persist=False)

            # CTAS into Parquet
            self.create_table(name, expr=table, database=database,
                              format='parquet', overwrite=False)
        finally:
            # cleanup: never leave the staged CSV behind, even on failure
            self.hdfs.delete(temp_csv_hdfs_dir, recursive=True)

        return self._wrap_new_table(qualified_name, persist)
 def test_dtype_datetime64(self):
     """A column of ``pd.Timestamp`` values is inferred as ``timestamp``."""
     stamps = [
         pd.Timestamp(text)
         for text in ('2010-11-01 00:01:00',
                      '2010-11-01 00:02:00.1000',
                      '2010-11-01 00:03:00.300000')
     ]
     frame = pd.DataFrame({'col': stamps})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'timestamp')])
     assert actual == wanted
 def test_dtype_timedelta64(self):
     """A column of ``pd.Timedelta`` values is inferred as ibis ``int64``."""
     deltas = [
         pd.Timedelta(spec)
         for spec in ('1 days',
                      '-1 days 2 min 3us',
                      '-2 days +23:57:59.999997')
     ]
     frame = pd.DataFrame({'col': deltas})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'int64')])
     assert actual == wanted
 def test_dtype_timedelta64(self):
     """Check that timedelta data yields an ``int64`` column in the schema."""
     first = pd.Timedelta('1 days')
     second = pd.Timedelta('-1 days 2 min 3us')
     third = pd.Timedelta('-2 days +23:57:59.999997')
     frame = pd.DataFrame({'col': [first, second, third]})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int64')])
 def test_dtype_datetime64(self):
     """Check that timestamp data yields a ``timestamp`` schema column."""
     first = pd.Timestamp('2010-11-01 00:01:00')
     second = pd.Timestamp('2010-11-01 00:02:00.1000')
     third = pd.Timestamp('2010-11-01 00:03:00.300000')
     frame = pd.DataFrame({'col': [first, second, third]})
     assert pandas_to_ibis_schema(frame) == ibis.schema(
         [('col', 'timestamp')])
Exemple #6
0
    def pandas(self, df, name=None, database=None, persist=False):
        """
        Create a (possibly temp) parquet table from a local pandas DataFrame.

        Steps: dump the frame to a headerless, index-less CSV; upload it to
        a uniquely named directory under ``options.impala.temp_hdfs_path``;
        declare a temporary external delimited table over that directory;
        copy it into a Parquet table via ``create_table(..., expr=...)``.
        The staging directory is deleted afterwards, including when
        declaring the staging table or the copy raises.

        Parameters
        ----------
        df : pandas.DataFrame
        name : string, optional
          Resolved to a concrete name by ``_get_concrete_table_path``
        database : string, optional
          Resolved to a concrete database by ``_get_concrete_table_path``
        persist : boolean, default False
          Forwarded to ``_get_concrete_table_path`` and ``_wrap_new_table``

        Returns
        -------
        The value returned by ``_wrap_new_table`` for the new table
        """
        name, database = self._get_concrete_table_path(name,
                                                       database,
                                                       persist=persist)
        qualified_name = self._fully_qualified_name(name, database)

        # write df to a temp CSV file on HDFS. Missing values are emitted as
        # backslash + N; the backslash is escaped because a bare '\N' opens
        # a named-character escape in Python 3 literals and fails to compile.
        temp_csv_hdfs_dir = pjoin(options.impala.temp_hdfs_path, util.guid())
        buf = BytesIO()
        df.to_csv(buf, header=False, index=False, na_rep='\\N')
        self.hdfs.put(pjoin(temp_csv_hdfs_dir, '0.csv'), buf)

        try:
            # define a temporary table using delimited data
            schema = util.pandas_to_ibis_schema(df)
            table = self.delimited_file(temp_csv_hdfs_dir,
                                        schema,
                                        name='ibis_tmp_pandas_{0}'.format(
                                            util.guid()),
                                        database=database,
                                        external=True,
                                        persist=False)

            # CTAS into Parquet
            self.create_table(name,
                              expr=table,
                              database=database,
                              format='parquet',
                              overwrite=False)
        finally:
            # cleanup: remove the staged CSV whether or not the copy worked
            self.hdfs.delete(temp_csv_hdfs_dir, recursive=True)

        return self._wrap_new_table(qualified_name, persist)
 def test_dtype_bool(self):
     """A column of Python bools is inferred as ibis ``boolean``."""
     frame = pd.DataFrame({'col': [True, False, False]})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'boolean')])
 def test_dtype_uint64(self):
     """Schema inference on a ``uint64`` column raises ``IbisTypeError``."""
     df = pd.DataFrame({'col': np.uint64([666, 2, 3])})
     with self.assertRaises(IbisTypeError):
         # The call is expected to raise, so its result is not bound.
         pandas_to_ibis_schema(df)
 def test_dtype_float64(self):
     """A ``float64`` column is inferred as ibis ``double``."""
     values = np.float64([-3e43, 43., 10000000.])
     frame = pd.DataFrame({'col': values})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'double')])
 def test_dtype_uint16(self):
     """A ``uint16`` column is inferred as the signed ibis ``int32``."""
     values = np.uint16([5569, 1, 33])
     frame = pd.DataFrame({'col': values})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int32')])
 def test_dtype_uint32(self):
     """A ``uint32`` column is inferred as the signed ibis ``int64``."""
     values = np.uint32([100, 0, 6])
     frame = pd.DataFrame({'col': values})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int64')])
 def test_dtype_uint16(self):
     """Check that unsigned 16-bit data yields an ``int32`` schema column."""
     frame = pd.DataFrame({'col': np.uint16([5569, 1, 33])})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'int32')])
     assert actual == wanted
 def test_dtype_float32(self):
     """A ``float32`` column is inferred as ibis ``float``."""
     values = np.float32([45e-3, -0.4, 99.])
     frame = pd.DataFrame({'col': values})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'float')])
 def test_dtype_bool(self):
     """Check that boolean data yields a ``boolean`` schema column."""
     flags = [True, False, False]
     frame = pd.DataFrame({'col': flags})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'boolean')])
     assert actual == wanted
 def test_dtype_int32(self):
     """An ``int32`` column is inferred as ibis ``int32``."""
     values = np.int32([-12, 3, 25000])
     frame = pd.DataFrame({'col': values})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int32')])
 def test_dtype_string(self):
     """A column of Python strings is inferred as ibis ``string``."""
     frame = pd.DataFrame({'col': ['foo', 'bar', 'hello']})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'string')])
 def test_dtype_categorical(self):
     """A frame built with ``dtype='category'`` is inferred as ``category``."""
     labels = ['a', 'b', 'c', 'a']
     frame = pd.DataFrame({'col': labels}, dtype='category')
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'category')])
 def test_dtype_string(self):
     """Check that string data yields a ``string`` schema column."""
     words = ['foo', 'bar', 'hello']
     frame = pd.DataFrame({'col': words})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'string')])
     assert actual == wanted
 def test_dtype_uint32(self):
     """Check that unsigned 32-bit data yields an ``int64`` schema column."""
     frame = pd.DataFrame({'col': np.uint32([100, 0, 6])})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'int64')])
     assert actual == wanted
 def test_dtype_uint64(self):
     """Check that unsigned 64-bit data makes inference raise IbisTypeError."""
     df = pd.DataFrame({'col': np.uint64([666, 2, 3])})
     with self.assertRaises(IbisTypeError):
         # Only the exception is checked; no result is kept.
         pandas_to_ibis_schema(df)
 def test_dtype_categorical(self):
     """Check that categorical data yields a ``category`` schema column."""
     frame = pd.DataFrame({'col': ['a', 'b', 'c', 'a']}, dtype='category')
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'category')])
     assert actual == wanted
 def test_dtype_int16(self):
     """An ``int16`` column is inferred as ibis ``int16``."""
     values = np.int16([-5, 0, 12])
     frame = pd.DataFrame({'col': values})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int16')])
 def test_dtype_int16(self):
     """Check that signed 16-bit data yields an ``int16`` schema column."""
     frame = pd.DataFrame({'col': np.int16([-5, 0, 12])})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'int16')])
     assert actual == wanted
 def test_dtype_int32(self):
     """Check that signed 32-bit data yields an ``int32`` schema column."""
     frame = pd.DataFrame({'col': np.int32([-12, 3, 25000])})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'int32')])
     assert actual == wanted
 def test_dtype_int64(self):
     """An ``int64`` column is inferred as ibis ``int64``."""
     values = np.int64([102, 67228734, -0])
     frame = pd.DataFrame({'col': values})
     assert pandas_to_ibis_schema(frame) == ibis.schema([('col', 'int64')])
 def test_dtype_float32(self):
     """Check that 32-bit float data yields a ``float`` schema column."""
     frame = pd.DataFrame({'col': np.float32([45e-3, -0.4, 99.])})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'float')])
     assert actual == wanted
 def test_dtype_float64(self):
     """Check that 64-bit float data yields a ``double`` schema column."""
     frame = pd.DataFrame({'col': np.float64([-3e43, 43., 10000000.])})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'double')])
     assert actual == wanted
 def test_dtype_int64(self):
     """Check that signed 64-bit data yields an ``int64`` schema column."""
     frame = pd.DataFrame({'col': np.int64([102, 67228734, -0])})
     actual = pandas_to_ibis_schema(frame)
     wanted = ibis.schema([('col', 'int64')])
     assert actual == wanted