def test_create_table_parquet_with_schema():
    # Explicit schema: column list is rendered inline in the DDL.
    hdfs_path = '/path/to/'
    table_schema = ibis.schema(
        [('foo', 'string'), ('bar', 'int8'), ('baz', 'int16')]
    )

    stmt = ddl.CreateTableParquet(
        'new_table',
        hdfs_path,
        schema=table_schema,
        external=True,
        can_exist=True,
        database='foo',
    )
    compiled = stmt.compile()

    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
(`foo` string,
 `bar` tinyint,
 `baz` smallint)
STORED AS PARQUET
LOCATION '{0}'""".format(hdfs_path)
    assert compiled == expected
def test_create_table_like_parquet():
    # Schema borrowed from an example parquet file via LIKE PARQUET.
    hdfs_path = '/path/to/'
    example_file = '/path/to/parquetfile'

    stmt = ddl.CreateTableParquet(
        'new_table',
        hdfs_path,
        example_file=example_file,
        can_exist=True,
        database='foo',
    )
    compiled = stmt.compile()

    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE PARQUET '{0}'
STORED AS PARQUET
LOCATION '{1}'""".format(example_file, hdfs_path)
    assert compiled == expected
def test_create_table_parquet_like_other():
    # alternative to "LIKE PARQUET": copy the schema of an existing table.
    hdfs_path = '/path/to/'
    source_table = 'db.other'

    stmt = ddl.CreateTableParquet(
        'new_table',
        hdfs_path,
        example_table=source_table,
        can_exist=True,
        database='foo',
    )
    compiled = stmt.compile()

    expected = """\
CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`
LIKE {0}
STORED AS PARQUET
LOCATION '{1}'""".format(source_table, hdfs_path)
    assert compiled == expected
def parquet_file(
    self,
    hdfs_dir,
    schema=None,
    name=None,
    database=None,
    external=True,
    like_file=None,
    like_table=None,
    persist=False,
):
    """Expose a parquet file already sitting in HDFS as an Ibis table.

    The created table can be given an explicit name and persisted;
    otherwise a unique name is generated. Temporarily, for any
    non-persistent external table created by Ibis we will attempt to
    drop it when the underlying object is garbage collected (or the
    Python interpreter shuts down normally).

    Parameters
    ----------
    hdfs_dir
        Path in HDFS
    schema
        If no schema provided, and neither of the like_* argument is
        passed, one will be inferred from one of the parquet files in
        the directory.
    like_file
        Absolute path to Parquet file in HDFS to use for schema
        definitions. An alternative to having to supply an explicit
        schema
    like_table
        Fully scoped and escaped string to an Impala table whose schema
        we will use for the newly created table.
    name
        Random unique name generated otherwise
    database
        Database to create the (possibly temporary) table in
    external
        If a table is external, the referenced data will not be deleted
        when the table is dropped in Impala. Otherwise (external=False)
        Impala takes ownership of the Parquet file.
    persist
        Do not drop the table during garbage collection

    Returns
    -------
    ImpalaTable
        Impala table expression
    """
    name, database = self._get_concrete_table_path(
        name, database, persist=persist
    )

    if schema is None and like_file is None and like_table is None:
        # Nothing to derive the schema from was supplied, so point
        # LIKE PARQUET at the first real data file in the directory
        # (skipping hidden/underscore-prefixed names and in-flight
        # .tmp/.copying files).
        file_names = (
            os.path.basename(entry["name"])
            for entry in self.hdfs.ls(hdfs_dir, detail=True)
            if entry["type"].lower() == "file"
        )
        usable = (
            fname
            for fname in file_names
            if not fname.startswith(("_", "."))
            and not fname.endswith((".tmp", ".copying"))
        )
        try:
            chosen = next(usable)
        except StopIteration:
            raise com.IbisError("No files found in the passed directory")
        like_file = pjoin(hdfs_dir, chosen)

    stmt = ddl.CreateTableParquet(
        name,
        hdfs_dir,
        schema=schema,
        database=database,
        example_file=like_file,
        example_table=like_table,
        external=external,
        can_exist=False,
    )
    self.raw_sql(stmt)
    return self._wrap_new_table(name, database, persist)