Example #1
0
def test_create_table_parquet_with_schema():
    """Compiling CreateTableParquet with an explicit schema emits the
    column list (with ibis types mapped to Impala types) in the DDL."""
    directory = '/path/to/'

    schema = ibis.schema([('foo', 'string'), ('bar', 'int8'),
                          ('baz', 'int16')])

    stmt = ddl.CreateTableParquet(
        'new_table',
        directory,
        schema=schema,
        external=True,
        can_exist=True,
        database='foo',
    )

    # int8 -> tinyint, int16 -> smallint in Impala DDL
    expected = '\n'.join([
        'CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`',
        '(`foo` string,',
        ' `bar` tinyint,',
        ' `baz` smallint)',
        'STORED AS PARQUET',
        "LOCATION '{0}'".format(directory),
    ])

    assert stmt.compile() == expected
Example #2
0
def test_create_table_like_parquet():
    """Passing example_file produces a LIKE PARQUET '<file>' clause that
    infers the schema from an existing parquet file."""
    directory = '/path/to/'
    example = '/path/to/parquetfile'

    stmt = ddl.CreateTableParquet(
        'new_table',
        directory,
        example_file=example,
        can_exist=True,
        database='foo',
    )

    expected = '\n'.join([
        'CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`',
        "LIKE PARQUET '{0}'".format(example),
        'STORED AS PARQUET',
        "LOCATION '{0}'".format(directory),
    ])

    assert stmt.compile() == expected
Example #3
0
def test_create_table_parquet_like_other():
    """Passing example_table produces a plain LIKE <table> clause — the
    alternative to "LIKE PARQUET" that copies another table's schema."""
    directory = '/path/to/'
    template = 'db.other'

    stmt = ddl.CreateTableParquet(
        'new_table',
        directory,
        example_table=template,
        can_exist=True,
        database='foo',
    )

    expected = '\n'.join([
        'CREATE EXTERNAL TABLE IF NOT EXISTS foo.`new_table`',
        'LIKE {0}'.format(template),
        'STORED AS PARQUET',
        "LOCATION '{0}'".format(directory),
    ])

    assert stmt.compile() == expected
Example #4
0
    def parquet_file(
        self,
        hdfs_dir,
        schema=None,
        name=None,
        database=None,
        external=True,
        like_file=None,
        like_table=None,
        persist=False,
    ):
        """Make indicated parquet file in HDFS available as an Ibis table.

        The table created can be optionally named and persisted, otherwise a
        unique name will be generated. Temporarily, for any non-persistent
        external table created by Ibis we will attempt to drop it when the
        underlying object is garbage collected (or the Python interpreter shuts
        down normally).

        Parameters
        ----------
        hdfs_dir
            Path in HDFS
        schema
            If no schema provided, and neither of the like_* argument is
            passed, one will be inferred from one of the parquet files in the
            directory.
        like_file
            Absolute path to Parquet file in HDFS to use for schema
            definitions. An alternative to having to supply an explicit schema
        like_table
            Fully scoped and escaped string to an Impala table whose schema we
            will use for the newly created table.
        name
            Random unique name generated otherwise
        database
            Database to create the (possibly temporary) table in
        external
            If a table is external, the referenced data will not be deleted
            when the table is dropped in Impala. Otherwise (external=False)
            Impala takes ownership of the Parquet file.
        persist
            Do not drop the table during garbage collection

        Returns
        -------
        ImpalaTable
            Impala table expression
        """
        name, database = self._get_concrete_table_path(
            name, database, persist=persist
        )

        # Without a schema or a like_* hint we must point Impala at a real
        # data file inside hdfs_dir so it can infer the schema itself.
        if like_file is None and like_table is None and schema is None:

            def _usable(fn):
                # Skip hidden/metadata files and in-flight transfers.
                return not fn.startswith(("_", ".")) and not fn.endswith(
                    (".tmp", ".copying")
                )

            file_names = (
                os.path.basename(entry["name"])
                for entry in self.hdfs.ls(hdfs_dir, detail=True)
                if entry["type"].lower() == "file"
            )
            try:
                file_name = next(fn for fn in file_names if _usable(fn))
            except StopIteration:
                raise com.IbisError("No files found in the passed directory")
            else:
                like_file = pjoin(hdfs_dir, file_name)

        stmt = ddl.CreateTableParquet(
            name,
            hdfs_dir,
            schema=schema,
            database=database,
            example_file=like_file,
            example_table=like_table,
            external=external,
            can_exist=False,
        )
        self.raw_sql(stmt)
        return self._wrap_new_table(name, database, persist)