Example #1
    def create_table_from_csv(  # pylint: disable=too-many-arguments, too-many-locals
        cls,
        filename: str,
        table: Table,
        database: "Database",
        csv_to_df_kwargs: Dict[str, Any],
        df_to_sql_kwargs: Dict[str, Any],
    ) -> None:
        """Uploads a csv file and creates a superset datasource in Hive."""
        def convert_to_hive_type(col_type: str) -> str:
            """maps tableschema's types to hive types"""
            tableschema_to_hive_types = {
                "boolean": "BOOLEAN",
                "integer": "BIGINT",
                "number": "DOUBLE",
                "string": "STRING",
            }
            return tableschema_to_hive_types.get(col_type, "STRING")

        bucket_path = config["CSV_TO_HIVE_UPLOAD_S3_BUCKET"]

        if not bucket_path:
            logger.info("No upload bucket specified")
            raise Exception(
                "No upload bucket specified. You can specify one in the config file."
            )

        upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC"](
            database, g.user, table.schema)

        # Optional dependency
        from tableschema import (  # pylint: disable=import-error
            Table as TableSchemaTable, )

        hive_table_schema = TableSchemaTable(filename).infer()
        column_name_and_type = []
        for column_info in hive_table_schema["fields"]:
            column_name_and_type.append("`{}` {}".format(
                column_info["name"],
                convert_to_hive_type(column_info["type"])))
        schema_definition = ", ".join(column_name_and_type)

        # Optional dependency
        import boto3  # pylint: disable=import-error

        s3 = boto3.client("s3")
        location = os.path.join("s3a://", bucket_path, upload_prefix,
                                table.table)
        s3.upload_file(
            filename,
            bucket_path,
            os.path.join(upload_prefix, table.table,
                         os.path.basename(filename)),
        )
        # TODO(bkyryliuk): support other delimiters
        sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS
            TEXTFILE LOCATION '{location}'
            tblproperties ('skip.header.line.count'='1')"""
        engine = cls.get_engine(database)
        engine.execute(sql)
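A minimal, standalone sketch of the core step in this snippet: mapping tableschema's inferred column types onto Hive types and building the column list used in the DDL. The fields list below is hypothetical; in the code above it comes from TableSchemaTable(filename).infer()["fields"].

def convert_to_hive_type(col_type: str) -> str:
    # same mapping as above; unrecognised types fall back to STRING
    return {
        "boolean": "BOOLEAN",
        "integer": "BIGINT",
        "number": "DOUBLE",
        "string": "STRING",
    }.get(col_type, "STRING")

# hypothetical inferred fields standing in for TableSchemaTable(filename).infer()["fields"]
fields = [
    {"name": "id", "type": "integer"},
    {"name": "price", "type": "number"},
    {"name": "label", "type": "string"},
]

schema_definition = ", ".join(
    "`{}` {}".format(f["name"], convert_to_hive_type(f["type"])) for f in fields
)
print(schema_definition)  # `id` BIGINT, `price` DOUBLE, `label` STRING
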
Example #2
    def create_table_from_csv(  # pylint: disable=too-many-arguments, too-many-locals
        cls,
        filename: str,
        table: Table,
        database: "Database",
        csv_to_df_kwargs: Dict[str, Any],
        df_to_sql_kwargs: Dict[str, Any],
    ) -> None:
        """Uploads a csv file and creates a superset datasource in Hive."""
        if_exists = df_to_sql_kwargs["if_exists"]
        if if_exists == "append":
            raise SupersetException("Append operation not currently supported")

        def convert_to_hive_type(col_type: str) -> str:
            """maps tableschema's types to hive types"""
            tableschema_to_hive_types = {
                "boolean": "BOOLEAN",
                "integer": "BIGINT",
                "number": "DOUBLE",
                "string": "STRING",
            }
            return tableschema_to_hive_types.get(col_type, "STRING")

        upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC"](
            database, g.user, table.schema
        )

        # Optional dependency
        from tableschema import (  # pylint: disable=import-error
            Table as TableSchemaTable,
        )

        hive_table_schema = TableSchemaTable(filename).infer()
        column_name_and_type = []
        for column_info in hive_table_schema["fields"]:
            column_name_and_type.append(
                "`{}` {}".format(
                    column_info["name"], convert_to_hive_type(column_info["type"])
                )
            )
        schema_definition = ", ".join(column_name_and_type)

        # ensure table doesn't already exist
        if if_exists == "fail":
            if table.schema:
                table_exists = not database.get_df(
                    f"SHOW TABLES IN {table.schema} LIKE '{table.table}'"
                ).empty
            else:
                table_exists = not database.get_df(
                    f"SHOW TABLES LIKE '{table.table}'"
                ).empty
            if table_exists:
                raise SupersetException("Table already exists")

        engine = cls.get_engine(database)

        if if_exists == "replace":
            engine.execute(f"DROP TABLE IF EXISTS {str(table)}")
        location = upload_to_s3(filename, upload_prefix, table)
        sql, params = cls.get_create_table_stmt(
            table,
            schema_definition,
            location,
            csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
            int(csv_to_df_kwargs.get("header", 0)),
            csv_to_df_kwargs.get("na_values"),
        )
        engine = cls.get_engine(database)
        engine.execute(text(sql), **params)
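A small sketch, with hypothetical kwargs and table names, of the two details this variant changes relative to Example #1: the user-supplied separator is unescaped with unicode_escape before use, and the delimiter and location are bound as parameters of a sqlalchemy text() clause instead of being interpolated into an f-string.

from sqlalchemy import text

csv_to_df_kwargs = {"sep": "\\t"}  # hypothetical: the two characters backslash and "t"

# normalise the escape sequence into a real tab character
delim = csv_to_df_kwargs["sep"].encode().decode("unicode_escape")
assert delim == "\t"

# delimiter and location are bound parameters, not string-interpolated
sql = text(
    "CREATE TABLE staging.sales ( `id` BIGINT ) "
    "ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim "
    "STORED AS TEXTFILE LOCATION :location"
)
# engine.execute(sql, delim=delim, location="s3a://bucket/prefix/sales")
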
Example #3
    def create_table_from_csv(  # pylint: disable=too-many-arguments, too-many-locals
        cls,
        filename: str,
        table: Table,
        database: "Database",
        csv_to_df_kwargs: Dict[str, Any],
        df_to_sql_kwargs: Dict[str, Any],
    ) -> None:
        """Uploads a csv file and creates a superset datasource in Hive."""

        if_exists = df_to_sql_kwargs["if_exists"]
        if if_exists == "append":
            raise SupersetException("Append operation not currently supported")

        def convert_to_hive_type(col_type: str) -> str:
            """maps tableschema's types to hive types"""
            tableschema_to_hive_types = {
                "boolean": "BOOLEAN",
                "integer": "BIGINT",
                "number": "DOUBLE",
                "string": "STRING",
            }
            return tableschema_to_hive_types.get(col_type, "STRING")

        bucket_path = config["CSV_TO_HIVE_UPLOAD_S3_BUCKET"]

        if not bucket_path:
            logger.info("No upload bucket specified")
            raise Exception(
                "No upload bucket specified. You can specify one in the config file."
            )

        upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC"](
            database, g.user, table.schema)

        # Optional dependency
        from tableschema import (  # pylint: disable=import-error
            Table as TableSchemaTable, )

        hive_table_schema = TableSchemaTable(filename).infer()
        column_name_and_type = []
        for column_info in hive_table_schema["fields"]:
            column_name_and_type.append("`{}` {}".format(
                column_info["name"],
                convert_to_hive_type(column_info["type"])))
        schema_definition = ", ".join(column_name_and_type)

        # ensure table doesn't already exist
        if (if_exists == "fail" and not database.get_df(
                f"SHOW TABLES IN {table.schema} LIKE '{table.table}'").empty):
            raise SupersetException("Table already exists")

        engine = cls.get_engine(database)

        if if_exists == "replace":
            engine.execute(f"DROP TABLE IF EXISTS {str(table)}")

        # Optional dependency
        import boto3  # pylint: disable=import-error

        s3 = boto3.client("s3")
        location = os.path.join("s3a://", bucket_path, upload_prefix,
                                table.table)
        s3.upload_file(
            filename,
            bucket_path,
            os.path.join(upload_prefix, table.table,
                         os.path.basename(filename)),
        )
        sql = text(f"""CREATE TABLE {str(table)} ( {schema_definition} )
            ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
            STORED AS TEXTFILE LOCATION :location
            tblproperties ('skip.header.line.count'='1')""")
        engine = cls.get_engine(database)
        engine.execute(
            sql,
            delim=csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
            location=location,
        )
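
To make the path handling concrete, here is a sketch with hypothetical bucket, prefix, and table values showing how this snippet derives both the S3 object key it uploads the CSV to and the s3a:// LOCATION the table is created over; both share the same prefix/table path, so Hive reads the uploaded file directly.

import os

bucket_path = "my-upload-bucket"   # hypothetical CSV_TO_HIVE_UPLOAD_S3_BUCKET
upload_prefix = "alice/staging"    # hypothetical CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC result
table_name = "sales"
filename = "/tmp/sales.csv"

# key the file is uploaded to inside the bucket
object_key = os.path.join(upload_prefix, table_name, os.path.basename(filename))
# external location the Hive table points at
location = os.path.join("s3a://", bucket_path, upload_prefix, table_name)

print(object_key)  # alice/staging/sales/sales.csv
print(location)    # s3a://my-upload-bucket/alice/staging/sales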