def create_table_from_csv(  # pylint: disable=too-many-arguments, too-many-locals
    cls,
    filename: str,
    table: Table,
    database: "Database",
    csv_to_df_kwargs: Dict[str, Any],
    df_to_sql_kwargs: Dict[str, Any],
) -> None:
    """Uploads a csv file and creates a superset datasource in Hive."""

    def convert_to_hive_type(col_type: str) -> str:
        """maps tableschema's types to hive types"""
        tableschema_to_hive_types = {
            "boolean": "BOOLEAN",
            "integer": "BIGINT",
            "number": "DOUBLE",
            "string": "STRING",
        }
        return tableschema_to_hive_types.get(col_type, "STRING")

    bucket_path = config["CSV_TO_HIVE_UPLOAD_S3_BUCKET"]

    if not bucket_path:
        logger.info("No upload bucket specified")
        raise Exception(
            "No upload bucket specified. You can specify one in the config file."
        )

    upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC"](
        database, g.user, table.schema
    )

    # Optional dependency
    from tableschema import (  # pylint: disable=import-error
        Table as TableSchemaTable,
    )

    # infer Hive column types from the CSV via tableschema
    hive_table_schema = TableSchemaTable(filename).infer()
    column_name_and_type = []
    for column_info in hive_table_schema["fields"]:
        column_name_and_type.append(
            "`{}` {}".format(
                column_info["name"], convert_to_hive_type(column_info["type"])
            )
        )
    schema_definition = ", ".join(column_name_and_type)

    # Optional dependency
    import boto3  # pylint: disable=import-error

    # upload the CSV to S3 and point the Hive table at that location
    s3 = boto3.client("s3")
    location = os.path.join("s3a://", bucket_path, upload_prefix, table.table)
    s3.upload_file(
        filename,
        bucket_path,
        os.path.join(upload_prefix, table.table, os.path.basename(filename)),
    )
    # TODO(bkyryliuk): support other delimiters
    sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
        STORED AS TEXTFILE LOCATION '{location}'
        tblproperties ('skip.header.line.count'='1')"""
    engine = cls.get_engine(database)
    engine.execute(sql)
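# For illustration: given a CSV with columns "name" (string) and "age" (integer)
# loaded into a table "ds.people", the statement built above would look roughly
# like the following (bucket and prefix values are hypothetical):
#
#   CREATE TABLE ds.people ( `name` STRING, `age` BIGINT )
#   ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
#   STORED AS TEXTFILE LOCATION 's3a://some-bucket/some-prefix/people'
#   tblproperties ('skip.header.line.count'='1')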
def create_table_from_csv(  # pylint: disable=too-many-arguments, too-many-locals
    cls,
    filename: str,
    table: Table,
    database: "Database",
    csv_to_df_kwargs: Dict[str, Any],
    df_to_sql_kwargs: Dict[str, Any],
) -> None:
    """Uploads a csv file and creates a superset datasource in Hive."""
    if_exists = df_to_sql_kwargs["if_exists"]
    if if_exists == "append":
        raise SupersetException("Append operation not currently supported")

    def convert_to_hive_type(col_type: str) -> str:
        """maps tableschema's types to hive types"""
        tableschema_to_hive_types = {
            "boolean": "BOOLEAN",
            "integer": "BIGINT",
            "number": "DOUBLE",
            "string": "STRING",
        }
        return tableschema_to_hive_types.get(col_type, "STRING")

    upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC"](
        database, g.user, table.schema
    )

    # Optional dependency
    from tableschema import (  # pylint: disable=import-error
        Table as TableSchemaTable,
    )

    # infer Hive column types from the CSV via tableschema
    hive_table_schema = TableSchemaTable(filename).infer()
    column_name_and_type = []
    for column_info in hive_table_schema["fields"]:
        column_name_and_type.append(
            "`{}` {}".format(
                column_info["name"], convert_to_hive_type(column_info["type"])
            )
        )
    schema_definition = ", ".join(column_name_and_type)

    # ensure table doesn't already exist
    if if_exists == "fail":
        if table.schema:
            table_exists = not database.get_df(
                f"SHOW TABLES IN {table.schema} LIKE '{table.table}'"
            ).empty
        else:
            table_exists = not database.get_df(
                f"SHOW TABLES LIKE '{table.table}'"
            ).empty
        if table_exists:
            raise SupersetException("Table already exists")

    engine = cls.get_engine(database)

    if if_exists == "replace":
        engine.execute(f"DROP TABLE IF EXISTS {str(table)}")

    # upload the CSV to S3 and build a parameterized CREATE TABLE statement
    location = upload_to_s3(filename, upload_prefix, table)
    sql, params = cls.get_create_table_stmt(
        table,
        schema_definition,
        location,
        csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
        int(csv_to_df_kwargs.get("header", 0)),
        csv_to_df_kwargs.get("na_values"),
    )
    engine = cls.get_engine(database)
    engine.execute(text(sql), **params)
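# The variant above delegates the S3 upload and SQL generation to two helpers,
# upload_to_s3 and cls.get_create_table_stmt, whose bodies are not part of this
# snippet. The sketches below are reconstructed from the inlined logic in the
# other variants of the method; treat names, parameter handling, and the table
# property formatting as assumptions rather than the canonical implementations.


def upload_to_s3(filename: str, upload_prefix: str, table: Table) -> str:
    """Uploads the CSV to the configured S3 bucket and returns its s3a:// location."""
    # Optional dependency
    import boto3  # pylint: disable=import-error

    bucket_path = config["CSV_TO_HIVE_UPLOAD_S3_BUCKET"]
    if not bucket_path:
        logger.info("No upload bucket specified")
        raise Exception(
            "No upload bucket specified. You can specify one in the config file."
        )

    s3 = boto3.client("s3")
    location = os.path.join("s3a://", bucket_path, upload_prefix, table.table)
    s3.upload_file(
        filename,
        bucket_path,
        os.path.join(upload_prefix, table.table, os.path.basename(filename)),
    )
    return location


def get_create_table_stmt(  # sketch of a classmethod; assumes Optional/List/Tuple from typing
    cls,
    table: Table,
    schema_definition: str,
    location: str,
    delim: str,
    header_line_count: Optional[int],
    null_values: Optional[List[str]],
) -> Tuple[str, Dict[str, Any]]:
    """Builds a parameterized CREATE TABLE statement and its bind parameters."""
    params: Dict[str, Any] = {"delim": delim, "location": location}
    tblproperties = []
    if header_line_count is not None and header_line_count >= 0:
        # skip the header row(s) of the uploaded CSV
        tblproperties.append("'skip.header.line.count'=:header_line_count")
        params["header_line_count"] = str(header_line_count + 1)
    if null_values:
        # Hive supports a single null format value
        tblproperties.append("'serialization.null.format'=:null_value")
        params["null_value"] = null_values[0]
    sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
        STORED AS TEXTFILE LOCATION :location"""
    if tblproperties:
        sql += f"\n        tblproperties ({', '.join(tblproperties)})"
    return sql, params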
def create_table_from_csv(  # pylint: disable=too-many-arguments, too-many-locals
    cls,
    filename: str,
    table: Table,
    database: "Database",
    csv_to_df_kwargs: Dict[str, Any],
    df_to_sql_kwargs: Dict[str, Any],
) -> None:
    """Uploads a csv file and creates a superset datasource in Hive."""
    if_exists = df_to_sql_kwargs["if_exists"]
    if if_exists == "append":
        raise SupersetException("Append operation not currently supported")

    def convert_to_hive_type(col_type: str) -> str:
        """maps tableschema's types to hive types"""
        tableschema_to_hive_types = {
            "boolean": "BOOLEAN",
            "integer": "BIGINT",
            "number": "DOUBLE",
            "string": "STRING",
        }
        return tableschema_to_hive_types.get(col_type, "STRING")

    bucket_path = config["CSV_TO_HIVE_UPLOAD_S3_BUCKET"]

    if not bucket_path:
        logger.info("No upload bucket specified")
        raise Exception(
            "No upload bucket specified. You can specify one in the config file."
        )

    upload_prefix = config["CSV_TO_HIVE_UPLOAD_DIRECTORY_FUNC"](
        database, g.user, table.schema
    )

    # Optional dependency
    from tableschema import (  # pylint: disable=import-error
        Table as TableSchemaTable,
    )

    # infer Hive column types from the CSV via tableschema
    hive_table_schema = TableSchemaTable(filename).infer()
    column_name_and_type = []
    for column_info in hive_table_schema["fields"]:
        column_name_and_type.append(
            "`{}` {}".format(
                column_info["name"], convert_to_hive_type(column_info["type"])
            )
        )
    schema_definition = ", ".join(column_name_and_type)

    # ensure table doesn't already exist
    if (
        if_exists == "fail"
        and not database.get_df(
            f"SHOW TABLES IN {table.schema} LIKE '{table.table}'"
        ).empty
    ):
        raise SupersetException("Table already exists")

    engine = cls.get_engine(database)

    if if_exists == "replace":
        engine.execute(f"DROP TABLE IF EXISTS {str(table)}")

    # Optional dependency
    import boto3  # pylint: disable=import-error

    # upload the CSV to S3 and point the Hive table at that location
    s3 = boto3.client("s3")
    location = os.path.join("s3a://", bucket_path, upload_prefix, table.table)
    s3.upload_file(
        filename,
        bucket_path,
        os.path.join(upload_prefix, table.table, os.path.basename(filename)),
    )
    # bind the delimiter and location as parameters rather than interpolating them
    sql = text(
        f"""CREATE TABLE {str(table)} ( {schema_definition} )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
        STORED AS TEXTFILE LOCATION :location
        tblproperties ('skip.header.line.count'='1')"""
    )
    engine = cls.get_engine(database)
    engine.execute(
        sql,
        delim=csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
        location=location,
    )