def to_sql(
    df: pd.DataFrame,
    con: pymysql.connections.Connection,
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_column_names: bool = False,
    chunksize: int = 200,
) -> None:
    """Write records stored in a DataFrame into MySQL.

    The DataFrame passed by the caller is never modified: when ``index`` is
    True the index is materialised as columns on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : pymysql.connections.Connection
        Use pymysql.connect() to use credentials directly or wr.mysql.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    mode : str
        Append, overwrite, upsert_duplicate_key, upsert_replace_into, upsert_distinct.
        The value is stripped and lower-cased before it is validated.
            append: Inserts new records into table
            overwrite: Drops table and recreates
            upsert_duplicate_key: Performs an upsert using `ON DUPLICATE KEY` clause. Requires table schema to have
            defined keys, otherwise duplicate records will be inserted.
            upsert_replace_into: Performs upsert using `REPLACE INTO` clause. Less efficient and still requires the
            table schema to have keys or else duplicate records will be inserted
            upsert_distinct: Inserts new records, including duplicates, then recreates the table and inserts `DISTINCT`
            records from old table. This is the least efficient approach but handles scenarios where there are no
            keys on table.
    index : bool
        True to store the DataFrame index as a column in the table,
        otherwise False to ignore it.
    dtype: Dict[str, str], optional
        Dictionary of columns names and MySQL types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'TEXT', 'col2 name': 'FLOAT'})
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_column_names: bool
        If set to True, will use the column names of the DataFrame for generating the INSERT SQL Query.
        E.g. If the DataFrame has two columns `col1` and `col3` and `use_column_names` is True, data will only be
        inserted into the database columns `col1` and `col3`. The names are wrapped in backticks in the
        generated statement.
    chunksize: int
        Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.

    Returns
    -------
    None
        None.

    Raises
    ------
    exceptions.EmptyDataFrame
        If ``df`` has no rows.
    exceptions.InvalidArgumentValue
        If ``mode`` is not one of the supported modes.

    Examples
    --------
    Writing to MySQL using a Glue Catalog Connections

    >>> import awswrangler as wr
    >>> con = wr.mysql.connect("MY_GLUE_CONNECTION")
    >>> wr.mysql.to_sql(
    ...     df=df,
    ...     table="my_table",
    ...     schema="test",
    ...     con=con
    ... )
    >>> con.close()

    """
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    mode = mode.strip().lower()
    modes = [
        "append",
        "overwrite",
        "upsert_replace_into",
        "upsert_duplicate_key",
        "upsert_distinct",
    ]
    if mode not in modes:
        raise exceptions.InvalidArgumentValue(f"mode must be one of {', '.join(modes)}")
    _validate_connection(con=con)
    try:
        with con.cursor() as cursor:
            _create_table(
                df=df,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                varchar_lengths=varchar_lengths,
            )
            if index:
                # Rebind to a new frame instead of resetting in place so the
                # caller's DataFrame keeps its index.
                df = df.reset_index(level=df.index.names)
            column_placeholders: str = ", ".join(["%s"] * len(df.columns))
            insertion_columns = ""
            upsert_str = ""
            if use_column_names:
                # Backtick-quote the names (same quoting as the upsert clause
                # below) so reserved words and names with spaces stay valid;
                # the f-string also copes with non-string column labels.
                quoted_columns = ", ".join(f"`{column}`" for column in df.columns)
                insertion_columns = f"({quoted_columns})"
            if mode == "upsert_duplicate_key":
                upsert_columns = ", ".join(f"`{column}`=VALUES(`{column}`)" for column in df.columns)
                upsert_str = f" ON DUPLICATE KEY UPDATE {upsert_columns}"
            placeholder_parameter_pair_generator = _db_utils.generate_placeholder_parameter_pairs(
                df=df, column_placeholders=column_placeholders, chunksize=chunksize
            )
            sql: str
            for placeholders, parameters in placeholder_parameter_pair_generator:
                if mode == "upsert_replace_into":
                    sql = f"REPLACE INTO `{schema}`.`{table}` {insertion_columns} VALUES {placeholders}"
                else:
                    sql = f"INSERT INTO `{schema}`.`{table}` {insertion_columns} VALUES {placeholders}{upsert_str}"
                _logger.debug("sql: %s", sql)
                cursor.executemany(sql, (parameters,))
            con.commit()
            if mode == "upsert_distinct":
                # De-duplicate by copying DISTINCT rows into a uniquely named
                # clone, then swapping the clone in under the original name.
                temp_table = f"{table}_{uuid.uuid4().hex}"
                cursor.execute(f"CREATE TABLE `{schema}`.`{temp_table}` LIKE `{schema}`.`{table}`")
                cursor.execute(f"INSERT INTO `{schema}`.`{temp_table}` SELECT DISTINCT * FROM `{schema}`.`{table}`")
                cursor.execute(f"DROP TABLE IF EXISTS `{schema}`.`{table}`")
                cursor.execute(f"ALTER TABLE `{schema}`.`{temp_table}` RENAME TO `{table}`")
                con.commit()
    except Exception as ex:
        # Undo any uncommitted work, log, and let the caller see the error.
        con.rollback()
        _logger.error(ex)
        raise
def to_sql(
    df: pd.DataFrame,
    con: "pyodbc.Connection",
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_column_names: bool = False,
    chunksize: int = 200,
) -> None:
    """Write records stored in a DataFrame into Microsoft SQL Server.

    The DataFrame passed by the caller is never modified: when ``index`` is
    True the index is materialised as columns on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : pyodbc.Connection
        Use pyodbc.connect() to use credentials directly or wr.sqlserver.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    mode : str
        Append or overwrite. The value is stripped and lower-cased before it is validated.
    index : bool
        True to store the DataFrame index as a column in the table,
        otherwise False to ignore it.
    dtype: Dict[str, str], optional
        Dictionary of columns names and Microsoft SQL Server types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'TEXT', 'col2 name': 'FLOAT'})
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_column_names: bool
        If set to True, will use the column names of the DataFrame for generating the INSERT SQL Query.
        E.g. If the DataFrame has two columns `col1` and `col3` and `use_column_names` is True, data will only be
        inserted into the database columns `col1` and `col3`.
    chunksize: int
        Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.

    Returns
    -------
    None
        None.

    Raises
    ------
    exceptions.EmptyDataFrame
        If ``df`` has no rows.
    exceptions.InvalidArgumentValue
        If ``mode`` is neither ``append`` nor ``overwrite``.

    Examples
    --------
    Writing to Microsoft SQL Server using a Glue Catalog Connections

    >>> import awswrangler as wr
    >>> con = wr.sqlserver.connect(connection="MY_GLUE_CONNECTION", odbc_driver_version=17)
    >>> wr.sqlserver.to_sql(
    ...     df=df,
    ...     table="table",
    ...     schema="dbo",
    ...     con=con
    ... )
    >>> con.close()

    """
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    # Normalise and validate up front so a mis-cased or misspelled mode
    # (e.g. "Overwrite") is not handed to _create_table unchecked.
    mode = mode.strip().lower()
    allowed_modes = ["append", "overwrite"]
    if mode not in allowed_modes:
        raise exceptions.InvalidArgumentValue(f"mode must be one of {', '.join(allowed_modes)}")
    _validate_connection(con=con)
    try:
        with con.cursor() as cursor:
            _create_table(
                df=df,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                varchar_lengths=varchar_lengths,
            )
            if index:
                # Rebind to a new frame instead of resetting in place so the
                # caller's DataFrame keeps its index.
                df = df.reset_index(level=df.index.names)
            # pyodbc uses "?" as the positional parameter marker.
            column_placeholders: str = ", ".join(["?"] * len(df.columns))
            table_identifier = _get_table_identifier(schema, table)
            insertion_columns = ""
            if use_column_names:
                insertion_columns = f"({', '.join(df.columns)})"
            placeholder_parameter_pair_generator = _db_utils.generate_placeholder_parameter_pairs(
                df=df, column_placeholders=column_placeholders, chunksize=chunksize
            )
            for placeholders, parameters in placeholder_parameter_pair_generator:
                sql: str = f"INSERT INTO {table_identifier} {insertion_columns} VALUES {placeholders}"
                _logger.debug("sql: %s", sql)
                cursor.executemany(sql, (parameters,))
            con.commit()
    except Exception as ex:
        # Undo any uncommitted work, log, and let the caller see the error.
        con.rollback()
        _logger.error(ex)
        raise
def to_sql(
    df: pd.DataFrame,
    con: pg8000.Connection,
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_column_names: bool = False,
    chunksize: int = 200,
    upsert_conflict_columns: Optional[List[str]] = None,
    insert_conflict_columns: Optional[List[str]] = None,
) -> None:
    """Write records stored in a DataFrame into PostgreSQL.

    The DataFrame passed by the caller is never modified: when ``index`` is
    True the index is materialised as columns on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : pg8000.Connection
        Use pg8000.connect() to use credentials directly or wr.postgresql.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    mode : str
        Append, overwrite or upsert. The value is stripped and lower-cased before it is validated.
            append: Inserts new records into table.
            overwrite: Drops table and recreates.
            upsert: Perform an upsert which checks for conflicts on columns given by `upsert_conflict_columns` and
            sets the new values on conflicts. Note that `upsert_conflict_columns` is required for this mode.
    index : bool
        True to store the DataFrame index as a column in the table,
        otherwise False to ignore it.
    dtype: Dict[str, str], optional
        Dictionary of columns names and PostgreSQL types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'TEXT', 'col2 name': 'FLOAT'})
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_column_names: bool
        If set to True, will use the column names of the DataFrame for generating the INSERT SQL Query.
        E.g. If the DataFrame has two columns `col1` and `col3` and `use_column_names` is True, data will only be
        inserted into the database columns `col1` and `col3`.
    chunksize: int
        Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.
    upsert_conflict_columns: List[str], optional
        This parameter is only supported if `mode` is set to `upsert`. In this case conflicts for the given columns
        are checked for evaluating the upsert.
    insert_conflict_columns: List[str], optional
        This parameter is only supported if `mode` is set to `append`. In this case conflicts for the given columns
        are checked for evaluating the insert 'ON CONFLICT DO NOTHING'.

    Returns
    -------
    None
        None.

    Raises
    ------
    exceptions.EmptyDataFrame
        If ``df`` has no rows.
    exceptions.InvalidArgumentValue
        If ``mode`` is ``upsert`` and ``upsert_conflict_columns`` is missing or empty.

    Examples
    --------
    Writing to PostgreSQL using a Glue Catalog Connections

    >>> import awswrangler as wr
    >>> con = wr.postgresql.connect("MY_GLUE_CONNECTION")
    >>> wr.postgresql.to_sql(
    ...     df=df,
    ...     table="my_table",
    ...     schema="public",
    ...     con=con
    ... )
    >>> con.close()

    """
    if df.empty is True:
        raise exceptions.EmptyDataFrame("DataFrame cannot be empty.")
    mode = mode.strip().lower()
    allowed_modes = ["append", "overwrite", "upsert"]
    _db_utils.validate_mode(mode=mode, allowed_modes=allowed_modes)
    if mode == "upsert" and not upsert_conflict_columns:
        raise exceptions.InvalidArgumentValue("<upsert_conflict_columns> needs to be set when using upsert mode.")
    _validate_connection(con=con)
    try:
        with con.cursor() as cursor:
            _create_table(
                df=df,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                varchar_lengths=varchar_lengths,
            )
            if index:
                # Rebind to a new frame instead of resetting in place so the
                # caller's DataFrame keeps its index.
                df = df.reset_index(level=df.index.names)
            column_placeholders: str = ", ".join(["%s"] * len(df.columns))
            insertion_columns = ""
            upsert_str = ""
            if use_column_names:
                insertion_columns = f"({', '.join(df.columns)})"
            if mode == "upsert":
                # Every DataFrame column is overwritten with the incoming
                # (EXCLUDED) value when the conflict target matches.
                upsert_columns = ", ".join(f"{column}=EXCLUDED.{column}" for column in df.columns)
                conflict_columns = ", ".join(upsert_conflict_columns)  # type: ignore
                upsert_str = f" ON CONFLICT ({conflict_columns}) DO UPDATE SET {upsert_columns}"
            if mode == "append" and insert_conflict_columns:
                # Plain append that skips rows colliding on the given columns.
                conflict_columns = ", ".join(insert_conflict_columns)  # type: ignore
                upsert_str = f" ON CONFLICT ({conflict_columns}) DO NOTHING"
            placeholder_parameter_pair_generator = _db_utils.generate_placeholder_parameter_pairs(
                df=df, column_placeholders=column_placeholders, chunksize=chunksize
            )
            for placeholders, parameters in placeholder_parameter_pair_generator:
                sql: str = f'INSERT INTO "{schema}"."{table}" {insertion_columns} VALUES {placeholders}{upsert_str}'
                _logger.debug("sql: %s", sql)
                cursor.executemany(sql, (parameters,))
            con.commit()
    except Exception as ex:
        # Undo any uncommitted work, log, and let the caller see the error.
        con.rollback()
        _logger.error(ex)
        raise