Beispiel #1
0
def to_sql(
    df: pd.DataFrame,
    con: pymysql.connections.Connection,
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_column_names: bool = False,
    chunksize: int = 200,
) -> None:
    """Write records stored in a DataFrame into MySQL.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : pymysql.connections.Connection
        Use pymysql.connect() to use credentials directly or wr.mysql.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    mode : str
        Append, overwrite, upsert_duplicate_key, upsert_replace_into, upsert_distinct.
            append: Inserts new records into table
            overwrite: Drops table and recreates
            upsert_duplicate_key: Performs an upsert using `ON DUPLICATE KEY` clause. Requires table schema to have
            defined keys, otherwise duplicate records will be inserted.
            upsert_replace_into: Performs upsert using `REPLACE INTO` clause. Less efficient and still requires the
            table schema to have keys or else duplicate records will be inserted
            upsert_distinct: Inserts new records, including duplicates, then recreates the table and inserts `DISTINCT`
            records from old table. This is the least efficient approach but handles scenarios where there are no
            keys on table.

    index : bool
        True to store the DataFrame index as a column in the table,
        otherwise False to ignore it.
    dtype: Dict[str, str], optional
        Dictionary of columns names and MySQL types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'TEXT', 'col2 name': 'FLOAT'})
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_column_names: bool
        If set to True, will use the column names of the DataFrame for generating the INSERT SQL Query.
        E.g. If the DataFrame has two columns `col1` and `col3` and `use_column_names` is True, data will only be
        inserted into the database columns `col1` and `col3`.
    chunksize: int
        Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing to MySQL using a Glue Catalog Connections

    >>> import awswrangler as wr
    >>> con = wr.mysql.connect("MY_GLUE_CONNECTION")
    >>> wr.mysql.to_sql(
    ...     df=df,
    ...     table="my_table",
    ...     schema="test",
    ...     con=con
    ... )
    >>> con.close()

    """
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    mode = mode.strip().lower()
    modes = [
        "append",
        "overwrite",
        "upsert_replace_into",
        "upsert_duplicate_key",
        "upsert_distinct",
    ]
    if mode not in modes:
        raise exceptions.InvalidArgumentValue(
            f"mode must be one of {', '.join(modes)}")

    _validate_connection(con=con)
    try:
        with con.cursor() as cursor:
            _create_table(
                df=df,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                varchar_lengths=varchar_lengths,
            )
            if index:
                df.reset_index(level=df.index.names, inplace=True)
            column_placeholders: str = ", ".join(["%s"] * len(df.columns))
            insertion_columns = ""
            upsert_columns = ""
            upsert_str = ""
            if use_column_names:
                insertion_columns = f"({', '.join(df.columns)})"
            if mode == "upsert_duplicate_key":
                upsert_columns = ", ".join(
                    df.columns.map(
                        lambda column: f"`{column}`=VALUES(`{column}`)"))
                upsert_str = f" ON DUPLICATE KEY UPDATE {upsert_columns}"
            placeholder_parameter_pair_generator = _db_utils.generate_placeholder_parameter_pairs(
                df=df,
                column_placeholders=column_placeholders,
                chunksize=chunksize)
            sql: str
            for placeholders, parameters in placeholder_parameter_pair_generator:
                if mode == "upsert_replace_into":
                    sql = f"REPLACE INTO `{schema}`.`{table}` {insertion_columns} VALUES {placeholders}"
                else:
                    sql = f"INSERT INTO `{schema}`.`{table}` {insertion_columns} VALUES {placeholders}{upsert_str}"
                _logger.debug("sql: %s", sql)
                cursor.executemany(sql, (parameters, ))
            con.commit()
            if mode == "upsert_distinct":
                temp_table = f"{table}_{uuid.uuid4().hex}"
                cursor.execute(
                    f"CREATE TABLE `{schema}`.`{temp_table}` LIKE `{schema}`.`{table}`"
                )
                cursor.execute(
                    f"INSERT INTO `{schema}`.`{temp_table}` SELECT DISTINCT * FROM `{schema}`.`{table}`"
                )
                cursor.execute(f"DROP TABLE IF EXISTS `{schema}`.`{table}`")
                cursor.execute(
                    f"ALTER TABLE `{schema}`.`{temp_table}` RENAME TO `{table}`"
                )
                con.commit()

    except Exception as ex:
        con.rollback()
        _logger.error(ex)
        raise
Beispiel #2
0
def to_sql(
    df: pd.DataFrame,
    con: "pyodbc.Connection",
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_column_names: bool = False,
    chunksize: int = 200,
) -> None:
    """Write records stored in a DataFrame into Microsoft SQL Server.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : pyodbc.Connection
        Use pyodbc.connect() to use credentials directly or wr.sqlserver.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    mode : str
        Append or overwrite.
    index : bool
        True to store the DataFrame index as a column in the table,
        otherwise False to ignore it.
    dtype: Dict[str, str], optional
        Dictionary of columns names and Microsoft SQL Server types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'TEXT', 'col2 name': 'FLOAT'})
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_column_names: bool
        If set to True, will use the column names of the DataFrame for generating the INSERT SQL Query.
        E.g. If the DataFrame has two columns `col1` and `col3` and `use_column_names` is True, data will only be
        inserted into the database columns `col1` and `col3`.
    chunksize: int
        Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing to Microsoft SQL Server using a Glue Catalog Connections

    >>> import awswrangler as wr
    >>> con = wr.sqlserver.connect(connection="MY_GLUE_CONNECTION", odbc_driver_version=17)
    >>> wr.sqlserver.to_sql(
    ...     df=df,
    ...     table="table",
    ...     schema="dbo",
    ...     con=con
    ... )
    >>> con.close()

    """
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    _validate_connection(con=con)
    try:
        with con.cursor() as cursor:
            _create_table(
                df=df,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                varchar_lengths=varchar_lengths,
            )
            if index:
                df.reset_index(level=df.index.names, inplace=True)
            column_placeholders: str = ", ".join(["?"] * len(df.columns))
            table_identifier = _get_table_identifier(schema, table)
            insertion_columns = ""
            if use_column_names:
                insertion_columns = f"({', '.join(df.columns)})"
            placeholder_parameter_pair_generator = _db_utils.generate_placeholder_parameter_pairs(
                df=df,
                column_placeholders=column_placeholders,
                chunksize=chunksize)
            for placeholders, parameters in placeholder_parameter_pair_generator:
                sql: str = f"INSERT INTO {table_identifier} {insertion_columns} VALUES {placeholders}"
                _logger.debug("sql: %s", sql)
                cursor.executemany(sql, (parameters, ))
            con.commit()
    except Exception as ex:
        con.rollback()
        _logger.error(ex)
        raise
def to_sql(
    df: pd.DataFrame,
    con: pg8000.Connection,
    table: str,
    schema: str,
    mode: str = "append",
    index: bool = False,
    dtype: Optional[Dict[str, str]] = None,
    varchar_lengths: Optional[Dict[str, int]] = None,
    use_column_names: bool = False,
    chunksize: int = 200,
    upsert_conflict_columns: Optional[List[str]] = None,
    insert_conflict_columns: Optional[List[str]] = None,
) -> None:
    """Write records stored in a DataFrame into PostgreSQL.

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    con : pg8000.Connection
        Use pg8000.connect() to use credentials directly or wr.postgresql.connect() to fetch it from the Glue Catalog.
    table : str
        Table name
    schema : str
        Schema name
    mode : str
        Append, overwrite or upsert.
            append: Inserts new records into table.
            overwrite: Drops table and recreates.
            upsert: Perform an upsert which checks for conflicts on columns given by `upsert_conflict_columns` and
            sets the new values on conflicts. Note that `upsert_conflict_columns` is required for this mode.
    index : bool
        True to store the DataFrame index as a column in the table,
        otherwise False to ignore it.
    dtype: Dict[str, str], optional
        Dictionary of columns names and PostgreSQL types to be casted.
        Useful when you have columns with undetermined or mixed data types.
        (e.g. {'col name': 'TEXT', 'col2 name': 'FLOAT'})
    varchar_lengths : Dict[str, int], optional
        Dict of VARCHAR length by columns. (e.g. {"col1": 10, "col5": 200}).
    use_column_names: bool
        If set to True, will use the column names of the DataFrame for generating the INSERT SQL Query.
        E.g. If the DataFrame has two columns `col1` and `col3` and `use_column_names` is True, data will only be
        inserted into the database columns `col1` and `col3`.
    chunksize: int
        Number of rows which are inserted with each SQL query. Defaults to inserting 200 rows per query.
    upsert_conflict_columns: List[str], optional
        This parameter is only supported if `mode` is set top `upsert`. In this case conflicts for the given columns are
        checked for evaluating the upsert.
    insert_conflict_columns: List[str], optional
        This parameter is only supported if `mode` is set top `append`. In this case conflicts for the given columns are
        checked for evaluating the insert 'ON CONFLICT DO NOTHING'.

    Returns
    -------
    None
        None.

    Examples
    --------
    Writing to PostgreSQL using a Glue Catalog Connections

    >>> import awswrangler as wr
    >>> con = wr.postgresql.connect("MY_GLUE_CONNECTION")
    >>> wr.postgresql.to_sql(
    ...     df=df,
    ...     table="my_table",
    ...     schema="public",
    ...     con=con
    ... )
    >>> con.close()

    """
    if df.empty is True:
        raise exceptions.EmptyDataFrame("DataFrame cannot be empty.")

    mode = mode.strip().lower()
    allowed_modes = ["append", "overwrite", "upsert"]
    _db_utils.validate_mode(mode=mode, allowed_modes=allowed_modes)
    if mode == "upsert" and not upsert_conflict_columns:
        raise exceptions.InvalidArgumentValue(
            "<upsert_conflict_columns> needs to be set when using upsert mode."
        )
    _validate_connection(con=con)
    try:
        with con.cursor() as cursor:
            _create_table(
                df=df,
                cursor=cursor,
                table=table,
                schema=schema,
                mode=mode,
                index=index,
                dtype=dtype,
                varchar_lengths=varchar_lengths,
            )
            if index:
                df.reset_index(level=df.index.names, inplace=True)
            column_placeholders: str = ", ".join(["%s"] * len(df.columns))
            insertion_columns = ""
            upsert_str = ""
            if use_column_names:
                insertion_columns = f"({', '.join(df.columns)})"
            if mode == "upsert":
                upsert_columns = ", ".join(
                    df.columns.map(
                        lambda column: f"{column}=EXCLUDED.{column}"))
                conflict_columns = ", ".join(
                    upsert_conflict_columns)  # type: ignore
                upsert_str = f" ON CONFLICT ({conflict_columns}) DO UPDATE SET {upsert_columns}"
            if mode == "append" and insert_conflict_columns:
                conflict_columns = ", ".join(
                    insert_conflict_columns)  # type: ignore
                upsert_str = f" ON CONFLICT ({conflict_columns}) DO NOTHING"
            placeholder_parameter_pair_generator = _db_utils.generate_placeholder_parameter_pairs(
                df=df,
                column_placeholders=column_placeholders,
                chunksize=chunksize)
            for placeholders, parameters in placeholder_parameter_pair_generator:
                sql: str = f'INSERT INTO "{schema}"."{table}" {insertion_columns} VALUES {placeholders}{upsert_str}'
                _logger.debug("sql: %s", sql)
                cursor.executemany(sql, (parameters, ))
            con.commit()
    except Exception as ex:
        con.rollback()
        _logger.error(ex)
        raise