Example #1
def add_columns(table_name: str, dataframe: pd.DataFrame, columns: List[str],
                modifier: modify) -> pd.DataFrame:
    """Add columns if they do not exist.

    Parameters
    ----------
    table_name (str) : name of the table for which the failed write attempt occurred
    dataframe (pandas.DataFrame) : data to insert
    columns (list) : columns to add
    modifier (mssql_dataframe.core.modify) : class to modify SQL columns

    Returns
    -------
    dataframe (pandas.DataFrame) : data to insert that may have been adjusted to conform to SQL data types
    """
    # infer the data types for new columns
    new, schema, _, _ = infer.sql(dataframe.loc[:, columns])
    # determine the SQL data type for each column
    _, dtypes = conversion.sql_spec(schema, new)
    # add each column
    for col, spec in dtypes.items():
        msg = (
            f"Creating column '{col}' in table '{table_name}' with data type '{spec}'."
        )
        logger.warning(msg)
        modifier.column(table_name,
                        modify="add",
                        column_name=col,
                        data_type=spec,
                        is_nullable=True)
    # add potentially adjusted columns back into dataframe
    dataframe[new.columns] = new

    return dataframe
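A minimal usage sketch for add_columns follows. It assumes a pyodbc connection and that the modify class named in the docstring (mssql_dataframe.core.modify) is constructed from that connection; the connection string and the constructor call are illustrative assumptions, not the library's documented API.

import pandas as pd
import pyodbc
from mssql_dataframe.core import modify  # path taken from the docstring above

# assumed connection details; adjust driver/server/database as needed
connection = pyodbc.connect(
    "Driver={ODBC Driver 17 for SQL Server};"
    "Server=localhost;Database=tempdb;Trusted_Connection=yes;"
)
modifier = modify(connection)  # assumed constructor signature

df = pd.DataFrame({"existing": [1, 2], "new_column": ["a", "b"]})
# 'new_column' is missing from the SQL table, so add it with an inferred type
df = add_columns("##SampleTable", df, columns=["new_column"], modifier=modifier)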
Example #2
def test_default(data):

    # setup test data
    dataframe = data.copy()
    dataframe["_nvarchar_default1"] = None
    dataframe["_nvarchar_default2"] = np.nan

    # infer SQL properties
    dataframe, schema, not_nullable, pk = infer.sql(dataframe)
    _check_schema(schema)
    _check_dataframe(dataframe, schema)
    assert len(not_nullable) == 0
    assert pk is None
Example #3
def test_pk(data):

    # setup test data
    dataframe = data[data.notna().all(axis="columns")].copy()
    dataframe["_tinyint_smaller"] = pd.Series(range(0, len(dataframe)),
                                              dtype="UInt8")
    dataframe["_varchar_smaller"] = dataframe["_varchar"].str.slice(0, 1)

    # infer SQL properties
    df = dataframe
    df, schema, not_nullable, pk = infer.sql(df)
    _check_schema(schema)
    _check_dataframe(df, schema)
    assert df.columns.isin(not_nullable).all()
    assert pk == "_tinyint_smaller"

    # infer SQL properties without numeric
    df = dataframe.select_dtypes(["datetime", "string"])
    df, schema, not_nullable, pk = infer.sql(df)
    _check_schema(schema)
    _check_dataframe(df, schema)
    assert df.columns.isin(not_nullable).all()
    assert pk == "_varchar_smaller"
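The assertions imply a heuristic: among columns that are unique and contain no nulls, infer.sql prefers the one with the smallest storage footprint ('_tinyint_smaller' when numerics are present, otherwise '_varchar_smaller'). Below is a rough pure-pandas sketch of that selection idea as read from the test; it is not the library's actual implementation.

from typing import Optional

import pandas as pd

def pick_pk_candidate(df: pd.DataFrame) -> Optional[str]:
    # candidates must be unique and fully non-null, like a primary key
    candidates = [
        col for col in df.columns
        if df[col].notna().all() and df[col].is_unique
    ]
    if not candidates:
        return None
    # per-column memory usage as a crude stand-in for SQL storage size
    return min(candidates, key=lambda c: df[c].memory_usage(deep=True, index=False))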
Example #4
def test_dtypes(data):

    # setup test data
    dataframe = data.copy()
    na = dataframe.isna()
    dataframe = dataframe.astype("str")
    dataframe[na] = None
    dataframe["_time"] = dataframe["_time"].str.replace("0 days ", "")

    # infer SQL properties
    dataframe, schema, not_nullable, pk = infer.sql(dataframe)

    # assert inferred results
    _check_schema(schema)
    _check_dataframe(dataframe, schema)
    assert len(not_nullable) == 0
    assert pk is None
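This test round-trips the data through strings and relies on infer.sql to recover the types. For simple cases plain pandas can approximate the same recovery; the sketch below shows the general idea of upgrading an all-string frame back to typed columns and is not what infer.sql does internally.

import pandas as pd

raw = pd.DataFrame({"number": ["1", "2", None], "when": ["2021-01-01", "2021-06-15", None]})
recovered = raw.copy()
recovered["number"] = pd.to_numeric(recovered["number"])  # floats with NaN for None
recovered["when"] = pd.to_datetime(recovered["when"])     # datetime64 with NaT for None
recovered = recovered.convert_dtypes()                    # upgrade to nullable types where lossless
print(recovered.dtypes)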
Example #5
def alter_columns(table_name: str, dataframe: pd.DataFrame, columns: List[str],
                  modifier: modify) -> pd.DataFrame:
    """Alter columns if their size needs to be increased.

    Parameters
    ----------
    table_name (str) : name of the table for which the failed write attempt occurred
    dataframe (pandas.DataFrame) : data to insert
    columns (list) : columns to alter
    modifier (mssql_dataframe.core.modify) : class to modify SQL columns

    Returns
    -------
    dataframe (pandas.DataFrame) : data to insert that may have been adjusted to conform to SQL data types
    """
    # temporarily set named index (primary key) as columns
    index = dataframe.index.names
    if any(index):
        dataframe = dataframe.reset_index()
    # infer the data types for the columns of insufficient size
    new, schema, _, _ = infer.sql(dataframe.loc[:, columns])
    schema, dtypes = conversion.sql_spec(schema, new)
    # get current table schema
    previous, _ = conversion.get_schema(modifier._connection, table_name)
    strings = previous["sql_type"].isin(["varchar", "nvarchar"])
    previous.loc[strings, "odbc_size"] = previous.loc[strings, "column_size"]
    # ensure inferring dtypes resulted in an actual type or size change
    unchanged = (previous.loc[schema.index,
                              ["sql_type", "odbc_size"]] == schema[[
                                  "sql_type", "odbc_size"
                              ]])
    unchanged = unchanged.all(axis="columns")
    if any(unchanged):
        unchanged = list(unchanged[unchanged].index)
        raise custom_errors.SQLRecastColumnUnchanged(
            f"Handling SQLInsufficientColumnSize did not result in type or size change for columns: {unchanged}"
        )
    # ensure the change does not result in a different SQL data type category
    changed = previous.loc[schema.index,
                           ["sql_category"]] != schema[["sql_category"]]
    if any(changed["sql_category"]):
        changed = list(changed[changed["sql_category"]].index)
        raise custom_errors.DataframeColumnInvalidValue(
            "Dataframe columns cannot be converted based on their SQL data type",
            changed,
        )
    # drop primary key constraint prior to altering columns, if needed
    primary_key_columns = (previous.loc[previous["pk_seq"].notna(),
                                        "pk_seq"].sort_values(
                                            ascending=True).index)
    if len(primary_key_columns) == 0:
        primary_key_name = None
    else:
        primary_key_name = previous.loc[primary_key_columns[0], "pk_name"]
        modifier.primary_key(
            table_name,
            modify="drop",
            columns=primary_key_columns,
            primary_key_name=primary_key_name,
        )
    # alter each column
    for col, spec in dtypes.items():
        is_nullable = previous.at[col, "is_nullable"]
        msg = f"Altering column '{col}' in table '{table_name}' to data type '{spec}' with 'is_nullable={is_nullable}'."
        logger.warning(msg)
        modifier.column(
            table_name,
            modify="alter",
            column_name=col,
            data_type=spec,
            is_nullable=is_nullable,
        )
    # re-add the primary key if needed
    if primary_key_name:
        modifier.primary_key(
            table_name,
            modify="add",
            columns=list(primary_key_columns),
            primary_key_name=primary_key_name,
        )
    # reset primary key columns as dataframe's index
    if any(index):
        dataframe = dataframe.set_index(keys=index)

    return dataframe
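A hedged usage sketch for alter_columns, reusing the modifier from the add_columns sketch above. In the library this function is reached while handling a failed write (the SQLInsufficientColumnSize case named in the error message), so the explicit column list here stands in for what that handler would supply.

# hypothetical recovery flow; '_varchar' stands in for the columns reported
# by the failed write attempt
df = pd.DataFrame({"_varchar": ["a value longer than the current column size"]})
df = alter_columns("##SampleTable", df, columns=["_varchar"], modifier=modifier)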
Example #6
    def table_from_dataframe(
        self,
        table_name: str,
        dataframe: pd.DataFrame,
        primary_key: Literal[None, "sql", "index", "infer"] = None,
        insert_dataframe: bool = True,
    ) -> pd.DataFrame:
        """Create SQL table by inferring SQL create table parameters from the contents of a dataframe.

        The contents can be composed of strings/objects only and converted better data types if conversion is possible within pandas.

        Parameters
        ----------
        table_name (str) : name of table to create, may also contain schema name in the form schema_name.table_name
        dataframe (pandas.DataFrame) : data used to create table
        primary_key (str, default = None) : method of setting the table's primary key, see below for a description of the options
        insert_dataframe (bool, default=True) : insert the dataframe after creating the table

        primary_key = None : do not set a primary key
        primary_key = 'sql' : create an SQL managed auto-incrementing INT identity primary key column named '_pk'
        primary_key = 'index' : use the index of the dataframe and its name, or '_index' if the index is not named
        primary_key = 'infer' : determine the column in the dataframe that best serves as a primary key and use its name

        Returns
        -------
        dataframe (pandas.DataFrame) : data potentially converted from objects/strings to better pandas types

        Examples
        --------
        Table without a primary key.
        >>> df = create.table_from_dataframe('##ExampleCreateDFNoPKTable', pd.DataFrame({"ColumnA": [1]}))

        Table with the dataframe's index as the primary key.
        >>> df = create.table_from_dataframe('##ExampleCreateDFIndexPKTable', pd.DataFrame({"ColumnA": [1,2]}, index=['a','z']), primary_key='index')

        Table with SQL identity primary key.
        >>> df = create.table_from_dataframe('##ExampleCreateDFIdentityPKTable', pd.DataFrame({"ColumnA": [1,2]}), primary_key='sql')

        Table using ColumnA as the primary key, after it was inferred to be the primary key.
        >>> df = create.table_from_dataframe('##ExampleCreateDFInferPKTable', pd.DataFrame({"ColumnA": [1,2], "ColumnB": ["a","b"]}), primary_key='infer')
        """
        # determine primary key
        if primary_key is None:
            sql_primary_key = False
            primary_key_column = None
        elif primary_key == "sql":
            sql_primary_key = True
            primary_key_column = None
        elif primary_key == "index":
            sql_primary_key = False
            if not any(dataframe.index.names):
                dataframe.index.name = "_index"
            primary_key_column = list(dataframe.index.names)
            dataframe = dataframe.reset_index()
        elif primary_key == "infer":
            sql_primary_key = False
            primary_key_column = None
        else:
            options = [None, "sql", "index", "infer"]
            raise ValueError("primary_key must be one of: " + str(options))

        # infer SQL specifications from contents of dataframe
        dataframe, schema, not_nullable, pk = infer.sql(dataframe)
        _, dtypes = conversion.sql_spec(schema, dataframe)

        # infer primary key column after best fit data types have been determined
        if primary_key == "infer":
            primary_key_column = pk

        # add _time_insert column
        if self.include_metadata_timestamps:
            dtypes["_time_insert"] = "DATETIME2"

        # create final SQL table
        self.table(
            table_name,
            dtypes,
            not_nullable=not_nullable,
            primary_key_column=primary_key_column,
            sql_primary_key=sql_primary_key,
        )

        # issue message for derived table
        if sql_primary_key:
            pk_name = "_pk (SQL managed int identity column)"
        elif primary_key == "index":
            pk_name = str(primary_key_column) + " (dataframe index)"
        elif primary_key_column is not None:
            pk_name = primary_key_column + " (dataframe column)"
        else:
            pk_name = "None"
        msg = f"""
        Created table: {table_name}
        Primary key: {pk_name}
        Non-null columns: {not_nullable}
        Data types: {dtypes}
        """
        logger.warning(msg)

        # set primary key column as dataframe index
        if primary_key_column is not None:
            dataframe = dataframe.set_index(keys=primary_key_column)

        # insert dataframe
        if insert_dataframe:
            cursor = self._connection.cursor()
            cursor.fast_executemany = True
            dataframe = conversion.insert_values(
                table_name, dataframe, self.include_metadata_timestamps, schema, cursor
            )

        return dataframe
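Putting it together, an end-to-end sketch. The doctests above call the method through a create object; how that object is obtained here, via a hypothetical top-level SQLServer entry point with default connection settings, is an assumption for illustration.

import pandas as pd
from mssql_dataframe import SQLServer  # assumed top-level entry point

sql = SQLServer()  # assumed: connects with default/local settings
df = pd.DataFrame({"ColumnA": [1, 2], "ColumnB": ["a", "b"]})
# create the table, infer the best primary key column, then insert the rows
df = sql.create.table_from_dataframe(
    "##ExampleEndToEnd", df, primary_key="infer", insert_dataframe=True
)
print(df.index.name)  # the inferred primary key becomes the dataframe's index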