def add_columns(
    table_name: str,
    dataframe: pd.DataFrame,
    columns: List[str],
    modifier: modify,
) -> pd.DataFrame:
    """Add columns if they do not exist.

    Parameters
    ----------
    table_name (str) : name of the table for which the failed write attempt occurred
    dataframe (pandas.DataFrame) : data to insert
    columns (list) : columns to add
    modifier (mssql_dataframe.core.modify) : class to modify SQL columns

    Returns
    -------
    dataframe (pandas.DataFrame) : data to insert that may have been adjusted to conform to SQL data types
    """
    # infer the data types for new columns
    new, schema, _, _ = infer.sql(dataframe.loc[:, columns])

    # determine the SQL data type for each column
    _, dtypes = conversion.sql_spec(schema, new)

    # add each column
    for col, spec in dtypes.items():
        msg = f"Creating column '{col}' in table '{table_name}' with data type '{spec}'."
        logger.warning(msg)
        modifier.column(
            table_name, modify="add", column_name=col, data_type=spec, is_nullable=True
        )

    # add potentially adjusted columns back into dataframe
    dataframe[new.columns] = new

    return dataframe
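
# A minimal usage sketch for add_columns, assuming the caller already knows which
# columns the failed write reported as missing. The connection argument, the
# construction `modify(connection)`, and the table name '##ExampleTable' are
# hypothetical stand-ins, not part of this module; in practice the missing column
# names come from the raised write exception.
def _example_add_columns(connection):
    modifier = modify(connection)  # assumed construction from an open connection
    df = pd.DataFrame({"ColumnA": [1], "ColumnB": ["x"]})
    # '##ExampleTable' is assumed to exist but to lack 'ColumnB'
    df = add_columns("##ExampleTable", df, ["ColumnB"], modifier)
    return df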
def test_default(data):
    # setup test data
    dataframe = data.copy()
    dataframe["_nvarchar_default1"] = None
    dataframe["_nvarchar_default2"] = np.nan

    # infer SQL properties
    dataframe, schema, not_nullable, pk = infer.sql(dataframe)
    _check_schema(schema)
    _check_dataframe(dataframe, schema)
    assert len(not_nullable) == 0
    assert pk is None
def test_pk(data):
    # setup test data
    dataframe = data[data.notna().all(axis="columns")].copy()
    dataframe["_tinyint_smaller"] = pd.Series(range(0, len(dataframe)), dtype="UInt8")
    dataframe["_varchar_smaller"] = dataframe["_varchar"].str.slice(0, 1)

    # infer SQL properties
    df = dataframe
    df, schema, not_nullable, pk = infer.sql(df)
    _check_schema(schema)
    _check_dataframe(df, schema)
    assert df.columns.isin(not_nullable).all()
    assert pk == "_tinyint_smaller"

    # infer SQL properties without numeric
    df = dataframe.select_dtypes(["datetime", "string"])
    df, schema, not_nullable, pk = infer.sql(df)
    _check_schema(schema)
    _check_dataframe(df, schema)
    assert df.columns.isin(not_nullable).all()
    assert pk == "_varchar_smaller"
def test_dtypes(data):
    # setup test data
    dataframe = data.copy()
    na = dataframe.isna()
    dataframe = dataframe.astype("str")
    dataframe[na] = None
    dataframe["_time"] = dataframe["_time"].str.replace("0 days ", "")

    # infer SQL properties
    dataframe, schema, not_nullable, pk = infer.sql(dataframe)

    # assert inferred results
    _check_schema(schema)
    _check_dataframe(dataframe, schema)
    assert len(not_nullable) == 0
    assert pk is None
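
# A hedged illustration of the infer.sql contract these tests exercise: given a
# dataframe (even one holding only strings, as in test_dtypes), it returns the
# dataframe converted to best-fit pandas types, a schema of SQL specifications,
# the columns containing no missing values, and an inferred primary key column
# (or None). The column names and values below are invented for illustration.
def _example_infer_sql():
    df = pd.DataFrame({"_bit": ["True", "False"], "_varchar": ["a", None]})
    converted, schema, not_nullable, pk = infer.sql(df)
    # `converted` holds the best-fit pandas dtypes, `schema` the per-column SQL
    # specification, `not_nullable` the columns with no nulls, and `pk` the
    # inferred primary key column name (None when no column qualifies)
    return converted, schema, not_nullable, pk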
def alter_columns(
    table_name: str,
    dataframe: pd.DataFrame,
    columns: List[str],
    modifier: modify,
) -> pd.DataFrame:
    """Alter columns if their size needs to be increased.

    Parameters
    ----------
    table_name (str) : name of the table for which the failed write attempt occurred
    dataframe (pandas.DataFrame) : data to insert
    columns (list) : columns to alter
    modifier (mssql_dataframe.core.modify) : class to modify SQL columns

    Returns
    -------
    dataframe (pandas.DataFrame) : data to insert that may have been adjusted to conform to SQL data types
    """
    # temporarily set named index (primary key) as columns
    index = dataframe.index.names
    if any(index):
        dataframe = dataframe.reset_index()

    # infer the data types for insufficient size columns
    new, schema, _, _ = infer.sql(dataframe.loc[:, columns])
    schema, dtypes = conversion.sql_spec(schema, new)

    # get current table schema
    previous, _ = conversion.get_schema(modifier._connection, table_name)
    strings = previous["sql_type"].isin(["varchar", "nvarchar"])
    previous.loc[strings, "odbc_size"] = previous.loc[strings, "column_size"]

    # ensure a type or size change actually occurred after inferring dtypes
    unchanged = (
        previous.loc[schema.index, ["sql_type", "odbc_size"]]
        == schema[["sql_type", "odbc_size"]]
    )
    unchanged = unchanged.all(axis="columns")
    if any(unchanged):
        unchanged = list(unchanged[unchanged].index)
        raise custom_errors.SQLRecastColumnUnchanged(
            f"Handling SQLInsufficientColumnSize did not result in type or size change for columns: {unchanged}"
        )

    # ensure the change doesn't result in a different SQL data category
    changed = previous.loc[schema.index, ["sql_category"]] != schema[["sql_category"]]
    if any(changed["sql_category"]):
        changed = list(changed[changed["sql_category"]].index)
        raise custom_errors.DataframeColumnInvalidValue(
            "Dataframe columns cannot be converted based on their SQL data type",
            changed,
        )

    # drop primary key constraint prior to altering columns, if needed
    primary_key_columns = (
        previous.loc[previous["pk_seq"].notna(), "pk_seq"]
        .sort_values(ascending=True)
        .index
    )
    if len(primary_key_columns) == 0:
        primary_key_name = None
    else:
        primary_key_name = previous.loc[primary_key_columns[0], "pk_name"]
        modifier.primary_key(
            table_name,
            modify="drop",
            columns=primary_key_columns,
            primary_key_name=primary_key_name,
        )

    # alter each column
    for col, spec in dtypes.items():
        is_nullable = previous.at[col, "is_nullable"]
        msg = f"Altering column '{col}' in table '{table_name}' to data type '{spec}' with 'is_nullable={is_nullable}'."
        logger.warning(msg)
        modifier.column(
            table_name,
            modify="alter",
            column_name=col,
            data_type=spec,
            is_nullable=is_nullable,
        )

    # re-add primary key if needed
    if primary_key_name:
        modifier.primary_key(
            table_name,
            modify="add",
            columns=list(primary_key_columns),
            primary_key_name=primary_key_name,
        )

    # reset primary key columns as dataframe's index
    if any(index):
        dataframe = dataframe.set_index(keys=index)

    return dataframe
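
# A hedged sketch of the retry flow this handler exists for, per the error
# message above referencing SQLInsufficientColumnSize. The `insert` callable is
# a hypothetical stand-in for the caller's write path (passed in so the sketch
# stays self-contained), `modify(connection)` is the assumed construction, and
# extracting the undersized column names from `err.args[1]` is an assumption
# about where the exception carries them.
def _example_alter_columns(connection, insert, table_name, dataframe):
    modifier = modify(connection)  # assumed construction from an open connection
    try:
        insert(table_name, dataframe)  # caller's write path; may fail
    except custom_errors.SQLInsufficientColumnSize as err:
        undersized = err.args[1]  # assumed location of the offending columns
        dataframe = alter_columns(table_name, dataframe, undersized, modifier)
        insert(table_name, dataframe)  # retry once the columns are enlarged
    return dataframe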
def table_from_dataframe(
    self,
    table_name: str,
    dataframe: pd.DataFrame,
    primary_key: Literal[None, "sql", "index", "infer"] = None,
    insert_dataframe: bool = True,
) -> pd.DataFrame:
    """Create SQL table by inferring SQL create table parameters from the contents of a dataframe.

    The contents may be composed entirely of strings/objects; they are converted
    to better data types if conversion is possible within pandas.

    Parameters
    ----------
    table_name (str) : name of table to create, may also contain schema name in the form schema_name.table_name
    dataframe (pandas.DataFrame) : data used to create table
    primary_key (str, default = None) : method of setting the table's primary key, see below for description of options
    insert_dataframe (bool, default=True) : insert the dataframe after creating the table

    primary_key = None : do not set a primary key
    primary_key = 'sql' : create an SQL managed auto-incrementing INT identity primary key column named '_pk'
    primary_key = 'index' : use the index of the dataframe and its name, or '_index' if the index is not named
    primary_key = 'infer' : determine the column in the dataframe that best serves as a primary key and use its name

    Returns
    -------
    dataframe (pandas.DataFrame) : data potentially converted from objects/strings to better pandas types

    Examples
    --------
    Table without a primary key.

    >>> df = create.table_from_dataframe('##ExampleCreateDFNoPKTable', pd.DataFrame({"ColumnA": [1]}))

    Table with the dataframe's index as the primary key.

    >>> df = create.table_from_dataframe('##ExampleCreateDFIndexPKTable', pd.DataFrame({"ColumnA": [1,2]}, index=['a','z']), primary_key='index')

    Table with SQL identity primary key.

    >>> df = create.table_from_dataframe('##ExampleCreateDFIdentityPKTable', pd.DataFrame({"ColumnA": [1,2]}), primary_key='sql')

    Table using ColumnA as the primary key, after it was inferred to be the primary key.

    >>> df = create.table_from_dataframe('##ExampleCreateDFInferPKTable', pd.DataFrame({"ColumnA": [1,2], "ColumnB": ["a","b"]}), primary_key='infer')
    """
    # determine primary key
    if primary_key is None:
        sql_primary_key = False
        primary_key_column = None
    elif primary_key == "sql":
        sql_primary_key = True
        primary_key_column = None
    elif primary_key == "index":
        sql_primary_key = False
        if not any(dataframe.index.names):
            dataframe.index.name = "_index"
        primary_key_column = list(dataframe.index.names)
        dataframe = dataframe.reset_index()
    elif primary_key == "infer":
        sql_primary_key = False
        primary_key_column = None
    else:
        options = [None, "sql", "index", "infer"]
        raise ValueError("primary_key must be one of: " + str(options))

    # infer SQL specifications from contents of dataframe
    dataframe, schema, not_nullable, pk = infer.sql(dataframe)
    _, dtypes = conversion.sql_spec(schema, dataframe)

    # infer primary key column after best fit data types have been determined
    if primary_key == "infer":
        primary_key_column = pk

    # add _time_insert column
    if self.include_metadata_timestamps:
        dtypes["_time_insert"] = "DATETIME2"

    # create final SQL table
    self.table(
        table_name,
        dtypes,
        not_nullable=not_nullable,
        primary_key_column=primary_key_column,
        sql_primary_key=sql_primary_key,
    )

    # issue message for derived table
    if sql_primary_key:
        pk_name = "_pk (SQL managed int identity column)"
    elif primary_key == "index":
        pk_name = str(primary_key_column) + " (dataframe index)"
    elif primary_key_column is not None:
        pk_name = primary_key_column + " (dataframe column)"
    else:
        pk_name = "None"
    msg = f"""
    Created table: {table_name}
    Primary key: {pk_name}
    Non-null columns: {not_nullable}
    Data types: {dtypes}
    """
    logger.warning(msg)

    # set primary key column as dataframe index
    if primary_key_column is not None:
        dataframe = dataframe.set_index(keys=primary_key_column)

    # insert dataframe
    if insert_dataframe:
        cursor = self._connection.cursor()
        cursor.fast_executemany = True
        dataframe = conversion.insert_values(
            table_name, dataframe, self.include_metadata_timestamps, schema, cursor
        )

    return dataframe
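
# A hedged end-to-end sketch complementing the doctest examples above: string
# contents are converted to best-fit types, the inferred primary key becomes
# the index of the returned dataframe, and the data is inserted unless
# insert_dataframe=False. The `create` object is passed in rather than
# constructed, since connection setup is outside this excerpt; the table name
# is invented for illustration.
def _example_table_from_dataframe(create):
    df = pd.DataFrame({"ColumnA": ["1", "2"], "ColumnB": ["a", "b"]})
    df = create.table_from_dataframe(
        "##ExampleSketchTable", df, primary_key="infer", insert_dataframe=True
    )
    # ColumnA is expected to convert to an integer type and, if inferred as the
    # primary key, to be returned as the dataframe's index
    return df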