def rd_sql_geo(server, database, table, col_stmt, where_lst=None): """ Function to extract the geometry and coordinate system from an SQL geometry field. Returns a shapely geometry object and a proj4 str. Parameters ---------- server : str The server name. e.g.: 'SQL2012PROD03' database : str The specific database within the server. e.g.: 'LowFlows' table : str The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily' where_lst : list A list of where statements to be passed and added to the final SQL statement. Returns ------- list of shapely geometry objects The main output is a list of shapely geometry objects for all queried rows of the SQL table. str The second output is a proj4 str of the projection system. """ ## Create connection to database engine = create_engine('mssql', server, database) geo_col_stmt = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME=" + "\'" + table + "\'" + " AND DATA_TYPE='geometry'" geo_col = str(pd.read_sql(geo_col_stmt, engine).iloc[0, 0]) geo_srid_stmt = "select distinct " + geo_col + ".STSrid from " + table geo_srid = int(pd.read_sql(geo_srid_stmt, engine).iloc[0, 0]) if where_lst is not None: if len(where_lst) > 0: stmt2 = "SELECT " + col_stmt + ", " + geo_col + ".STAsBinary() as geometry" + " FROM " + table + " where " + " and ".join( where_lst) else: stmt2 = "SELECT " + col_stmt + ", " + geo_col + ".STAsBinary() as geometry" + " FROM " + table else: stmt2 = "SELECT " + col_stmt + ", " + geo_col + ".STAsBinary() as geometry" + " FROM " + table df2 = pd.read_sql(stmt2, engine) df2['geometry'] = df2.geometry.apply(lambda x: loads(x)) # proj4 = from_epsg_code(geo_srid).to_proj4() # crs = {'init' :'epsg:' + str(geo_srid)} crs = parse.from_epsg_code(geo_srid).to_proj4() geo_df = GeoDataFrame(df2, geometry='geometry', crs=crs) return geo_df
def to_mssql(df, server, database, table, index=False, dtype=None, schema=None): """ Function to append a DataFrame onto an existing mssql table. Parameters ---------- df : DataFrame DataFrame to be saved. The DataFrame column/index names must match those on the mssql table exactly. server : str The server name. e.g.: 'SQL2012PROD03' database : str The specific database within the server. e.g.: 'LowFlows' table : str The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily' index : bool Should the index be added as a column? dtype : dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should be an SQLAlchemy type. Returns ------- None """ ### Prepare the engine engine = create_engine('mssql', server, database) ### Save to mssql table df.to_sql(name=table, con=engine, if_exists='append', chunksize=1000, index=index, dtype=dtype, schema=schema)
def update_table_rows(df, server, database, table, on=None, index=False, append=True): """ Function to update rows from an mssql table. SQL table must have a primary key and the primary key must be in the input DataFrame. Parameters ---------- df : DataFrame DataFrame with data to be overwritten in SQL table. server : str The server name. e.g.: 'SQL2012PROD03' database : str The specific database within the server. e.g.: 'LowFlows' table : str The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily' on : None, str, or list of str The index by which the update should be applied on. If None, then it uses the existing primary key(s). index : bool Does the df have an index that corresponds to the SQL table primary keys? append : bool Should new sites be appended to the table? Returns ------- None """ ### Check the primary keys if on is None: pk_stmt = get_pk_stmt.format(db=database, table=table) pk = rd_sql(server, database, stmt=pk_stmt).name.tolist() if not pk: raise ValueError( 'SQL table has no primary key. Please set one up or assign "on" explicitly.' ) on = pk elif isinstance(on, str): on = [on] ## Check that "on" are in the tables df_bool = ~np.isin(on, df.columns).all() if df_bool: raise ValueError('"on" contains column names that are not in the df') ### Make the update statement temp_tab = '#temp_up1' on_tab = [table + '.' + i for i in on] on_temp = [temp_tab + '.' + i for i in on] cols = df.columns.tolist() val_cols = [i for i in cols if not i in on] tab_list = [table + '.' + i for i in val_cols] temp_list = [temp_tab + '.' + i for i in val_cols] temp_list2 = [temp_tab + '.' + i for i in cols] up_list = [ tab_list[i] + ' = ' + temp_list[i] for i in np.arange(len(temp_list)) ] on_list = [on_tab[i] + ' = ' + on_temp[i] for i in np.arange(len(on))] # up_stmt = "update " + table + " set " + ", ".join(up_list) + " from " + table + " inner join " + temp_tab + " on " + " and ".join(on_list) if append: up_stmt = "merge " + table + " using " + temp_tab + " on (" + " and ".join( on_list) + ") when matched then update set " + ", ".join( up_list ) + " WHEN NOT MATCHED BY TARGET THEN INSERT (" + ", ".join( cols) + ") values (" + ", ".join(temp_list2) + ");" else: up_stmt = "merge " + table + " using " + temp_tab + " on (" + " and ".join( on_list) + ") when matched then update set " + ", ".join( up_list) + ";" ### Run SQL code to update rows engine = create_engine('mssql', server, database) with engine.begin() as conn: print('Saving data to temp table...') df.to_sql(temp_tab, con=conn, if_exists='replace', index=index, chunksize=1000) print('Updating primary table...') conn.execute(up_stmt)
def del_table_rows(server, database, table=None, pk_df=None, stmt=None): """ Function to selectively delete rows from an mssql table. Parameters ---------- server : str The server name. e.g.: 'SQL2012PROD03' database : str The specific database within the server. e.g.: 'LowFlows' table : str or None if stmt is a str The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily' pk_df : DataFrame A DataFrame of the primary keys of the table for the rows that should be removed. stmt : str SQL delete statement. Will override everything except server and database. Returns ------- None Notes ----- Using the pk_df is the only way to ensure that specific rows will be deleted from composite keys. The column data types and names of pk_df must match the equivelant columns in the SQL table. The procedure creates a temporary table from the pk_df then deletes the rows in the target table based on the temp table. Then finally deletes the temp table. """ ### Make the delete statement # del_where_list = sql_where_stmts(**kwargs) if isinstance(stmt, str): del_rows_stmt = stmt elif isinstance(pk_df, pd.DataFrame): temp_tab = '#temp_del_tab1' ### Check the primary keys pk_stmt = get_pk_stmt.format(db=database, table=table) pk = rd_sql(server, database, stmt=pk_stmt).name if pk.empty: raise ValueError( 'SQL table has no primary key. Please set one up.') if not np.isin(pk, pk_df.columns.tolist()).all(): raise ValueError( 'The primary keys in the SQL table does not match up with the pk_df' ) sel_t1 = "select * from " + temp_tab cols = pk_df.columns.tolist() tab_where = [table + '.' + i for i in cols] t1_where = [temp_tab + '.' + i for i in cols] where_list = [ t1_where[i] + ' = ' + tab_where[i] for i in np.arange(len(cols)) ] where_stmt = " where " + " and ".join(where_list) exists_stmt = "(" + sel_t1 + where_stmt + ")" del_rows_stmt = "DELETE FROM " + table + " where exists " + exists_stmt # elif isinstance(del_where_list, list): # del_rows_stmt = "DELETE FROM " + table + " WHERE " + " AND ".join(del_where_list) else: raise ValueError('Please specify pk_df or stmt') ### Delete rows engine = create_engine('mssql', server, database) with engine.begin() as conn: if isinstance(pk_df, pd.DataFrame): pk_df.to_sql(name=temp_tab, con=conn, if_exists='replace', chunksize=1000) conn.execute(del_rows_stmt)
def create_table(server, database, table, dtype_dict, primary_keys=None, foreign_keys=None, foreign_table=None, drop_table=False, con=None): """ Function to create a table in an mssql database. Parameters ---------- server : str The server name. e.g.: 'SQL2012PROD03' database : str The specific database within the server. e.g.: 'LowFlows' table : str The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily' dtype_dict : dict of str Dictionary of df columns to the associated sql data type. Examples below. primary_keys : str or list of str Index columns to define uniqueness in the data structure. foreign_keys : str or list of str Columns to link to another table in the same database. foreign_table: str The table in the same database with the identical foreign key(s). drop_table : bool If the table already exists, should it be dropped? Returns ------- None """ ### Make connection if con is None: engine = create_engine('mssql', server, database) con = engine.connect() ### Primary keys if isinstance(primary_keys, str): primary_keys = [primary_keys] if isinstance(primary_keys, list): pkey_stmt = ", Primary key (" + ", ".join(primary_keys) + ")" else: pkey_stmt = "" ### Foreign keys if isinstance(foreign_keys, str): foreign_keys = [foreign_keys] if isinstance(foreign_keys, list): fkey_stmt = ", Foreign key (" + ", ".join( foreign_keys ) + ") " + "References " + foreign_table + "(" + ", ".join( foreign_keys) + ")" else: fkey_stmt = "" ### Initial create table statement d1 = [str(i) + ' ' + dtype_dict[i] for i in dtype_dict] d2 = ', '.join(d1) tab_create_stmt = "IF OBJECT_ID(" + str( [str(table)] )[1: -1] + ", 'U') IS NULL create table " + table + " (" + d2 + pkey_stmt + fkey_stmt + ")" trans = con.begin() try: ### Drop table option or check if drop_table: drop_stmt = "IF OBJECT_ID(" + str( [str(table)])[1:-1] + ", 'U') IS NOT NULL DROP TABLE " + table con.execute(drop_stmt) else: check_tab_stmt = "IF OBJECT_ID(" + str( [str(table)] )[1: -1] + ", 'U') IS NOT NULL SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME=" + str( [str(table)])[1:-1] # tab1 = read_sql(check_tab_stmt, conn) con.execute(check_tab_stmt) # list1 = [i for i in conn] # if list1: # print('Table already exists. Returning the table info.') # df = pd.DataFrame(list1, columns=['columns', 'dtype']) # conn.close() # return df ### Create table in database con.execute(tab_create_stmt) trans.commit() con.close() except Exception as err: trans.rollback() con.close() raise err
def rd_sql(server, database, table=None, col_names=None, where_in=None, where_op='AND', geo_col=False, from_date=None, to_date=None, date_col=None, rename_cols=None, stmt=None, con=None): """ Function to import data from an MSSQL database. Parameters ---------- server : str The server name. e.g.: 'SQL2012PROD03' database : str The specific database within the server. e.g.: 'LowFlows' table : str The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily' col_names : list of str The column names that should be retrieved. e.g.: ['SiteID', 'BandNo', 'RecordNo'] where_in : dict A dictionary of strings to lists of strings.'. e.g.: {'SnapshotType': ['value1', 'value2']} where_op : str If where_in is a dictionary and there are more than one key, then the operator that connects the where statements must be either 'AND' or 'OR'. geo_col : bool Is there a geometry column in the table?. from_date : str The start date in the form '2010-01-01'. to_date : str The end date in the form '2010-01-01'. date_col : str The SQL table column that contains the dates. rename_cols: list of str List of strings to rename the resulting DataFrame column names. stmt : str Custom SQL statement to be directly passed to the database. This will ignore all prior arguments except server and database. con : SQLAlchemy connectable (engine/connection) or database string URI The sqlalchemy connection to be passed to pandas.read_sql Returns ------- DataFrame """ ## Create where statements if stmt is None: if table is None: raise ValueError( 'Must at least provide input for server, database, and table.') if col_names is not None: if isinstance(col_names, str): col_names = [col_names] col_names1 = [ '[' + i.encode('ascii', 'ignore').decode() + ']' for i in col_names ] col_stmt = ', '.join(col_names1) else: col_stmt = '*' where_lst, where_temp = sql_where_stmts(where_in=where_in, where_op=where_op, from_date=from_date, to_date=to_date, date_col=date_col) if isinstance(where_lst, list): stmt1 = "SELECT " + col_stmt + " FROM " + table + " where " + " and ".join( where_lst) else: stmt1 = "SELECT " + col_stmt + " FROM " + table elif isinstance(stmt, str): where_temp = {} stmt1 = stmt else: raise ValueError('stmt must either be an SQL string or None.') ## Create connection to database and execute sql statement if geo_col & (stmt is None): df = rd_sql_geo(server=server, database=database, table=table, col_stmt=col_stmt, where_lst=where_lst) if rename_cols is not None: rename_cols1 = rename_cols.copy() rename_cols1.extend(['geometry']) df.columns = rename_cols1 else: if con is None: engine = create_engine('mssql', server, database) with engine.begin() as conn: if where_temp: for key, value in where_temp.items(): df = pd.DataFrame(data=value, columns=[key.lower()]) temp_tab = '#temp_' + key.lower() df.to_sql(temp_tab, con=conn, if_exists='replace', index=False, chunksize=1000) df = pd.read_sql(stmt1, con=conn) else: if where_temp: for key, value in where_temp.items(): df = pd.DataFrame(data=value, columns=[key]) temp_tab = '#temp_' + key df.to_sql(temp_tab, con=con, if_exists='replace', index=False, chunksize=1000) df = pd.read_sql(stmt1, con=con) if rename_cols is not None: df.columns = rename_cols return df
def rd_sql_ts(server, database, table, groupby_cols, date_col, values_cols, resample_code=None, period=1, fun='mean', val_round=3, where_in=None, where_op='AND', from_date=None, to_date=None, min_count=None, con=None): """ Function to specifically read and possibly aggregate time series data stored in MSSQL tables. Parameters ---------- server : str The server name. e.g.: 'SQL2012PROD03' database : str The specific database within the server. e.g.: 'LowFlows' table : str The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily' groupby_cols : str or list of str The columns in the SQL table to grouped and returned with the time series data. date_col : str The date column in the table. values_cols : str or list of str The column(s) of the value(s) that should be resampled. resample_code : str or None The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc. period : int The number of resampling periods. e.g. period = 2 and resample = 'D' would be to resample the values over a 2 day period. fun : str The resampling function. i.e. mean, sum, count, min, or max. No median yet... val_round : int The number of decimals to round the values. where_in : dict A dictionary of strings to lists of strings.'. e.g.: {'SnapshotType': ['value1', 'value2']} where_op : str If where_in is a dictionary and there are more than one key, then the operator that connects the where statements must be either 'AND' or 'OR'. from_date : str The start date in the form '2010-01-01'. to_date : str The end date in the form '2010-01-01'. min_count : int The minimum number of values required to return groupby_cols. Only works when groupby_cols and vlue_cols are str. con : SQLAlchemy connectable (engine/connection) or database string URI The sqlalchemy connection to be passed to pandas.read_sql Returns ------- DataFrame Pandas DataFrame with MultiIndex of groupby_cols and date_col """ ## Create where statement where_lst, where_temp = sql_where_stmts(where_in=where_in, where_op=where_op, from_date=from_date, to_date=to_date, date_col=date_col) ## Create ts statement and append earlier where statement if isinstance(groupby_cols, str): groupby_cols = [groupby_cols] col_names1 = [ '[' + i.encode('ascii', 'ignore').decode() + ']' for i in groupby_cols ] col_stmt = ', '.join(col_names1) ## Create sql stmt sql_stmt1 = sql_ts_agg_stmt(table, groupby_cols=groupby_cols, date_col=date_col, values_cols=values_cols, resample_code=resample_code, period=period, fun=fun, val_round=val_round, where_lst=where_lst) ## Create connection to database if con is None: con = create_engine('mssql', server, database) ## Make minimum count selection if (min_count is not None) & isinstance(min_count, int) & (len(groupby_cols) == 1): cols_count_str = ', '.join( [groupby_cols[0], 'count(' + values_cols + ') as count']) if isinstance(where_lst, list): stmt1 = "SELECT " + cols_count_str + " FROM " + "(" + sql_stmt1 + ") as agg" + " GROUP BY " + col_stmt + " HAVING count(" + values_cols + ") >= " + str( min_count) else: stmt1 = "SELECT " + cols_count_str + " FROM " + table + " GROUP BY " + col_stmt + " HAVING count(" + values_cols + ") >= " + str( min_count) up_sites = pd.read_sql(stmt1, con)[groupby_cols[0]].tolist() up_sites = [str(i) for i in up_sites] if not up_sites: raise ValueError('min_count filtered out all sites.') where_in.update({groupby_cols[0]: up_sites}) where_lst, where_temp = sql_where_stmts(where_in, where_op=where_op, from_date=from_date, to_date=to_date, date_col=date_col) ## Create sql stmt sql_stmt1 = sql_ts_agg_stmt(table, groupby_cols=groupby_cols, date_col=date_col, values_cols=values_cols, resample_code=resample_code, period=period, fun=fun, val_round=val_round, where_lst=where_lst) ## Create connection to database and execute sql statement df = pd.read_sql(sql_stmt1, con) ## Check to see if any data was found if df.empty: raise ValueError( 'No data was found in the database for the parameters given.') ## set the index df[date_col] = pd.to_datetime(df[date_col]) groupby_cols.append(date_col) df1 = df.set_index(groupby_cols).sort_index() return df1