Example #1
# Module-level imports assumed by these examples: pandas, numpy, shapely,
# pycrs, and geopandas. create_engine, rd_sql, get_pk_stmt, sql_where_stmts,
# and sql_ts_agg_stmt are helpers defined elsewhere in the same package.
import numpy as np
import pandas as pd
from shapely.wkb import loads
from pycrs import parse
from geopandas import GeoDataFrame


def rd_sql_geo(server, database, table, col_stmt, where_lst=None):
    """
    Function to extract a geometry column and its coordinate system from an SQL table. Returns a GeoDataFrame with the geometry decoded into shapely objects and the CRS set from the table's SRID.

    Parameters
    ----------
    server : str
        The server name. e.g.: 'SQL2012PROD03'
    database : str
        The specific database within the server. e.g.: 'LowFlows'
    table : str
        The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily'
    col_stmt : str
        Comma-separated column names to select alongside the geometry column. e.g.: 'SiteID, BandNo'
    where_lst : list
        A list of where clauses to be joined with 'and' and added to the final SQL statement.

    Returns
    -------
    GeoDataFrame
        A GeoDataFrame of all queried rows, with a shapely geometry column and the CRS set to the proj4 string of the projection system.
    """

    ## Create connection to database
    engine = create_engine('mssql', server, database)

    geo_col_stmt = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='" + table + "' AND DATA_TYPE='geometry'"
    geo_col = str(pd.read_sql(geo_col_stmt, engine).iloc[0, 0])
    geo_srid_stmt = "select distinct " + geo_col + ".STSrid from " + table
    geo_srid = int(pd.read_sql(geo_srid_stmt, engine).iloc[0, 0])
    if where_lst:
        stmt2 = ("SELECT " + col_stmt + ", " + geo_col +
                 ".STAsBinary() as geometry FROM " + table + " where " +
                 " and ".join(where_lst))
    else:
        stmt2 = ("SELECT " + col_stmt + ", " + geo_col +
                 ".STAsBinary() as geometry FROM " + table)
    df2 = pd.read_sql(stmt2, engine)
    df2['geometry'] = df2['geometry'].apply(loads)
    crs = parse.from_epsg_code(geo_srid).to_proj4()
    geo_df = GeoDataFrame(df2, geometry='geometry', crs=crs)

    return geo_df
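
A minimal usage sketch (the column names and where clause below are illustrative placeholders, not values from the source):

# Hypothetical call: selected columns plus decoded geometry as a GeoDataFrame.
sites_gdf = rd_sql_geo(server='SQL2012PROD03',
                       database='LowFlows',
                       table='LowFlowSiteRestrictionDaily',
                       col_stmt='SiteID, BandNo',
                       where_lst=["BandNo = 1"])
print(sites_gdf.crs)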
Example #2
def to_mssql(df,
             server,
             database,
             table,
             index=False,
             dtype=None,
             schema=None):
    """
    Function to append a DataFrame onto an existing mssql table.

    Parameters
    ----------
    df : DataFrame
        DataFrame to be saved. The DataFrame column/index names must match those on the mssql table exactly.
    server : str
        The server name. e.g.: 'SQL2012PROD03'
    database : str
        The specific database within the server. e.g.: 'LowFlows'
    table : str
        The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily'
    index : bool
        Should the index be added as a column?
    dtype : dict of column name to SQL type, default None
        Optional dict specifying the datatype for columns. The SQL type should be an SQLAlchemy type.
    schema : str, default None
        The database schema of the table, passed through to DataFrame.to_sql.

    Returns
    -------
    None
    """
    ### Prepare the engine
    engine = create_engine('mssql', server, database)

    ### Save to mssql table
    df.to_sql(name=table,
              con=engine,
              if_exists='append',
              chunksize=1000,
              index=index,
              dtype=dtype,
              schema=schema)
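
A minimal usage sketch, assuming the target table already exists with matching column names; the DataFrame contents and SQLAlchemy types are illustrative only:

import pandas as pd
from sqlalchemy.types import VARCHAR, INTEGER

# Hypothetical rows to append; column names must match the mssql table exactly.
new_rows = pd.DataFrame({'SiteID': ['69302', '69505'], 'BandNo': [1, 2]})
to_mssql(new_rows, 'SQL2012PROD03', 'LowFlows', 'LowFlowSiteRestrictionDaily',
         dtype={'SiteID': VARCHAR(10), 'BandNo': INTEGER})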
Example #3
def update_table_rows(df,
                      server,
                      database,
                      table,
                      on=None,
                      index=False,
                      append=True):
    """
    Function to update rows in an mssql table. The SQL table must have a primary key and the primary key columns must be in the input DataFrame.

    Parameters
    ----------
    df : DataFrame
        DataFrame with data to be overwritten in SQL table.
    server : str
        The server name. e.g.: 'SQL2012PROD03'
    database : str
        The specific database within the server. e.g.: 'LowFlows'
    table : str
        The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily'
    on : None, str, or list of str
        The column(s) that the update should be applied on. If None, the existing primary key(s) of the SQL table are used.
    index : bool
        Does the df have an index that corresponds to the SQL table primary keys?
    append : bool
        Should new sites be appended to the table?

    Returns
    -------
    None
    """
    ### Check the primary keys
    if on is None:
        pk_stmt = get_pk_stmt.format(db=database, table=table)
        pk = rd_sql(server, database, stmt=pk_stmt).name.tolist()

        if not pk:
            raise ValueError(
                'SQL table has no primary key. Please set one up or assign "on" explicitly.'
            )
        on = pk
    elif isinstance(on, str):
        on = [on]

    ## Check that the "on" columns are in the df
    if not np.isin(on, df.columns).all():
        raise ValueError('"on" contains column names that are not in the df')

    ### Make the update statement
    temp_tab = '#temp_up1'
    on_tab = [table + '.' + i for i in on]
    on_temp = [temp_tab + '.' + i for i in on]
    cols = df.columns.tolist()
    val_cols = [i for i in cols if not i in on]
    tab_list = [table + '.' + i for i in val_cols]
    temp_list = [temp_tab + '.' + i for i in val_cols]
    temp_list2 = [temp_tab + '.' + i for i in cols]
    up_list = [t + ' = ' + tmp for t, tmp in zip(tab_list, temp_list)]
    on_list = [t + ' = ' + tmp for t, tmp in zip(on_tab, on_temp)]
    up_stmt = ("merge " + table + " using " + temp_tab + " on (" +
               " and ".join(on_list) + ") when matched then update set " +
               ", ".join(up_list))
    if append:
        up_stmt += (" when not matched by target then insert (" +
                    ", ".join(cols) + ") values (" + ", ".join(temp_list2) + ")")
    up_stmt += ";"

    ### Run SQL code to update rows
    engine = create_engine('mssql', server, database)
    with engine.begin() as conn:
        print('Saving data to temp table...')
        df.to_sql(temp_tab,
                  con=conn,
                  if_exists='replace',
                  index=index,
                  chunksize=1000)
        print('Updating primary table...')
        conn.execute(up_stmt)
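
A minimal usage sketch, assuming 'SiteID' is the table's primary key (an illustrative assumption, not from the source):

import pandas as pd

# Hypothetical update: overwrite BandNo for two sites, appending any new ones.
up_df = pd.DataFrame({'SiteID': ['69302', '69505'], 'BandNo': [2, 3]})
update_table_rows(up_df, 'SQL2012PROD03', 'LowFlows',
                  'LowFlowSiteRestrictionDaily', on='SiteID', append=True)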
Example #4
def del_table_rows(server, database, table=None, pk_df=None, stmt=None):
    """
    Function to selectively delete rows from an mssql table.

    Parameters
    ----------
    server : str
        The server name. e.g.: 'SQL2012PROD03'
    database : str
        The specific database within the server. e.g.: 'LowFlows'
    table : str or None if stmt is a str
        The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily'
    pk_df : DataFrame
        A DataFrame of the primary keys of the table for the rows that should be removed.
    stmt : str
        SQL delete statement. Will override everything except server and database.

    Returns
    -------
    None

    Notes
    -----
    Using the pk_df is the only way to ensure that specific rows will be deleted when the table has a composite key. The column data types and names of pk_df must match the equivalent columns in the SQL table. The procedure creates a temporary table from the pk_df, then deletes the rows in the target table based on the temp table; the temp table is discarded when the connection closes.
    """

    ### Make the delete statement
    if isinstance(stmt, str):
        del_rows_stmt = stmt
    elif isinstance(pk_df, pd.DataFrame):
        temp_tab = '#temp_del_tab1'

        ### Check the primary keys
        pk_stmt = get_pk_stmt.format(db=database, table=table)
        pk = rd_sql(server, database, stmt=pk_stmt).name

        if pk.empty:
            raise ValueError(
                'SQL table has no primary key. Please set one up.')
        if not np.isin(pk, pk_df.columns.tolist()).all():
            raise ValueError(
                'The primary keys in the SQL table do not match the columns in pk_df'
            )

        sel_t1 = "select * from " + temp_tab
        cols = pk_df.columns.tolist()
        tab_where = [table + '.' + i for i in cols]
        t1_where = [temp_tab + '.' + i for i in cols]
        where_list = [
            t1_where[i] + ' = ' + tab_where[i] for i in np.arange(len(cols))
        ]
        where_stmt = " where " + " and ".join(where_list)
        exists_stmt = "(" + sel_t1 + where_stmt + ")"
        del_rows_stmt = "DELETE FROM " + table + " where exists " + exists_stmt
    else:
        raise ValueError('Please specify pk_df or stmt')

    ### Delete rows
    engine = create_engine('mssql', server, database)
    with engine.begin() as conn:
        if isinstance(pk_df, pd.DataFrame):
            pk_df.to_sql(name=temp_tab,
                         con=conn,
                         if_exists='replace',
                         index=False,  # write only the key columns, not the df index
                         chunksize=1000)
        conn.execute(del_rows_stmt)
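
A minimal usage sketch, assuming a composite primary key of 'SiteID' and 'BandNo' (illustrative, not from the source); the pk_df column names and dtypes must match the SQL table:

import pandas as pd

# Hypothetical delete: remove only the rows matching these key combinations.
pk_df = pd.DataFrame({'SiteID': ['69302'], 'BandNo': [1]})
del_table_rows('SQL2012PROD03', 'LowFlows', 'LowFlowSiteRestrictionDaily',
               pk_df=pk_df)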
Example #5
def create_table(server,
                 database,
                 table,
                 dtype_dict,
                 primary_keys=None,
                 foreign_keys=None,
                 foreign_table=None,
                 drop_table=False,
                 con=None):
    """
    Function to create a table in an mssql database.

    Parameters
    ----------
    server : str
        The server name. e.g.: 'SQL2012PROD03'
    database : str
        The specific database within the server. e.g.: 'LowFlows'
    table : str
        The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily'
    dtype_dict : dict of str
        Dictionary mapping df column names to the associated SQL data type, as in the sketch below this function.
    primary_keys : str or list of str
        Index columns to define uniqueness in the data structure.
    foreign_keys : str or list of str
        Columns to link to another table in the same database.
    foreign_table: str
        The table in the same database with the identical foreign key(s).
    drop_table : bool
        If the table already exists, should it be dropped?
    con : SQLAlchemy connectable, optional
        An existing connection to use. If None, an engine and connection are created from server and database.

    Returns
    -------
    None
    """
    ### Make connection
    if con is None:
        engine = create_engine('mssql', server, database)
        con = engine.connect()

    ### Primary keys
    if isinstance(primary_keys, str):
        primary_keys = [primary_keys]
    if isinstance(primary_keys, list):
        pkey_stmt = ", Primary key (" + ", ".join(primary_keys) + ")"
    else:
        pkey_stmt = ""

    ### Foreign keys
    if isinstance(foreign_keys, str):
        foreign_keys = [foreign_keys]
    if isinstance(foreign_keys, list):
        fkey_stmt = ", Foreign key (" + ", ".join(
            foreign_keys
        ) + ") " + "References " + foreign_table + "(" + ", ".join(
            foreign_keys) + ")"
    else:
        fkey_stmt = ""

    ### Initial create table statement
    d1 = [str(i) + ' ' + dtype_dict[i] for i in dtype_dict]
    d2 = ', '.join(d1)
    tab_create_stmt = ("IF OBJECT_ID('" + table + "', 'U') IS NULL create table " +
                       table + " (" + d2 + pkey_stmt + fkey_stmt + ")")

    trans = con.begin()
    try:
        ### Drop table option or check
        if drop_table:
            drop_stmt = "IF OBJECT_ID('" + table + "', 'U') IS NOT NULL DROP TABLE " + table
            con.execute(drop_stmt)
        else:
            check_tab_stmt = ("IF OBJECT_ID('" + table + "', 'U') IS NOT NULL "
                              "SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS "
                              "WHERE TABLE_NAME='" + table + "'")
            con.execute(check_tab_stmt)

        ### Create table in database
        con.execute(tab_create_stmt)
        trans.commit()
        con.close()

    except Exception as err:
        trans.rollback()
        con.close()
        raise err
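
A minimal sketch of the dtype_dict referenced in the docstring; the column names and key choice are illustrative assumptions, not from the source:

# Hypothetical schema: types are plain SQL strings, not SQLAlchemy objects,
# because they are concatenated directly into the CREATE TABLE statement.
dtype_dict = {'SiteID': 'VARCHAR(10)',
              'RestrDate': 'DATE',
              'BandNo': 'INT'}
create_table('SQL2012PROD03', 'LowFlows', 'LowFlowSiteRestrictionDaily',
             dtype_dict, primary_keys=['SiteID', 'RestrDate'])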
Example #6
def rd_sql(server,
           database,
           table=None,
           col_names=None,
           where_in=None,
           where_op='AND',
           geo_col=False,
           from_date=None,
           to_date=None,
           date_col=None,
           rename_cols=None,
           stmt=None,
           con=None):
    """
    Function to import data from an MSSQL database.

    Parameters
    ----------
    server : str
        The server name. e.g.: 'SQL2012PROD03'
    database : str
        The specific database within the server. e.g.: 'LowFlows'
    table : str
        The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily'
    col_names : list of str
        The column names that should be retrieved. e.g.: ['SiteID', 'BandNo', 'RecordNo']
    where_in : dict
        A dictionary mapping column names to lists of values to filter on. e.g.: {'SnapshotType': ['value1', 'value2']}
    where_op : str
        If where_in has more than one key, the operator that connects the where clauses; must be either 'AND' or 'OR'.
    geo_col : bool
        Is there a geometry column in the table?
    from_date : str
        The start date in the form '2010-01-01'.
    to_date : str
        The end date in the form '2010-01-01'.
    date_col : str
        The SQL table column that contains the dates.
    rename_cols : list of str
        List of strings to rename the resulting DataFrame column names.
    stmt : str
        Custom SQL statement to be directly passed to the database. This will ignore all prior arguments except server and database.
    con : SQLAlchemy connectable (engine/connection) or database string URI
        The sqlalchemy connection to be passed to pandas.read_sql

    Returns
    -------
    DataFrame
    """

    ## Create where statements
    if stmt is None:

        if table is None:
            raise ValueError(
                'Must at least provide input for server, database, and table.')

        if col_names is not None:
            if isinstance(col_names, str):
                col_names = [col_names]
            col_names1 = [
                '[' + i.encode('ascii', 'ignore').decode() + ']'
                for i in col_names
            ]
            col_stmt = ', '.join(col_names1)
        else:
            col_stmt = '*'

        where_lst, where_temp = sql_where_stmts(where_in=where_in,
                                                where_op=where_op,
                                                from_date=from_date,
                                                to_date=to_date,
                                                date_col=date_col)

        if isinstance(where_lst, list):
            stmt1 = "SELECT " + col_stmt + " FROM " + table + " where " + " and ".join(
                where_lst)
        else:
            stmt1 = "SELECT " + col_stmt + " FROM " + table

    elif isinstance(stmt, str):
        where_temp = {}
        stmt1 = stmt

    else:
        raise ValueError('stmt must either be an SQL string or None.')

    ## Create connection to database and execute sql statement
    if geo_col and (stmt is None):
        df = rd_sql_geo(server=server,
                        database=database,
                        table=table,
                        col_stmt=col_stmt,
                        where_lst=where_lst)
        if rename_cols is not None:
            rename_cols1 = rename_cols.copy()
            rename_cols1.extend(['geometry'])
            df.columns = rename_cols1
    else:
        if con is None:
            engine = create_engine('mssql', server, database)
            with engine.begin() as conn:
                if where_temp:
                    for key, value in where_temp.items():
                        df = pd.DataFrame(data=value, columns=[key.lower()])
                        temp_tab = '#temp_' + key.lower()
                        df.to_sql(temp_tab,
                                  con=conn,
                                  if_exists='replace',
                                  index=False,
                                  chunksize=1000)
                df = pd.read_sql(stmt1, con=conn)
        else:
            if where_temp:
                for key, value in where_temp.items():
                    df = pd.DataFrame(data=value, columns=[key])
                    temp_tab = '#temp_' + key
                    df.to_sql(temp_tab,
                              con=con,
                              if_exists='replace',
                              index=False,
                              chunksize=1000)
            df = pd.read_sql(stmt1, con=con)
        if rename_cols is not None:
            df.columns = rename_cols

    return df
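
A minimal usage sketch; the where_in values and date column below are illustrative placeholders:

# Hypothetical query: selected columns, filtered by a value list and a date range.
df = rd_sql('SQL2012PROD03', 'LowFlows', 'LowFlowSiteRestrictionDaily',
            col_names=['SiteID', 'BandNo', 'RecordNo'],
            where_in={'SnapshotType': ['value1']},
            from_date='2010-01-01', to_date='2010-12-31',
            date_col='RestrDate')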
Example #7
def rd_sql_ts(server,
              database,
              table,
              groupby_cols,
              date_col,
              values_cols,
              resample_code=None,
              period=1,
              fun='mean',
              val_round=3,
              where_in=None,
              where_op='AND',
              from_date=None,
              to_date=None,
              min_count=None,
              con=None):
    """
    Function to specifically read and possibly aggregate time series data stored in MSSQL tables.

    Parameters
    ----------
    server : str
        The server name. e.g.: 'SQL2012PROD03'
    database : str
        The specific database within the server. e.g.: 'LowFlows'
    table : str
        The specific table within the database. e.g.: 'LowFlowSiteRestrictionDaily'
    groupby_cols : str or list of str
        The columns in the SQL table to group by and return with the time series data.
    date_col : str
        The date column in the table.
    values_cols : str or list of str
        The column(s) of the value(s) that should be resampled.
    resample_code : str or None
        The Pandas time series resampling code. e.g. 'D' for day, 'W' for week, 'M' for month, etc.
    period : int
        The number of resampling periods. e.g. period=2 with resample_code='D' resamples the values over a 2 day period.
    fun : str
        The resampling function. i.e. mean, sum, count, min, or max. No median yet...
    val_round : int
        The number of decimals to round the values.
    where_in : dict
        A dictionary mapping column names to lists of values to filter on. e.g.: {'SnapshotType': ['value1', 'value2']}
    where_op : str
        If where_in has more than one key, the operator that connects the where clauses; must be either 'AND' or 'OR'.
    from_date : str
        The start date in the form '2010-01-01'.
    to_date : str
        The end date in the form '2010-01-01'.
    min_count : int
        The minimum number of values required to return groupby_cols. Only works when groupby_cols and values_cols are str.
    con : SQLAlchemy connectable (engine/connection) or database string URI
        The sqlalchemy connection to be passed to pandas.read_sql

    Returns
    -------
    DataFrame
        Pandas DataFrame with MultiIndex of groupby_cols and date_col
    """

    ## Create where statement
    where_lst, where_temp = sql_where_stmts(where_in=where_in,
                                            where_op=where_op,
                                            from_date=from_date,
                                            to_date=to_date,
                                            date_col=date_col)

    ## Create ts statement and append earlier where statement
    if isinstance(groupby_cols, str):
        groupby_cols = [groupby_cols]
    col_names1 = [
        '[' + i.encode('ascii', 'ignore').decode() + ']' for i in groupby_cols
    ]
    col_stmt = ', '.join(col_names1)

    ## Create sql stmt
    sql_stmt1 = sql_ts_agg_stmt(table,
                                groupby_cols=groupby_cols,
                                date_col=date_col,
                                values_cols=values_cols,
                                resample_code=resample_code,
                                period=period,
                                fun=fun,
                                val_round=val_round,
                                where_lst=where_lst)

    ## Create connection to database
    if con is None:
        con = create_engine('mssql', server, database)

    ## Make minimum count selection
    if (min_count is not None) and isinstance(min_count, int) and (len(groupby_cols) == 1):
        cols_count_str = ', '.join(
            [groupby_cols[0], 'count(' + values_cols + ') as count'])
        if isinstance(where_lst, list):
            stmt1 = "SELECT " + cols_count_str + " FROM " + "(" + sql_stmt1 + ") as agg" + " GROUP BY " + col_stmt + " HAVING count(" + values_cols + ") >= " + str(
                min_count)
        else:
            stmt1 = "SELECT " + cols_count_str + " FROM " + table + " GROUP BY " + col_stmt + " HAVING count(" + values_cols + ") >= " + str(
                min_count)

        up_sites = pd.read_sql(stmt1, con)[groupby_cols[0]].tolist()
        up_sites = [str(i) for i in up_sites]

        if not up_sites:
            raise ValueError('min_count filtered out all sites.')

        if where_in is None:
            where_in = {}
        where_in.update({groupby_cols[0]: up_sites})
        where_lst, where_temp = sql_where_stmts(where_in,
                                                where_op=where_op,
                                                from_date=from_date,
                                                to_date=to_date,
                                                date_col=date_col)

        ## Create sql stmt
        sql_stmt1 = sql_ts_agg_stmt(table,
                                    groupby_cols=groupby_cols,
                                    date_col=date_col,
                                    values_cols=values_cols,
                                    resample_code=resample_code,
                                    period=period,
                                    fun=fun,
                                    val_round=val_round,
                                    where_lst=where_lst)

    ## Create connection to database and execute sql statement
    df = pd.read_sql(sql_stmt1, con)

    ## Check to see if any data was found
    if df.empty:
        raise ValueError(
            'No data was found in the database for the parameters given.')

    ## set the index
    df[date_col] = pd.to_datetime(df[date_col])
    groupby_cols.append(date_col)
    df1 = df.set_index(groupby_cols).sort_index()

    return df1
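
A minimal usage sketch; the column names and resampling choices below are illustrative placeholders, not from the source:

# Hypothetical query: daily mean per site, returned with a
# (SiteID, RestrDate) MultiIndex.
ts_df = rd_sql_ts('SQL2012PROD03', 'LowFlows', 'LowFlowSiteRestrictionDaily',
                  groupby_cols='SiteID', date_col='RestrDate',
                  values_cols='BandNo', resample_code='D', fun='mean',
                  from_date='2010-01-01', to_date='2010-12-31')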