def _get_sqa_table(self, table_name: str, table_type: str, schema: str = None,
                   database: str = None, dtype=None):
    """Build a pandas SQLTable for the staging or history frame and return
    its table-creation setup.

    Parameters
    ----------
    table_name : str
        Base name of the target table (expanded by ``self._build_name``).
    table_type : str
        ``"staging"`` selects ``self._data``; any other value selects
        ``self._his_data``.
    schema, database, dtype
        Accepted for interface compatibility; not used by this method.

    Returns
    -------
    The result of ``SQLTable._create_table_setup()``.
    NOTE(review): the original annotation said ``-> None`` but the method
    clearly returns a value; presumably a SQLAlchemy ``Table`` (or the
    CREATE statement(s) for the SQLite fallback) — confirm against the
    pandas version in use.
    """
    self._load_df_if_empty()
    # Probe the connection type: a non-SQLAlchemy connection must use the
    # fallback SQLite table implementation.
    pandas_sql = pandasSQL_builder(con=self._con)
    use_sqlite = not isinstance(pandas_sql, SQLDatabase)
    # Reuse the already-built SQLDatabase rather than constructing a
    # second one for the same connection (original built it twice).
    pd_db = pandas_sql if not use_sqlite else SQLDatabase(engine=self._con)
    table_klass = SQLiteTable if use_sqlite else SQLTable
    target_frame = self._data if table_type == "staging" else self._his_data
    target_name = self._build_name(table=table_name, table_type=table_type)
    table = table_klass(
        name=target_name,
        pandas_sql_engine=pd_db,
        index=False,
        frame=target_frame,
    )
    return table._create_table_setup()
def to_sql(df, name, schema, con, index, if_exists, mode='default', **kwargs):
    """
    Override the default `pandas.to_sql` method to allow for insertion of
    multiple rows of data at once.

    This is derived from the upstream patch at
    https://github.com/pandas-dev/pandas/pull/21401, and can be deprecated
    once it is merged and released in a new version of `pandas`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    name : str
        Target table name.
    schema : str or None
        Target schema name.
    con : connectable
        Database connection or SQLAlchemy engine.
    index : bool
        Whether to write the frame index as a column.
    if_exists : {'fail', 'replace', 'append'}
        Behaviour when the table already exists.
    mode : {'default', 'multi'}
        'default' delegates to ``df.to_sql``; 'multi' issues multi-row
        INSERT statements in chunks.
    **kwargs
        Extra options; in 'multi' mode, 'chunksize', 'dtype' and
        'insert_label' are honoured.

    Raises
    ------
    ValueError
        If ``mode`` is unrecognised or ``chunksize`` is zero.
    """
    # Raise instead of assert: asserts are stripped under ``python -O``.
    if mode not in ('default', 'multi'):
        raise ValueError('unexpected `to_sql` mode {}'.format(mode))

    if mode == 'default':
        return df.to_sql(name=name, schema=schema, con=con, index=index,
                         if_exists=if_exists, **kwargs)

    nrows = len(df)
    if nrows == 0:
        # Nothing to insert; mirror pandas' no-op behaviour on empty frames.
        return

    chunksize = kwargs.get('chunksize', nrows)
    if chunksize is None:
        # Callers commonly pass chunksize=None meaning "one single chunk";
        # the original crashed here with int(nrows / None).
        chunksize = nrows
    if chunksize == 0:
        raise ValueError('chunksize argument should be non-zero')
    # Ceiling division: the original's int(nrows / chunksize) + 1 scheduled
    # an extra empty trailing chunk that was skipped via break.
    chunks = (nrows + chunksize - 1) // chunksize

    pd_sql = SQLDatabase(con)
    pd_table = SQLTable(name, pd_sql, frame=df, index=index,
                        if_exists=if_exists,
                        index_label=kwargs.get('insert_label'),
                        schema=schema, dtype=kwargs.get('dtype'))
    pd_table.create()
    keys, data_list = pd_table.insert_data()

    with pd_sql.run_transaction() as conn:
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
            data = [dict(zip(keys, row)) for row in chunk_iter]
            conn.execute(pd_table.table.insert(data))  # multivalues insert
def _create_table(schema: str, table_name: str, creds: SqlCreds, df: pd.DataFrame, if_exists: str):
    """Create the destination table (and schema) using pandas' own DDL machinery."""
    pandas_db = SQLDatabase(engine=creds.engine, schema=schema)
    # index=False on purpose: when index=True the caller has already
    # materialised the index as an ordinary column upstream.
    sqa_table = SQLTable(
        table_name,
        pandas_db,
        frame=df,
        index=False,
        if_exists=if_exists,
        index_label=None,
        schema=schema,
        dtype=None,
    )
    sqa_table.create()
def to_sql_set_primary_key_and_not_null(self, frame, name, con, keys, sql_table, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None):
    """Write ``frame`` to the database using a caller-supplied SQLAlchemy
    table definition, so PRIMARY KEY / NOT NULL constraints can be applied.

    Mirrors ``pandas.io.sql.to_sql`` (see the pinned refs below), except
    that the auto-generated table object is replaced with ``sql_table``
    before creation.

    Parameters
    ----------
    frame : pandas.Series or pandas.DataFrame
        Data to write; a Series is promoted to a one-column DataFrame.
    name : str
        Target table name.
    con : connectable
        Connection or engine accepted by ``SQLDatabase``.
    keys : sequence
        Column name(s) forwarded to ``SQLTable`` as primary-key candidates.
    sql_table : sqlalchemy.Table
        Pre-built table definition carrying the desired constraints;
        substituted for the auto-generated one before ``create()``.
    schema, if_exists, index, index_label, chunksize, dtype
        Same meaning as in ``pandas.DataFrame.to_sql``.

    Raises
    ------
    ValueError
        If ``if_exists`` is invalid, or a ``dtype`` value is not a
        SQLAlchemy type.
    NotImplementedError
        If ``frame`` is neither a Series nor a DataFrame.
    """
    # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L437
    if if_exists not in ('fail', 'replace', 'append'):
        raise ValueError(
            "'{0}' is not valid for if_exists".format(if_exists))
    # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L508
    pandas_sql = SQLDatabase(con, schema=schema)
    if isinstance(frame, pd.Series):
        # Promote to DataFrame so the SQLTable machinery can handle it.
        frame = frame.to_frame()
    elif not isinstance(frame, pd.DataFrame):
        raise NotImplementedError(
            "'frame' argument should be either a Series or a DataFrame")
    if dtype is not None:
        # Validate each dtype override up front, as pandas itself does.
        from sqlalchemy.types import to_instance, TypeEngine
        for col, my_type in dtype.items():
            if not isinstance(to_instance(my_type), TypeEngine):
                raise ValueError(
                    'The type of {} is not a SQLAlchemy type '.format(col))
    table = SQLTable(name, pandas_sql, frame=frame, index=index,
                     if_exists=if_exists, index_label=index_label,
                     schema=schema, keys=keys, dtype=dtype)
    # Swap in the caller's table definition BEFORE create(): this is the
    # whole point of the function — create() then emits DDL with the
    # caller's primary-key / not-null constraints instead of the
    # auto-generated ones. Order matters here.
    table.table = sql_table
    table.create()
    table.insert(chunksize)
def _sql_engine(self):
    """Return the module-level ``db`` engine wrapped in a pandas ``SQLDatabase``."""
    engine = db.engine
    return SQLDatabase(engine)
def read_sql_table(engine, table_name, index_col=None, columns=None,
                   select_from=None, limit=None, order_by=None, where=None,
                   coerce_types=None, raise_on_missing=True):
    """
    Load a table from a SQL database.

    Parameters
    ----------
    engine : SQLAlchemy engine
        The SQL database to load from.
    table_name : str
        The name of the table to load.
    index_col : str, optional
        Column name to use as index for the returned data frame.
    columns : sequence of str, optional
        Columns to select from the table. By default, all columns are
        selected.
    select_from : str or SQLAlchemy clause, optional
        A FROM clause to use for the select statement. Defaults to the
        table name.
    limit : int, optional
        Limit the number of rows selected.
    order_by : str or SQLAlchemy clause, optional
        An ORDER BY clause to sort the selected rows.
    where : str or SQLAlchemy clause, optional
        A WHERE clause used to filter the selected rows.
    coerce_types : dict(str : dtype or Python type), optional
        Override pandas type inference for specific columns.
    raise_on_missing : bool, optional
        If True (default), raise ValueError when the table does not exist;
        otherwise return None.

    Returns
    -------
    A pandas DataFrame, or None if the table is missing and
    ``raise_on_missing`` is False.
    """
    # Pandas does not expose many of these options, so we pull out some of
    # Pandas' internals.
    #
    # An alternative approach would be to use `pandas.read_sql_query` with an
    # appropriate (dialect-specific) query. However, this approach would not
    # utilize Pandas' logic for column type inference (performed by
    # `_harmonize_columns()` below), and would hence produce inferior results.
    from sqlalchemy.schema import MetaData
    from pandas.io.sql import SQLDatabase, SQLTable

    # From pandas.io.sql.read_sql_table
    # and pandas.io.sql.SQLDatabase.read_table:
    meta = MetaData(engine)
    try:
        meta.reflect(only=[table_name])
    except sqlalchemy.exc.InvalidRequestError:
        if raise_on_missing:
            raise ValueError("Table %s not found" % table_name)
        else:
            return None

    pd_db = SQLDatabase(engine, meta=meta)
    pd_tbl = SQLTable(table_name, pd_db, index=None)

    # Adapted from pandas.io.SQLTable.read:
    if columns is not None and len(columns) > 0:
        if index_col is not None and index_col not in columns:
            columns = [index_col] + columns
        cols = [pd_tbl.table.c[n] for n in columns]
    else:
        cols = pd_tbl.table.c

    if pd_tbl.index is not None:
        # Plain loop instead of a side-effect list comprehension: prepend
        # the index columns in their original order.
        for idx in pd_tbl.index[::-1]:
            cols.insert(0, pd_tbl.table.c[idx])

    # Strip the table name from each of the column names to allow for more
    # general FROM clauses.
    sql_select = sqlalchemy.select([
        sqlalchemy.column(str(c).replace('{}.'.format(table_name), '', 1))
        for c in cols
    ])
    if select_from is not None:
        sql_select = sql_select.select_from(select_from)
    else:
        sql_select = sql_select.select_from(sqlalchemy.table(table_name))
    if where is not None:
        # Python 3 fix: `basestring` no longer exists; use str.
        if isinstance(where, str):
            where = sqlalchemy.text(where)
        sql_select = sql_select.where(where)
    if limit is not None:
        sql_select = sql_select.limit(limit)
    if order_by is not None:
        if isinstance(order_by, str):
            order_by = sqlalchemy.sql.column(order_by)
        sql_select = sql_select.order_by(order_by)

    result = pd_db.execute(sql_select)
    data = result.fetchall()
    column_names = result.keys()

    pd_tbl.frame = pandas.DataFrame.from_records(data, index=index_col,
                                                 columns=column_names)

    # This line has caused issues with incorrect type inference -- add it
    # back with caution.
    # pd_tbl._harmonize_columns()

    # Coerce column dtypes if requested.
    if coerce_types:
        frame = pd_tbl.frame
        # Python 3 fix: dict.iteritems() was removed; use items().
        for col, dtype in coerce_types.items():
            frame[col] = frame[col].astype(dtype, copy=False)

    return pd_tbl.frame