def _content_table(self):
    if self._content is not None:
        sql_table = SQLTable(
            name=self.schema_table_name,
            pandas_sql_engine=self._sql_engine,
            frame=self._content,
            if_exists="replace",
            index=False,
            schema=DATA_FRAME_CONTENT_SCHEMA,
        )
        index_col = Column(DATA_FRAME_CONTENT_INDEX_HEADER, INTEGER,
                           primary_key=True, autoincrement=True)
        sql_table.table.append_column(index_col)
        metadata_col = Column(METADATA_HEADER, JSONB, nullable=False,
                              server_default="{}")
        sql_table.table.append_column(metadata_col)
    else:
        sql_table = SQLTable(
            name=self.schema_table_name,
            pandas_sql_engine=self._sql_engine,
            if_exists="replace",
            index=True,
            index_label=DATA_FRAME_CONTENT_INDEX_HEADER,
            schema=DATA_FRAME_CONTENT_SCHEMA,
        )
    return sql_table
def to_sql(df, name, schema, con, index, if_exists, mode='default', **kwargs):
    """
    Override the default `pandas.to_sql` method to allow for insertion of
    multiple rows of data at once.

    This is derived from the upstream patch at
    https://github.com/pandas-dev/pandas/pull/21401, and can be deprecated
    once it is merged and released in a new version of `pandas`.
    """
    assert mode in ('default', 'multi'), 'unexpected `to_sql` mode {}'.format(mode)
    if mode == 'default':
        return df.to_sql(name=name, schema=schema, con=con, index=index,
                         if_exists=if_exists, **kwargs)
    else:
        nrows = len(df)
        if nrows == 0:
            return

        chunksize = kwargs.get('chunksize', nrows)
        if chunksize == 0:
            raise ValueError('chunksize argument should be non-zero')
        chunks = int(nrows / chunksize) + 1

        pd_sql = SQLDatabase(con)
        pd_table = SQLTable(name, pd_sql, frame=df, index=index,
                            if_exists=if_exists,
                            index_label=kwargs.get('index_label'),
                            schema=schema, dtype=kwargs.get('dtype'))
        pd_table.create()
        keys, data_list = pd_table.insert_data()

        with pd_sql.run_transaction() as conn:
            for i in range(chunks):
                start_i = i * chunksize
                end_i = min((i + 1) * chunksize, nrows)
                if start_i >= end_i:
                    break

                chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
                data = [dict(zip(keys, row)) for row in chunk_iter]
                conn.execute(pd_table.table.insert(data))  # multivalues insert
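# Hedged usage sketch for the wrapper above (not from the original source);
# the engine URL and frame contents are illustrative assumptions.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')  # hypothetical connection
frame = pd.DataFrame({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})

# mode='multi' batches rows into multi-value INSERT statements, which is
# typically much faster than row-by-row inserts over a network connection.
to_sql(frame, name='example_table', schema=None, con=engine,
       index=False, if_exists='replace', mode='multi', chunksize=2)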
def _create_table(schema: str, table_name: str, creds: SqlCreds,
                  df: pd.DataFrame, if_exists: str):
    """use pandas' own code to create the table and schema"""
    sql_db = SQLDatabase(engine=creds.engine, schema=schema)
    table = SQLTable(
        table_name,
        sql_db,
        frame=df,
        index=False,  # already set as new col earlier if index=True
        if_exists=if_exists,
        index_label=None,
        schema=schema,
        dtype=None,
    )
    table.create()
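# A minimal call sketch for _create_table, assuming SqlCreds wraps a
# SQLAlchemy engine as in the surrounding project; the constructor arguments
# and DataFrame below are purely illustrative assumptions.
import pandas as pd

creds = SqlCreds('localhost', 'example_db', 'user', 'password')  # hypothetical args
df = pd.DataFrame({'id': [1, 2], 'name': ['alpha', 'beta']})

# Creates dbo.my_table with columns inferred from the DataFrame dtypes,
# without inserting any rows.
_create_table(schema='dbo', table_name='my_table', creds=creds,
              df=df, if_exists='replace')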
def to_sql_set_primary_key_and_not_null(self, frame, name, con, keys,
                                        sql_table, schema=None,
                                        if_exists='fail', index=True,
                                        index_label=None, chunksize=None,
                                        dtype=None):
    # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L437
    if if_exists not in ('fail', 'replace', 'append'):
        raise ValueError("'{0}' is not valid for if_exists".format(if_exists))

    # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L508
    pandas_sql = SQLDatabase(con, schema=schema)

    if isinstance(frame, pd.Series):
        frame = frame.to_frame()
    elif not isinstance(frame, pd.DataFrame):
        raise NotImplementedError(
            "'frame' argument should be either a Series or a DataFrame")

    if dtype is not None:
        from sqlalchemy.types import to_instance, TypeEngine
        for col, my_type in dtype.items():
            if not isinstance(to_instance(my_type), TypeEngine):
                raise ValueError(
                    'The type of {} is not a SQLAlchemy type'.format(col))

    table = SQLTable(name, pandas_sql, frame=frame, index=index,
                     if_exists=if_exists, index_label=index_label,
                     schema=schema, keys=keys, dtype=dtype)
    # Swap in the caller-supplied SQLAlchemy Table so the created table
    # carries the PRIMARY KEY and NOT NULL constraints defined there.
    table.table = sql_table
    table.create()
    table.insert(chunksize)
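# Sketch of driving the helper above: build a SQLAlchemy Table carrying the
# desired PRIMARY KEY and NOT NULL constraints and pass it as `sql_table`.
# All names are illustrative assumptions; the `_Loader` holder exists only
# so the method (written with `self`) can be called in a runnable way.
import pandas as pd
from sqlalchemy import MetaData, Table, Column, Integer, String, create_engine

class _Loader:
    to_sql_set_primary_key_and_not_null = to_sql_set_primary_key_and_not_null

engine = create_engine('sqlite:///:memory:')
frame = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})
constrained = Table(
    'users', MetaData(),
    Column('id', Integer, primary_key=True),
    Column('name', String(50), nullable=False),
)

_Loader().to_sql_set_primary_key_and_not_null(
    frame, 'users', engine, keys=None, sql_table=constrained,
    if_exists='replace', index=False)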
def get_sa_table_for_dataframe(dataframe, tablename, schemaname):
    sa_engine = get_engine()
    # get max lengths for strings and use it to set dtypes
    dtypes = {}
    object_types = get_dataframe_column_object_types(dataframe)
    for c in object_types:
        if dataframe[c].dtype == np.dtype('O'):
            n = dataframe[c].map(lambda v: len(str(v)) if v else None).max()
            # we use 10 times the max length, capped at varchar(max)
            dtypes[c] = VARCHAR(min([n * 10, 65535]))
    table = SQLTable(tablename,
                     pandasSQL_builder(sa_engine, schema=schemaname),
                     dataframe,
                     if_exists='replace',  # `if_exists` expects a string, not True
                     index=False,
                     dtype=dtypes)
    return table
def to_redshift(self, table_name, s3_bucket, s3_key, engine=None,
                schema=None, if_exists="fail", index=False, compress=True,
                primary_key=None, aws_access_key_id=None,
                aws_secret_access_key=None, **kwargs):
    if not engine:
        engine = generate_redshift_engine_string()
    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Get Pandas SQLTable object
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)

    # The same COPY statement is queued in every branch below
    def copy_command():
        return CopyCommand(
            to=table,
            data_location="s3://{}/{}".format(s3_bucket, s3_key),
            access_key_id=aws_access_key_id,
            secret_access_key=aws_secret_access_key,
            format="CSV",
            compression="GZIP" if compress else None,
        )

    # Check table
    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [copy_command()]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                copy_command(),
            ]
        elif if_exists == "update":
            staging_table = "{}_staging".format(table_name)
            if not primary_key:
                raise ValueError(
                    "Expected a primary key to update existing table")
            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name),
                copy_command(),
                "delete from {full_table_name} where {primary_key} in "
                "(select {primary_key} from {staging_table});".format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table),
                "end;",
            ]
        else:
            raise ValueError("{} is not valid for if_exists".format(if_exists))
    else:
        queue = [table.sql_schema() + ";", copy_command()]

    # Save DataFrame to S3
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute queued statements
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)
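# Hedged call sketch: this writer is defined as a DataFrame method, so a
# monkey-patch like the one below is assumed. The bucket, key, and table
# names are illustrative; real AWS credentials and a Redshift engine are
# required to actually run it.
import pandas as pd

pd.DataFrame.to_redshift = to_redshift  # attach the method (assumption)
df = pd.DataFrame({'event_id': [1, 2], 'value': [0.5, 0.7]})

df.to_redshift(
    table_name='events',
    s3_bucket='my-bucket',            # hypothetical bucket
    s3_key='staging/events.csv.gz',   # hypothetical key
    if_exists='update',               # upsert path; requires primary_key
    primary_key='event_id',
)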
def to_redshift(self, table_name, engine, bucket, keypath=None, schema=None,
                if_exists='fail', index=True, index_label=None,
                aws_access_key_id=None, aws_secret_access_key=None,
                columns=None, null_as=None, emptyasnull=True):
    """
    Write a DataFrame to Redshift via S3

    Parameters
    ----------
    table_name : str; (unqualified) name in redshift
    engine : SQLA engine
    bucket : str; s3 bucket
    keypath : str; keypath in s3 (without bucket name)
    schema : redshift schema
    if_exists : str; {'fail', 'append', 'replace'}
    index : bool; include the DataFrame's index
    index_label : str; label for the index
    aws_access_key_id / aws_secret_access_key : from ~/.boto by default
    columns : subset of columns to include
    null_as : treat this value as null
    emptyasnull : bool; whether '' is null
    """
    url = self.to_s3(keypath, engine, bucket=bucket, index=index,
                     index_label=index_label)
    qualname = resolve_qualname(table_name, schema)
    table = SQLTable(table_name, pandasSQL_builder(engine, schema=schema),
                     self, if_exists=if_exists, index=index)
    if columns is None:
        columns = ''
    else:
        columns = '({})'.format(','.join(columns))

    print("Creating table {}".format(qualname))
    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            queue = []
        elif if_exists == 'replace':
            queue = ['drop table {}'.format(qualname), table.sql_schema()]
        else:
            raise ValueError("Bad option for `if_exists`")
    else:
        queue = [table.sql_schema()]

    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

    s3conn = boto.connect_s3(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    conn = psycopg2.connect(database=engine.url.database,
                            user=engine.url.username,
                            password=engine.url.password,
                            host=engine.url.host,
                            port=engine.url.port,
                            sslmode='require')
    cur = conn.cursor()

    if null_as is not None:
        null_as = "NULL AS '{}'".format(null_as)
    else:
        null_as = ''

    # trailing space needed so the keyword doesn't fuse with "CSV" below
    emptyasnull = "EMPTYASNULL " if emptyasnull else ''

    full_keypath = 's3://' + url

    print("COPYing")
    stmt = ("copy {qualname} {columns} from '{keypath}' "
            "credentials 'aws_access_key_id={key};aws_secret_access_key={secret}' "
            "GZIP "
            "{null_as} "
            "{emptyasnull}"
            "CSV;".format(qualname=qualname, columns=columns,
                          keypath=full_keypath,
                          key=s3conn.aws_access_key_id,
                          secret=s3conn.aws_secret_access_key,
                          null_as=null_as, emptyasnull=emptyasnull))
    cur.execute(stmt)
    conn.commit()
    conn.close()
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    table = SQLTable(name, engine, frame=frame, **kwargs)
    table.create()
    table.insert(chunksize)
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    pandas_sql_engine = pandasSQL_builder(engine)
    table = SQLTable(name, pandas_sql_engine, frame=frame, **kwargs)
    table.create()
    table.insert(chunksize)
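# Usage sketch for the two thin wrappers above; the second variant is the
# safer one, since SQLTable expects a pandas SQL engine (as produced by
# pandasSQL_builder) rather than a raw SQLAlchemy engine. The engine URL
# and frame are illustrative assumptions.
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')
frame = pd.DataFrame({'x': range(5)})

# Creates the table from the frame's schema, then inserts in chunks of 2.
to_sql('numbers', engine, frame, chunksize=2, index=False, if_exists='replace')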
def write(self, data_frame, routine_name, table_name, bucketname=None,
          if_exists='replace', sub_routine=None):
    """Write data table

    :param data_frame: dataframe
    :param routine_name: routine name
    :param table_name: table name
    :param bucketname: bucket name
    :param if_exists: method if exists
    :param sub_routine: sub routine
    :return: None
    """
    # todo this function is pretty verbose as it is, please use logger instead of print
    # todo make sure log statement is understandable for outside observer
    # todo bucketname should always be project_name, redshift should know its own project_name
    # todo when table is new, write metadata, but give an option to skip metadata
    self.bucket = bucketname
    if (table_name != 'meta_database') & (sub_routine is None):
        table_name = routine_name + '/' + table_name
    elif (table_name == 'meta_database') & (sub_routine is None):
        table_name = table_name
    else:
        table_name = routine_name + '/' + sub_routine + '/' + table_name
    print(table_name)
    logging.info('Writing table {} :'.format(table_name))

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucketname)
    con = psycopg2.connect(self.redshift_path)
    con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cur = con.cursor()

    # write DF to string stream
    csv_buffer = StringIO()
    data_frame.to_csv(csv_buffer, index=None, header=None, sep='|')
    # reset stream position
    csv_buffer.seek(0)
    # create binary stream
    gz_buffer = BytesIO()
    # compress string stream using gzip
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8'))

    # write stream to S3
    timestamp = datetime.datetime.strftime(datetime.datetime.now(),
                                           '%Y%m%d%H%M%S')
    bucket.put_object(Key='tmp_' + timestamp + '.gz',
                      Body=gz_buffer.getvalue())
    print('saved file')

    # create the COPY statement to send from S3 to the table in Redshift
    s3_path_tmp_file = 's3://{0}/{1}'.format(bucketname,
                                             'tmp_' + timestamp + '.gz')
    print('create table')
    table = SQLTable(table_name,
                     pandasSQL_builder(self.engine, schema=None),
                     data_frame, if_exists=if_exists, index=None)
    statements = []
    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            statements = []
        elif if_exists == 'replace':
            statements = [
                """ truncate "{}"; rollback; drop table "{}";""".format(
                    table_name, table_name)
            ]
        else:
            raise ValueError("Bad option for `if_exists`")
    statements.append(table.sql_schema() + ';')

    # AWS credentials are supplied via environment variables
    statement = """
    copy "{0}" from '{1}'
    delimiter '{2}' region 'us-east-1'
    CREDENTIALS 'aws_access_key_id={3};aws_secret_access_key={4}'
    FORMAT AS CSV NULL AS '@NULL@' GZIP TRUNCATECOLUMNS
    """.format(table_name, s3_path_tmp_file, '|',
               os.getenv('AWS_ACCESS_KEY_ID'),
               os.getenv('AWS_SECRET_ACCESS_KEY'))
    statements.append(statement)

    try:
        logging.info('execute statement')
        for stmt in statements:
            print(stmt)
            cur.execute(stmt)
        # con.commit()
        logging.info('finish execute')
    except Exception as e:
        print(e)
        traceback.print_exc(file=sys.stdout)
        con.rollback()
        raise
    s3.Object(bucketname, 'tmp_' + timestamp + '.gz').delete()
    logging.info('FILLING THE TABLE IN REDSHIFT')
    logging.info('\n--------------- write complete -----------------')
def read_sql_table(engine, table_name, index_col=None, columns=None,
                   select_from=None, limit=None, order_by=None, where=None,
                   coerce_types=None, raise_on_missing=True):
    """
    Load a table from a SQL database.

    Parameters
    ----------
    engine : SQLAlchemy engine
        The SQL database to load from.

    table_name : str
        The name of the table to load.

    index_col : str, optional
        Column name to use as index for the returned data frame.

    columns : sequence of str, optional
        Columns to select from the table. By default, all columns are
        selected.

    select_from : str or SQLAlchemy clause, optional
        A FROM clause to use for the select statement. Defaults to the
        table name.

    limit : int, optional
        Limit the number of rows selected.

    order_by : str or SQLAlchemy clause, optional
        An ORDER BY clause to sort the selected rows.

    where : str or SQLAlchemy clause, optional
        A WHERE clause used to filter the selected rows.

    coerce_types : dict(str : dtype or Python type), optional
        Override pandas type inference for specific columns.

    raise_on_missing : bool, optional
        Whether to raise if the table is not found; if False, return None
        instead.

    Returns
    -------
    A pandas DataFrame.
    """
    # Pandas does not expose many of these options, so we pull out some of
    # Pandas' internals.
    #
    # An alternative approach would be to use `pandas.read_sql_query` with an
    # appropriate (dialect-specific) query. However, this approach would not
    # utilize Pandas' logic for column type inference (performed by
    # `_harmonize_columns()` below), and would hence produce inferior results.

    from sqlalchemy.schema import MetaData
    from pandas.io.sql import SQLDatabase, SQLTable

    # From pandas.io.sql.read_sql_table
    # and pandas.io.sql.SQLDatabase.read_table:
    meta = MetaData(engine)
    try:
        meta.reflect(only=[table_name])
    except sqlalchemy.exc.InvalidRequestError:
        if raise_on_missing:
            raise ValueError("Table %s not found" % table_name)
        else:
            return None

    pd_db = SQLDatabase(engine, meta=meta)
    pd_tbl = SQLTable(table_name, pd_db, index=None)

    # Adapted from pandas.io.SQLTable.read:
    if columns is not None and len(columns) > 0:
        if index_col is not None and index_col not in columns:
            columns = [index_col] + columns
        cols = [pd_tbl.table.c[n] for n in columns]
    else:
        cols = pd_tbl.table.c

    if pd_tbl.index is not None:
        for idx in pd_tbl.index[::-1]:
            cols.insert(0, pd_tbl.table.c[idx])

    # Strip the table name from each of the column names to allow for more
    # general FROM clauses.
    sql_select = sqlalchemy.select([
        sqlalchemy.column(str(c).replace('{}.'.format(table_name), '', 1))
        for c in cols
    ])

    if select_from is not None:
        sql_select = sql_select.select_from(select_from)
    else:
        sql_select = sql_select.select_from(sqlalchemy.table(table_name))

    if where is not None:
        if isinstance(where, str):
            where = sqlalchemy.text(where)
        sql_select = sql_select.where(where)

    if limit is not None:
        sql_select = sql_select.limit(limit)

    if order_by is not None:
        if isinstance(order_by, str):
            order_by = sqlalchemy.sql.column(order_by)
        sql_select = sql_select.order_by(order_by)

    result = pd_db.execute(sql_select)
    data = result.fetchall()
    column_names = result.keys()

    pd_tbl.frame = pandas.DataFrame.from_records(data, index=index_col,
                                                 columns=column_names)

    # This line has caused issues with incorrect type inference -- add it
    # back with caution.
    # pd_tbl._harmonize_columns()

    # Coerce column types if requested.
    if coerce_types:
        frame = pd_tbl.frame
        for col, dtype in coerce_types.items():
            frame[col] = frame[col].astype(dtype, copy=False)

    return pd_tbl.frame
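# Usage sketch for read_sql_table above: fetch a filtered, ordered slice of
# a table while keeping the custom coercion hook. The engine URL, table, and
# column names are illustrative assumptions.
import sqlalchemy

engine = sqlalchemy.create_engine('postgresql://localhost/example')  # hypothetical
df = read_sql_table(
    engine, 'measurements',
    index_col='id',
    columns=['sensor', 'value'],
    where="sensor = 'temperature'",
    order_by='value',
    limit=100,
    coerce_types={'value': 'float64'},
    raise_on_missing=False,  # return None instead of raising if absent
)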
class AutoGrowTable(object):

    def __init__(self, connection, table_name, primary_key,
                 insert_timestamp_field=None, update_timestamp_field=None,
                 use_on_duplicate=False):
        self.use_on_duplicate = use_on_duplicate
        self.connection = get_connection(connection)
        tps = table_name.split(".")
        self.table_name = tps[-1]
        self.schema = tps[0] if len(tps) > 1 else None
        if self.schema:
            self.full_table_name = "%s.%s" % (
                self.connection.ops.quote_name(self.schema),
                self.connection.ops.quote_name(self.table_name))
        else:
            self.full_table_name = self.connection.ops.quote_name(
                self.table_name)
        self.primary_key = primary_key
        self.fields = {}
        self.insert_timestamp_field = insert_timestamp_field
        self.update_timestamp_field = update_timestamp_field
        self.pd_sql = pandasSQL_builder(db_sqlalchemy_str(self.connection),
                                        schema=self.schema)
        self.detect_fields()

    def detect_fields(self):
        try:
            self.fields = [
                f.lower() for f in get_table_fields(
                    self.connection, self.table_name, self.schema)
            ]
        except Exception as e:
            err_str = str(e)
            if "does not exist" in err_str:
                return
            log.error("AutoGrowTable.detect_fields %s %s error: %s",
                      self.connection.alias, self.table_name, e)

    def get_field_definition(self, fields):
        return ",".join(["%s %s" % (f, ftype(f)) for f in fields])

    def create_table(self, df):
        exists = self.pd_sql.has_table(self.table_name)
        dtypes = dict((c, dtype(str(dt))) for c, dt in df.dtypes.items())
        new_fields = [
            "%s %s" % (f, ftype(dt)) for f, dt in dtypes.items()
            if f.lower() not in self.fields
        ]
        if self.update_timestamp_field and self.update_timestamp_field not in self.fields:
            new_fields.append("%s timestamp default CURRENT_TIMESTAMP" %
                              self.update_timestamp_field)
        if self.insert_timestamp_field and self.insert_timestamp_field not in self.fields:
            new_fields.append("%s timestamp default CURRENT_TIMESTAMP" %
                              self.insert_timestamp_field)
        with self.connection.cursor() as cursor:
            if not exists:
                sql = "create table %s(%s)" % (self.full_table_name,
                                               ",".join(new_fields))
                cursor.execute(sql)
                sql = "alter table %s add primary key(%s)" % (
                    self.full_table_name, self.primary_key)
                cursor.execute(sql)
                self.detect_fields()
            else:
                if new_fields:
                    sql = "alter table %s add column %s" % (
                        self.full_table_name,
                        ", add column ".join(new_fields))
                    cursor.execute(sql)

    def create_table2(self, df):
        from . import dbutils
        fields = get_db_schema_fields(df)
        if self.insert_timestamp_field:
            fields[self.insert_timestamp_field] = dict(type='DateTimeField',
                                                       params=dict(null=True))
        if self.update_timestamp_field:
            fields[self.update_timestamp_field] = dict(type='DateTimeField',
                                                       params=dict(null=True))
        dbutils.create_table(self.connection, self.table_name, fields,
                             schema=self.schema, force_lower_name=True,
                             primary_key=self.primary_key)

    def run(self, data_frame):
        df = data_frame
        lower_column_name(df)
        self.create_table(df)
        if self.use_on_duplicate:
            return self.batch_insert(df)
        else:
            errors = self.insert_or_update(df)
            return errors

    def gen_sql_table(self, df):
        from pandas.io.sql import SQLTable
        from sqlalchemy import Column, DateTime
        self.table = SQLTable(self.table_name, self.pd_sql, df, index=False,
                              schema=self.schema).table.tometadata(
                                  self.pd_sql.meta)
        if self.update_timestamp_field and self.update_timestamp_field not in self.table.columns:
            self.table.append_column(
                Column(self.update_timestamp_field, DateTime))
        if self.insert_timestamp_field and self.insert_timestamp_field not in self.table.columns:
            self.table.append_column(
                Column(self.insert_timestamp_field, DateTime))

    def gen_rows(self, df):
        for i in range(len(df)):
            s = df.iloc[i]
            yield [type_convert(a) for a in s.tolist()]

    def insert_or_update(self, df):
        self.gen_sql_table(df)
        errors = []
        df = format_timestamp(tz_convert(df))
        pks = [k.strip() for k in self.primary_key.split(",")]
        efs = ['1 as a']
        if self.insert_timestamp_field:
            efs.append(self.insert_timestamp_field)
        if self.update_timestamp_field:
            efs.append(self.update_timestamp_field)
        sql_template = "select %s from %s where %%s" % (",".join(efs),
                                                        self.full_table_name)
        quote_name = self.connection.ops.quote_name
        for i in range(len(df)):
            try:
                s = df.iloc[i]
                d = clear_dict_nan_value(s.to_dict())
                where = " and ".join([
                    "%s='%s'" % (quote_name(pk), d[pk.lower()]) for pk in pks
                ])
                sql = sql_template % where
                rs = self.pd_sql.read_sql(sql, coerce_float=False)
                now = datetime.now().isoformat()
                if not rs.empty:
                    r = rs.iloc[0]
                    if self.update_timestamp_field:
                        d[self.update_timestamp_field] = now
                    if self.insert_timestamp_field:
                        d[self.insert_timestamp_field] = r[
                            self.insert_timestamp_field]
                    self.table.update().where(where).values(d).execute()
                else:
                    if self.insert_timestamp_field:
                        d[self.insert_timestamp_field] = now
                    if self.update_timestamp_field:
                        d[self.update_timestamp_field] = now
                    self.table.insert(d).execute()
            except Exception as e:
                errors.append(([d[k.lower()] for k in pks], str(e)))
        if errors:
            log.error(
                "pandas.AutoGrowTable %s.%s insert_or_update got %d errors: %s",
                self.connection.alias, self.table_name, len(errors), errors)
        return errors

    def update(self, df):
        for r in range(len(df)):
            self.table.update(df.iloc[r].to_dict()).execute()

    def batch_insert(self, df, chunk=1000):
        from . import dbutils
        df = tz_convert(df)
        fields = get_db_schema_fields(df)
        update_values = {}
        if self.update_timestamp_field:
            update_values[self.update_timestamp_field] = 'CURRENT_TIMESTAMP'
        insert_values = {}
        if self.insert_timestamp_field:
            insert_values[self.insert_timestamp_field] = 'CURRENT_TIMESTAMP'
        if self.update_timestamp_field:
            insert_values[self.update_timestamp_field] = 'CURRENT_TIMESTAMP'
        sql = dbutils.gen_batch_insert_sql(self.table_name, fields,
                                           self.primary_key,
                                           insert_values=insert_values,
                                           update_values=update_values,
                                           vendor=self.connection.vendor)
        print(sql)
        with self.connection.cursor() as cursor:
            dbutils.batch_execute(cursor, sql, self.gen_rows(df), chunk=chunk)
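# Hedged end-to-end sketch for AutoGrowTable: on first run it creates the
# table (plus primary key); later runs add any new columns and upsert rows
# keyed on the primary key. The connection alias, table name, and data are
# illustrative assumptions.
import pandas as pd

table = AutoGrowTable('default', 'reports.daily_stats', primary_key='day',
                      insert_timestamp_field='created_at',
                      update_timestamp_field='updated_at')

df = pd.DataFrame({'day': ['2024-01-01'], 'clicks': [42]})
errors = table.run(df)  # returns a list of (primary-key values, error) pairs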