def create_temporary_table(self, df, temporary_table_name, key, engine):
    """
    Snippet to create a temporary table in the SQL database.
    Inspired by https://stackoverflow.com/questions/30867390/python-pandas-to-sql-how-to-create-a-table-with-a-primary-key
    """
    # Prefer the explicitly passed engine, fall back to the instance's
    eng = engine or self.engine
    with eng.connect() as conn, conn.begin():
        pandas_engine = pandasSQL_builder(conn)
        # creating a table
        table = TemporaryTable(temporary_table_name, pandas_engine, frame=df,
                               if_exists="replace")
        table.create()
        # dumping to the existing table
        df.to_sql(temporary_table_name, conn, index=False, if_exists="replace")

    # Simply add the primary key after uploading the table with pandas.
    with eng.connect() as con:
        con.execute('ALTER TABLE ' + temporary_table_name +
                    ' ADD PRIMARY KEY (' + key + ');')
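# Minimal usage sketch for create_temporary_table above, not from the original
# source: the connection URL, the "users_tmp" table, and the "id" key column
# are hypothetical, and `loader` stands in for whatever object owns the method.
def _example_create_temporary_table(loader):
    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine("postgresql://user:pass@localhost/db")  # hypothetical DSN
    df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
    loader.create_temporary_table(df, "users_tmp", key="id", engine=engine)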
def get_temp_table(conn, data, create=False, **kwargs):
    """Reuse Pandas logic for creating a temp table. The definition will be
    formed based on the first row of data passed."""
    table_name = get_temp_table_name()
    pandas_engine = pandasSQL_builder(conn, schema=kwargs.get("schema", None))
    if isinstance(conn, sqlite3.Connection):
        cls = SQLiteTemporaryTable
    elif is_sqlalchemy_conn(conn):
        cls = TemporaryTable
    else:
        raise AssertionError(
            "Only sqlite3 and SQLAlchemy conns are currently supported"
        )

    df = data
    if not isinstance(data, pd.DataFrame):
        # This reuses pandas logic for creating tables, but doesn't fully
        # convert rows to a dataframe since that may be expensive and
        # unwanted. Assumes it's an iterable of dicts and keys() returns
        # column names in proper order.
        first_row = data[0]
        df = pd.DataFrame.from_records([first_row], columns=first_row.keys())

    table = cls(table_name, pandas_engine, frame=df, if_exists="fail")
    if create:
        table.create()
    return table
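# Hedged example for get_temp_table above: assumes an in-memory sqlite3
# connection and a toy list of dict rows; get_temp_table_name and
# SQLiteTemporaryTable are expected to come from the surrounding module.
# Only the first row is used to infer the temp table's column layout.
def _example_get_temp_table():
    import sqlite3

    conn = sqlite3.connect(":memory:")
    rows = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
    table = get_temp_table(conn, rows, create=True)
    return table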
def _get_sqa_table(self, table_name: str, table_type: str, schema: str = None,
                   database: str = None, dtype=None):
    """Build and return the SQLAlchemy table object for the target frame."""
    self._load_df_if_empty()
    pandas_sql = pandasSQL_builder(con=self._con)
    sqllite = False
    if not isinstance(pandas_sql, SQLDatabase):
        sqllite = True
    pd_db = SQLDatabase(engine=self._con)
    table_klass = SQLTable if not sqllite else SQLiteTable
    target_frame = self._data if table_type == "staging" else self._his_data
    target_name = self._build_name(table=table_name, table_type=table_type)
    table = table_klass(
        name=target_name,
        pandas_sql_engine=pd_db,
        index=False,
        frame=target_frame,
    )
    return table._create_table_setup()
def load_table(filename, tablename, engine=None, infer_size=100, chunk_size=1000):
    engine = engine or sa.create_engine(config.SQLA_URI)
    file = ensure_csv(filename)
    # Pass data types to iterator to ensure consistent types across chunks
    dtypes = pd.read_csv(file.name, nrows=infer_size).dtypes
    chunks = pd.read_csv(file.name, chunksize=chunk_size, iterator=True,
                         dtype=dtypes)
    for idx, chunk in enumerate(chunks):
        chunk.index += chunk_size * idx
        sql_engine = pandasSQL_builder(engine)
        to_sql(
            tablename,
            sql_engine,
            chunk,
            chunksize=chunk_size,
            keys='index',
            if_exists='append',
        )
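# Illustrative call for load_table above, a sketch only: "events.csv" and the
# "events" table are assumed names, and ensure_csv plus the module-level
# to_sql helper are expected from the surrounding codebase. Passing an engine
# avoids the config.SQLA_URI fallback.
def _example_load_table():
    import sqlalchemy as sa

    engine = sa.create_engine("sqlite:///example.db")  # hypothetical target DB
    load_table("events.csv", "events", engine=engine, chunk_size=500)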
def execute_from_model(self, tt: task_models.TaskTemplate, **kwargs) -> typing.Any:
    if tt.custom["secret_connect_args"] is not None:
        for key, secret_dict in tt.custom["secret_connect_args"].items():
            value = current_context().secrets.get(
                group=secret_dict["group"], key=secret_dict["key"])
            tt.custom["connect_args"][key] = value

    engine = create_engine(tt.custom["uri"],
                           connect_args=tt.custom["connect_args"], echo=False)
    print(f"Connecting to db {tt.custom['uri']}")

    interpolated_query = SQLAlchemyTask.interpolate_query(
        tt.custom["query_template"], **kwargs)
    print(f"Interpolated query {interpolated_query}")

    with engine.begin() as connection:
        df = None
        if tt.interface.outputs:
            df = pd.read_sql_query(interpolated_query, connection)
        else:
            pandasSQL_builder(connection).execute(interpolated_query)
    return df
def get_sa_table_for_dataframe(dataframe, tablename, schemaname):
    sa_engine = get_engine()
    # get max lengths for strings and use it to set dtypes
    dtypes = {}
    object_types = get_dataframe_column_object_types(dataframe)
    for c in object_types:
        if dataframe[c].dtype == np.dtype('O'):
            n = dataframe[c].map(lambda v: len(str(v)) if v else None).max()
            # we use 10 times the max length, capped at varchar(65535)
            dtypes[c] = VARCHAR(min([n * 10, 65535]))
    # SQLTable expects a string mode for if_exists ('fail', 'replace', 'append')
    table = SQLTable(tablename,
                     pandasSQL_builder(sa_engine, schema=schemaname),
                     dataframe,
                     if_exists="replace",
                     index=False,
                     dtype=dtypes)
    return table
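# Sketch of calling get_sa_table_for_dataframe above; the frame, table, and
# schema names are assumptions. The returned SQLTable exposes sql_schema(),
# which is useful for inspecting the generated DDL before creating anything.
def _example_get_sa_table():
    import pandas as pd

    df = pd.DataFrame({"code": ["AA", "BBB"], "amount": [1.5, 2.0]})
    table = get_sa_table_for_dataframe(df, "demo_table", "public")
    print(table.sql_schema())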
def __init__(self, connection, table_name, primary_key,
             insert_timestamp_field=None, update_timestamp_field=None,
             use_on_duplicate=False):
    self.use_on_duplicate = use_on_duplicate
    self.connection = get_connection(connection)
    tps = table_name.split(".")
    self.table_name = tps[-1]
    self.schema = tps[0] if len(tps) > 1 else None
    if self.schema:
        self.full_table_name = "%s.%s" % (
            self.connection.ops.quote_name(self.schema),
            self.connection.ops.quote_name(self.table_name))
    else:
        self.full_table_name = self.connection.ops.quote_name(self.table_name)
    self.primary_key = primary_key
    self.fields = {}
    self.insert_timestamp_field = insert_timestamp_field
    self.update_timestamp_field = update_timestamp_field
    self.pd_sql = pandasSQL_builder(db_sqlalchemy_str(self.connection),
                                    schema=self.schema)
    self.detect_fields()
def to_redshift(self, table_name, s3_bucket, s3_key, engine=None, schema=None,
                if_exists="fail", index=False, compress=True, primary_key=None,
                aws_access_key_id=None, aws_secret_access_key=None, **kwargs):
    if not engine:
        engine = generate_redshift_engine_string()
    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Get Pandas SQLTable object
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)

    # Check table
    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                )
            ]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
            ]
        elif if_exists == "update":
            staging_table = "{}_staging".format(table_name)
            if not primary_key:
                raise ValueError(
                    "Expected a primary key to update existing table")
            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name),
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
                "delete from {full_table_name} where {primary_key} in "
                "(select {primary_key} from {staging_table});".format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table),
                "end;",
            ]
        else:
            raise ValueError("{} is not valid for if_exists".format(if_exists))
    else:
        queue = [
            table.sql_schema() + ";",
            CopyCommand(
                to=table,
                data_location="s3://{}/{}".format(s3_bucket, s3_key),
                access_key_id=aws_access_key_id,
                secret_access_key=aws_secret_access_key,
                format="CSV",
                compression="GZIP" if compress else None,
            ),
        ]

    # Save DataFrame to S3
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute queued statements
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)
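# Hedged call for the to_redshift method above, assuming it is bound to a
# DataFrame-like object that also provides to_s3. Bucket, key, table, and
# primary-key names are placeholders; AWS credentials are picked up from the
# AWS_* environment variables when not passed explicitly.
def _example_to_redshift(df):
    df.to_redshift(
        table_name="events",
        s3_bucket="my-etl-bucket",
        s3_key="staging/events.csv.gz",
        if_exists="update",
        primary_key="event_id",
    )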
def to_redshift(self, table_name, engine, bucket, keypath=None, schema=None,
                if_exists='fail', index=True, index_label=None,
                aws_access_key_id=None, aws_secret_access_key=None,
                columns=None, null_as=None, emptyasnull=True):
    """
    Write a DataFrame to redshift via S3

    Parameters
    =========
    table_name : str; (unqualified) name in redshift
    engine : SQLA engine
    bucket : str; s3 bucket
    keypath : str; keypath in s3 (without bucket name)
    schema : redshift schema
    if_exists : str; {'fail', 'append', 'replace'}
    index : bool; include the DataFrame's index
    index_label : bool; label for the index
    aws_access_key_id / aws_secret_access_key : from ~/.boto by default
    columns : subset of columns to include
    null_as : treat these values as null
    emptyasnull : bool; whether '' is treated as null
    """
    url = self.to_s3(keypath, engine, bucket=bucket, index=index,
                     index_label=index_label)
    qualname = resolve_qualname(table_name, schema)
    table = SQLTable(table_name, pandasSQL_builder(engine, schema=schema),
                     self, if_exists=if_exists, index=index)
    if columns is None:
        columns = ''
    else:
        columns = '({})'.format(','.join(columns))
    print("Creating table {}".format(qualname))

    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            queue = []
        elif if_exists == 'replace':
            queue = ['drop table {}'.format(qualname), table.sql_schema()]
        else:
            raise ValueError("Bad option for `if_exists`")
    else:
        queue = [table.sql_schema()]

    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

    s3conn = boto.connect_s3(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    conn = psycopg2.connect(database=engine.url.database,
                            user=engine.url.username,
                            password=engine.url.password,
                            host=engine.url.host,
                            port=engine.url.port,
                            sslmode='require')
    cur = conn.cursor()

    if null_as is not None:
        null_as = "NULL AS '{}'".format(null_as)
    else:
        null_as = ''

    if emptyasnull:
        emptyasnull = "EMPTYASNULL"
    else:
        emptyasnull = ''

    full_keypath = 's3://' + url

    print("COPYing")
    stmt = ("copy {qualname} {columns} from '{keypath}' "
            "credentials 'aws_access_key_id={key};aws_secret_access_key={secret}' "
            "GZIP "
            "{null_as} "
            "{emptyasnull} "
            "CSV;".format(qualname=qualname, columns=columns,
                          keypath=full_keypath,
                          key=s3conn.aws_access_key_id,
                          secret=s3conn.aws_secret_access_key,
                          null_as=null_as, emptyasnull=emptyasnull))
    cur.execute(stmt)
    conn.commit()
    conn.close()
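# Hedged usage for the to_redshift variant above: the engine is assumed to be
# a SQLAlchemy engine pointing at Redshift, and the bucket, keypath, and
# schema values are placeholders; AWS credentials default to ~/.boto as the
# docstring notes.
def _example_to_redshift_via_s3(df, engine):
    df.to_redshift(
        "events",
        engine,
        bucket="my-etl-bucket",
        keypath="staging/events.csv.gz",
        schema="analytics",
        if_exists="replace",
        index=False,
    )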
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    pandas_sql_engine = pandasSQL_builder(engine)
    table = SQLTable(name, pandas_sql_engine, frame=frame, **kwargs)
    table.create()
    table.insert(chunksize)
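# Minimal sketch for the to_sql helper above: it creates the table via pandas'
# SQLTable and inserts the frame in one go. The sqlite URL and table name are
# assumptions made for illustration.
def _example_to_sql():
    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine("sqlite:///example.db")
    frame = pd.DataFrame({"id": [1, 2], "value": [10.0, 20.0]})
    to_sql("demo_values", engine, frame, chunksize=100, index=False)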
def write(self, data_frame, routine_name, table_name, bucketname=None,
          if_exists='replace', sub_routine=None):
    """Write data table

    :param data_frame: dataframe
    :param routine_name: routine name
    :param table_name: table name
    :param bucketname: bucket name
    :param if_exists: method if exists
    :param sub_routine: sub routine
    :return: None
    """
    # todo this function is pretty verbose as it is, please use logger instead of print
    # todo make sure log statement is understandable for outside observer
    # todo bucketname should always be project_name, redshift should know its own project_name
    # todo when table is new, write metadata, but give an option to skip metadata
    self.bucket = bucketname
    if (table_name != 'meta_database') & (sub_routine is None):
        table_name = routine_name + '/' + table_name
    elif (table_name == 'meta_database') & (sub_routine is None):
        table_name = table_name
    else:
        table_name = routine_name + '/' + sub_routine + '/' + table_name
    print(table_name)
    logging.info('Writing table {} :'.format(table_name))

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucketname)
    con = psycopg2.connect(self.redshift_path)
    con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    cur = con.cursor()

    # write DF to string stream
    csv_buffer = StringIO()
    data_frame.to_csv(csv_buffer, index=None, header=None, sep='|')
    # reset stream position
    csv_buffer.seek(0)
    # create binary stream
    gz_buffer = BytesIO()
    # compress string stream using gzip
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8'))
    # write stream to S3
    timestamp = datetime.datetime.strftime(datetime.datetime.now(),
                                           '%Y%m%d%H%M%S')
    bucket.put_object(Key='tmp_' + timestamp + '.gz',
                      Body=gz_buffer.getvalue())
    print('saved file')

    # create the COPY statement to send from S3 to the table in Redshift
    s3_path_tmp_file = 's3://{0}/{1}'.format(bucketname,
                                             'tmp_' + timestamp + '.gz')
    print('create table')
    table = SQLTable(table_name, pandasSQL_builder(self.engine, schema=None),
                     data_frame, if_exists=if_exists, index=None)
    statements = []
    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            statements = []
        elif if_exists == 'replace':
            statements = [
                """ truncate "{}"; rollback; drop table "{}";""".format(
                    table_name, table_name)
            ]
        else:
            raise ValueError("Bad option for `if_exists`")
    statements.append(table.sql_schema() + ';')

    # AWS credentials were hard-coded in the original snippet; they are read
    # from the environment here instead of being embedded in the source.
    statement = """
        copy "{0}" from '{1}'
        delimiter '{2}' region 'us-east-1'
        CREDENTIALS 'aws_access_key_id={3};aws_secret_access_key={4}'
        FORMAT AS CSV NULL AS '@NULL@' GZIP TRUNCATECOLUMNS """.format(
        table_name, s3_path_tmp_file, '|',
        os.environ['AWS_ACCESS_KEY_ID'], os.environ['AWS_SECRET_ACCESS_KEY'])
    statements.append(statement)

    try:
        logging.info('execute statements')
        for stmt in statements:
            print(stmt)
            cur.execute(stmt)
        # con.commit()
        logging.info('finished execute')
    except Exception as e:
        print(e)
        traceback.print_exc(file=sys.stdout)
        con.rollback()
        raise
    s3.Object(bucketname, 'tmp_' + timestamp + '.gz').delete()
    logging.info('FILLING THE TABLE IN REDSHIFT')
    logging.info('\n--------------- write complete -----------------')