def to_sql_set_primary_key_and_not_null(self, frame, name, con, keys, sql_table,
                                        schema=None, if_exists='fail',
                                        index=True, index_label=None,
                                        chunksize=None, dtype=None):
    """Write ``frame`` to a database using a caller-supplied table definition.

    The flow follows pandas' own ``to_sql`` helpers, except that the table
    object built by ``SQLTable`` is replaced with ``sql_table`` before the
    table is created and the rows are inserted.

    Parameters:
        self: unused by this function.
        frame: a pandas ``Series`` (converted with ``to_frame()``) or
            ``DataFrame``.
        name: table name handed to ``SQLTable``.
        con: connection object handed to ``SQLDatabase``.
        keys: forwarded to ``SQLTable`` as ``keys``.
        sql_table: object assigned to ``SQLTable.table`` in place of the
            generated one.
        schema: forwarded to both ``SQLDatabase`` and ``SQLTable``.
        if_exists: one of ``'fail'``, ``'replace'`` or ``'append'``.
        index, index_label: forwarded to ``SQLTable``.
        chunksize: forwarded to ``SQLTable.insert``.
        dtype: optional mapping of column name to SQLAlchemy type.

    Raises:
        ValueError: if ``if_exists`` is not an accepted value, or a ``dtype``
            entry is not a SQLAlchemy type.
        NotImplementedError: if ``frame`` is neither a Series nor a DataFrame.
    """
    # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L437
    accepted_modes = ('fail', 'replace', 'append')
    if if_exists not in accepted_modes:
        raise ValueError(
            "'{0}' is not valid for if_exists".format(if_exists))

    # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L508
    database = SQLDatabase(con, schema=schema)

    if isinstance(frame, pd.Series):
        frame = frame.to_frame()
    if not isinstance(frame, pd.DataFrame):
        raise NotImplementedError(
            "'frame' argument should be either a Series or a DataFrame")

    if dtype is not None:
        from sqlalchemy.types import to_instance, TypeEngine
        for column, sql_type in dtype.items():
            if isinstance(to_instance(sql_type), TypeEngine):
                continue
            raise ValueError(
                'The type of {} is not a SQLAlchemy type '.format(column))

    writer = SQLTable(name, database, frame=frame, index=index,
                      if_exists=if_exists, index_label=index_label,
                      schema=schema, keys=keys, dtype=dtype)
    # Swap in the caller's table definition before anything is emitted.
    writer.table = sql_table
    writer.create()
    writer.insert(chunksize)
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    """Create table ``name`` for ``frame`` and insert the frame's rows.

    ``engine`` is handed to ``SQLTable`` unchanged, and any extra keyword
    arguments are forwarded to the ``SQLTable`` constructor. ``chunksize``
    is passed to ``SQLTable.insert``. Nothing is returned.

    NOTE(review): a later definition of ``to_sql`` with the same signature
    appears in this file and would rebind the name if both live in the same
    module - confirm which one is intended.
    """
    sql_table = SQLTable(name, engine, frame=frame, **kwargs)
    sql_table.create()
    sql_table.insert(chunksize)
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    """Create table ``name`` for ``frame`` and insert the frame's rows.

    ``engine`` is first wrapped with ``pandasSQL_builder``; the resulting
    object, not the raw engine, is what ``SQLTable`` receives. Extra keyword
    arguments are forwarded to the ``SQLTable`` constructor and ``chunksize``
    is passed to ``SQLTable.insert``. Nothing is returned.
    """
    sql_backend = pandasSQL_builder(engine)
    sql_table = SQLTable(name, sql_backend, frame=frame, **kwargs)
    sql_table.create()
    sql_table.insert(chunksize)
class AutoGrowTable(object):
    """Write DataFrames into a table, creating it or adding columns as needed.

    The table is addressed through a project connection (``get_connection``)
    for raw SQL and through a pandas SQL object (``pandasSQL_builder``) for
    existence checks and reads.
    """

    def __init__(self, connection, table_name, primary_key,
                 insert_timestamp_field=None, update_timestamp_field=None,
                 use_on_duplicate=False):
        """Bind to ``table_name`` and load its current column names.

        Parameters:
            connection: passed to ``get_connection`` to obtain the connection.
            table_name: ``"table"`` or ``"schema.table"``; the text before
                the first dot is the schema, the text after the last dot is
                the table name.
            primary_key: comma-separated primary key column name(s).
            insert_timestamp_field: optional column stamped on insert.
            update_timestamp_field: optional column stamped on insert and
                update.
            use_on_duplicate: when true, ``run`` uses ``batch_insert``
                instead of the row-by-row ``insert_or_update``.
        """
        self.use_on_duplicate = use_on_duplicate
        self.connection = get_connection(connection)
        parts = table_name.split(".")
        self.table_name = parts[-1]
        # An empty schema part (e.g. ".table") is treated as "no schema".
        self.schema = (parts[0] or None) if len(parts) > 1 else None
        quote_name = self.connection.ops.quote_name
        if self.schema:
            self.full_table_name = "%s.%s" % (quote_name(self.schema),
                                              quote_name(self.table_name))
        else:
            self.full_table_name = quote_name(self.table_name)
        self.primary_key = primary_key
        # Replaced by a list of lower-cased names once detect_fields succeeds.
        self.fields = {}
        self.insert_timestamp_field = insert_timestamp_field
        self.update_timestamp_field = update_timestamp_field
        self.pd_sql = pandasSQL_builder(db_sqlalchemy_str(self.connection),
                                        schema=self.schema)
        self.detect_fields()

    def detect_fields(self):
        """Refresh ``self.fields`` with the table's lower-cased column names.

        Failures are never raised: an error whose text contains
        "does not exist" is ignored silently, any other error is logged.
        In both cases ``self.fields`` keeps its previous value.
        """
        try:
            self.fields = [
                f.lower() for f in get_table_fields(
                    self.connection, self.table_name, self.schema)
            ]
        except Exception as e:
            if "does not exist" in str(e):
                return
            log.error("AutoGrowTable.detect_fields %s %s error: %s",
                      self.connection.alias, self.table_name, e)

    def get_field_definition(self, fields):
        """Return a comma-separated ``"<name> <type>"`` list for ``fields``.

        NOTE(review): ``ftype`` is called with the field *name* here, while
        ``create_table`` calls it with a dtype - confirm this is intended.
        """
        return ",".join("%s %s" % (f, ftype(f)) for f in fields)

    def create_table(self, df):
        """Create the table for ``df``, or add the columns it is missing.

        Columns of ``df`` (and the configured timestamp columns) that are not
        in ``self.fields`` are created. A new table also gets its primary
        key. ``self.fields`` is refreshed after any schema change so a later
        call does not try to add the same columns again.
        """
        exists = self.pd_sql.has_table(self.table_name)
        # .items() works on Python 2 and 3 for both Series and dict.
        dtypes = {c: dtype(str(dt)) for c, dt in df.dtypes.items()}
        new_fields = [
            "%s %s" % (f, ftype(dt))
            for f, dt in dtypes.items()
            if f.lower() not in self.fields
        ]
        for ts_field in (self.update_timestamp_field,
                         self.insert_timestamp_field):
            if ts_field and ts_field not in self.fields:
                new_fields.append(
                    "%s timestamp default CURRENT_TIMESTAMP" % ts_field)
        with self.connection.cursor() as cursor:
            if not exists:
                cursor.execute("create table %s(%s)" % (
                    self.full_table_name, ",".join(new_fields)))
                cursor.execute("alter table %s add primary key(%s)" % (
                    self.full_table_name, self.primary_key))
                self.detect_fields()
            elif new_fields:
                cursor.execute("alter table %s add column %s" % (
                    self.full_table_name,
                    ", add column ".join(new_fields)))
                # Keep the cached column list in sync with the altered table.
                self.detect_fields()

    def create_table2(self, df):
        """Create the table through ``dbutils.create_table``.

        The field description comes from ``get_db_schema_fields(df)``; the
        configured timestamp columns are added as nullable ``DateTimeField``
        entries.
        """
        from . import dbutils
        fields = get_db_schema_fields(df)
        for ts_field in (self.insert_timestamp_field,
                         self.update_timestamp_field):
            if ts_field:
                fields[ts_field] = dict(type='DateTimeField',
                                        params=dict(null=True))
        dbutils.create_table(self.connection,
                             self.table_name,
                             fields,
                             schema=self.schema,
                             force_lower_name=True,
                             primary_key=self.primary_key)

    def run(self, data_frame):
        """Make the table fit ``data_frame`` and write its rows.

        Returns the result of ``batch_insert`` when ``use_on_duplicate`` is
        set, otherwise the error list from ``insert_or_update``.
        """
        df = data_frame
        # Return value is ignored; presumably renames columns in place.
        lower_column_name(df)
        self.create_table(df)
        if self.use_on_duplicate:
            return self.batch_insert(df)
        return self.insert_or_update(df)

    def gen_sql_table(self, df):
        """Build ``self.table`` for ``df``.

        The table comes from pandas' ``SQLTable`` copied into
        ``self.pd_sql.meta``; the configured timestamp columns are appended
        as ``DateTime`` columns when missing.
        """
        from pandas.io.sql import SQLTable
        from sqlalchemy import Column, DateTime
        self.table = SQLTable(self.table_name,
                              self.pd_sql,
                              df,
                              index=False,
                              schema=self.schema).table.tometadata(
                                  self.pd_sql.meta)
        for ts_field in (self.update_timestamp_field,
                         self.insert_timestamp_field):
            if ts_field and ts_field not in self.table.columns:
                self.table.append_column(Column(ts_field, DateTime))

    def gen_rows(self, df):
        """Yield each row of ``df`` as a list passed through ``type_convert``."""
        for i in range(len(df)):
            yield [type_convert(a) for a in df.iloc[i].tolist()]

    def insert_or_update(self, df):
        """Insert or update ``df`` one row at a time, keyed on the primary key.

        For each row a SELECT on the primary key decides between UPDATE and
        INSERT. Per-row failures are collected rather than raised.

        Returns:
            A list of ``(primary_key_values, error_message)`` tuples, one per
            failed row; empty when every row succeeded.
        """
        self.gen_sql_table(df)
        errors = []
        df = format_timestamp(tz_convert(df))
        pks = [k.strip() for k in self.primary_key.split(",")]
        efs = ['1 as a']
        if self.insert_timestamp_field:
            efs.append(self.insert_timestamp_field)
        if self.update_timestamp_field:
            efs.append(self.update_timestamp_field)
        sql_template = "select %s from %s where %%s" % (
            ",".join(efs), self.full_table_name)
        quote_name = self.connection.ops.quote_name
        for i in range(len(df)):
            # Reset per row so the error report below never shows stale keys.
            d = {}
            try:
                s = df.iloc[i]
                d = clear_dict_nan_value(s.to_dict())
                # NOTE(review): key values are interpolated into the SQL text
                # unescaped; a value containing a quote breaks the statement.
                # Consider bound parameters if the data is not trusted.
                where = " and ".join([
                    "%s='%s'" % (quote_name(pk), d[pk.lower()]) for pk in pks
                ])
                sql = sql_template % where
                rs = self.pd_sql.read_sql(sql, coerce_float=False)
                now = datetime.now().isoformat()
                if not rs.empty:
                    r = rs.iloc[0]
                    if self.update_timestamp_field:
                        d[self.update_timestamp_field] = now
                    if self.insert_timestamp_field:
                        # Preserve the original insert time on update.
                        d[self.insert_timestamp_field] = r[
                            self.insert_timestamp_field]
                    self.table.update().where(where).values(d).execute()
                else:
                    if self.insert_timestamp_field:
                        d[self.insert_timestamp_field] = now
                    if self.update_timestamp_field:
                        d[self.update_timestamp_field] = now
                    self.table.insert(d).execute()
            except Exception as e:
                # .get keeps the handler from raising when the row dict was
                # never built or lacks a key column.
                errors.append(([d.get(k.lower()) for k in pks], str(e)))
        if errors:
            log.error(
                "pandas.AutoGrowTable %s.%s insert_or_update got %d errors: %s",
                self.connection.alias, self.table_name, len(errors), errors)
        return errors

    def update(self, df):
        """Execute ``self.table.update(<row dict>)`` for every row of ``df``.

        Requires ``self.table`` (set by ``gen_sql_table``).

        NOTE(review): the row dict is passed as the first positional argument
        of ``Table.update``; in legacy SQLAlchemy that position is the WHERE
        clause, not the values - confirm this does what is intended.
        """
        for r in range(len(df)):
            self.table.update(df.iloc[r].to_dict()).execute()

    def batch_insert(self, df, chunk=1000):
        """Write ``df`` with one batch statement built by ``dbutils``.

        The SQL comes from ``dbutils.gen_batch_insert_sql`` and is executed
        through ``dbutils.batch_execute`` in chunks of ``chunk`` rows. The
        configured timestamp columns are set to ``CURRENT_TIMESTAMP``.
        Returns ``None``.
        """
        from . import dbutils
        df = tz_convert(df)
        fields = get_db_schema_fields(df)
        update_values = {}
        if self.update_timestamp_field:
            update_values[self.update_timestamp_field] = 'CURRENT_TIMESTAMP'
        insert_values = {}
        if self.insert_timestamp_field:
            insert_values[self.insert_timestamp_field] = 'CURRENT_TIMESTAMP'
        if self.update_timestamp_field:
            insert_values[self.update_timestamp_field] = 'CURRENT_TIMESTAMP'
        sql = dbutils.gen_batch_insert_sql(self.table_name,
                                           fields,
                                           self.primary_key,
                                           insert_values=insert_values,
                                           update_values=update_values,
                                           vendor=self.connection.vendor)
        print(sql)
        with self.connection.cursor() as cursor:
            dbutils.batch_execute(cursor,
                                  sql,
                                  self.gen_rows(df),
                                  chunk=chunk)