Beispiel #1
0
    def to_sql_set_primary_key_and_not_null(
            self, frame, name, con, keys, sql_table,
            schema=None, if_exists='fail', index=True, index_label=None,
            chunksize=None, dtype=None):
        """Write ``frame`` to SQL using a caller-supplied table definition.

        The argument checks mirror pandas' own ``to_sql`` helpers in
        ``pandas/io/sql.py``. The difference is that the table object built
        by ``SQLTable`` is replaced with ``sql_table`` before the table is
        created and the rows are inserted (presumably so the caller can
        declare primary keys and NOT NULL columns on it).

        Args:
            frame: ``pd.DataFrame`` or ``pd.Series`` (a Series is converted
                with ``to_frame()``).
            name: Table name handed to ``SQLTable``.
            con: Connectable handed to ``SQLDatabase``.
            keys: Forwarded to ``SQLTable`` as ``keys``.
            sql_table: Table object assigned to ``SQLTable.table``.
            schema: Optional schema for both ``SQLDatabase`` and ``SQLTable``.
            if_exists: One of ``'fail'``, ``'replace'`` or ``'append'``.
            index: Forwarded to ``SQLTable``.
            index_label: Forwarded to ``SQLTable``.
            chunksize: Forwarded to ``SQLTable.insert``.
            dtype: Optional mapping of column name to SQLAlchemy type.

        Raises:
            ValueError: ``if_exists`` is not an accepted value, or a ``dtype``
                entry is not a SQLAlchemy type.
            NotImplementedError: ``frame`` is neither a Series nor a DataFrame.
        """
        # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L437
        accepted_modes = ('fail', 'replace', 'append')
        if if_exists not in accepted_modes:
            raise ValueError(
                "'{0}' is not valid for if_exists".format(if_exists))

        # ref: https://github.com/pandas-dev/pandas/blob/master/pandas/io/sql.py#L508
        database = SQLDatabase(con, schema=schema)

        if not isinstance(frame, (pd.Series, pd.DataFrame)):
            raise NotImplementedError(
                "'frame' argument should be either a Series or a DataFrame")
        if isinstance(frame, pd.Series):
            frame = frame.to_frame()

        if dtype is not None:
            from sqlalchemy.types import to_instance, TypeEngine
            for column, sql_type in dtype.items():
                if isinstance(to_instance(sql_type), TypeEngine):
                    continue
                raise ValueError(
                    'The type of {} is not a SQLAlchemy type '.format(column))

        writer = SQLTable(
            name,
            database,
            frame=frame,
            index=index,
            if_exists=if_exists,
            index_label=index_label,
            schema=schema,
            keys=keys,
            dtype=dtype,
        )
        # Swap in the caller's table definition before anything is emitted.
        writer.table = sql_table
        writer.create()
        writer.insert(chunksize)
Beispiel #2
0
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    """Build a ``SQLTable`` for ``frame``, create it, and insert the rows.

    ``engine`` is handed to ``SQLTable`` as-is, any extra keyword arguments
    are forwarded to the ``SQLTable`` constructor, and ``chunksize`` is
    passed to ``SQLTable.insert``.
    """
    sql_table = SQLTable(name, engine, frame=frame, **kwargs)
    sql_table.create()
    sql_table.insert(chunksize)
Beispiel #3
0
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    """Build a ``SQLTable`` for ``frame``, create it, and insert the rows.

    ``engine`` is first wrapped with ``pandasSQL_builder``; any extra keyword
    arguments are forwarded to the ``SQLTable`` constructor, and
    ``chunksize`` is passed to ``SQLTable.insert``.
    """
    sql_table = SQLTable(
        name, pandasSQL_builder(engine), frame=frame, **kwargs)
    sql_table.create()
    sql_table.insert(chunksize)
Beispiel #4
0
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    """Build a ``SQLTable`` for ``frame``, create it, and insert the rows.

    ``engine`` is handed to ``SQLTable`` as-is, any extra keyword arguments
    are forwarded to the ``SQLTable`` constructor, and ``chunksize`` is
    passed to ``SQLTable.insert``.
    """
    sql_table = SQLTable(name, engine, frame=frame, **kwargs)
    sql_table.create()
    sql_table.insert(chunksize)
Beispiel #5
0
class AutoGrowTable(object):
    """Load DataFrames into a SQL table that is created or widened on demand.

    ``run`` is the entry point: it lower-cases the frame's column names,
    creates the table or adds the columns it is missing, and then writes the
    rows either with one batched statement (``use_on_duplicate=True``) or row
    by row through ``insert_or_update``.

    NOTE(review): table, column and primary-key names are interpolated into
    SQL text, and ``insert_or_update`` also interpolates primary-key *values*.
    Only feed this class trusted names and data until that is parameterized.
    """

    def __init__(self,
                 connection,
                 table_name,
                 primary_key,
                 insert_timestamp_field=None,
                 update_timestamp_field=None,
                 use_on_duplicate=False):
        """Resolve the connection and table name, then read existing columns.

        Args:
            connection: Passed to ``get_connection``; the result must offer
                ``ops.quote_name``, ``cursor()``, ``alias`` and ``vendor``.
            table_name: Either ``"table"`` or ``"schema.table"``.
            primary_key: Comma-separated primary-key column name(s).
            insert_timestamp_field: Optional column stamped on insert.
            update_timestamp_field: Optional column stamped on every write.
            use_on_duplicate: When true, ``run`` uses ``batch_insert``
                instead of ``insert_or_update``.
        """
        self.use_on_duplicate = use_on_duplicate
        self.connection = get_connection(connection)
        tps = table_name.split(".")
        self.table_name = tps[-1]
        # Only a dotted name carries a schema; an empty prefix counts as none.
        self.schema = (tps[0] or None) if len(tps) > 1 else None
        quote_name = self.connection.ops.quote_name
        if self.schema:
            self.full_table_name = "%s.%s" % (quote_name(self.schema),
                                              quote_name(self.table_name))
        else:
            self.full_table_name = quote_name(self.table_name)
        self.primary_key = primary_key
        # Lower-cased column names of the existing table; filled in by
        # detect_fields(), which always stores a list.
        self.fields = []
        self.insert_timestamp_field = insert_timestamp_field
        self.update_timestamp_field = update_timestamp_field
        self.pd_sql = pandasSQL_builder(db_sqlalchemy_str(self.connection),
                                        schema=self.schema)
        self.detect_fields()

    def detect_fields(self):
        """Refresh ``self.fields`` with the table's lower-cased column names.

        A lookup failure whose message contains "does not exist" is ignored
        silently (presumably the table has not been created yet). Any other
        failure is logged. In both cases ``self.fields`` keeps its previous
        value.
        """
        try:
            self.fields = [
                f.lower() for f in get_table_fields(
                    self.connection, self.table_name, self.schema)
            ]
        except Exception as e:
            if "does not exist" in str(e):
                return
            log.error("AutoGrowTable.detect_fields %s %s error: %s",
                      self.connection.alias, self.table_name, e)

    def get_field_definition(self, fields):
        """Return ``"name type"`` pairs for ``fields`` joined by commas.

        NOTE(review): ``ftype`` receives the field *name* here, whereas
        ``create_table`` passes it a dtype -- confirm which is intended.
        """
        return ",".join("%s %s" % (f, ftype(f)) for f in fields)

    def create_table(self, df):
        """Create the table for ``df`` or add the columns it is missing.

        Columns are derived from ``df.dtypes``; the configured timestamp
        columns are appended with ``default CURRENT_TIMESTAMP``. A new table
        also gets its primary key and ``self.fields`` is refreshed.
        """
        exists = self.pd_sql.has_table(self.table_name)
        # .items() works on both Python 2/3 and old/new pandas, unlike
        # .iteritems().
        dtypes = {c: dtype(str(dt)) for c, dt in df.dtypes.items()}
        new_fields = [
            "%s %s" % (f, ftype(dt)) for f, dt in dtypes.items()
            if f.lower() not in self.fields
        ]
        # self.fields holds lower-cased names, so compare case-insensitively;
        # otherwise a mixed-case timestamp column is re-added on every run.
        for ts_field in (self.update_timestamp_field,
                         self.insert_timestamp_field):
            if ts_field and ts_field.lower() not in self.fields:
                new_fields.append(
                    "%s timestamp default CURRENT_TIMESTAMP" % ts_field)
        with self.connection.cursor() as cursor:
            if not exists:
                cursor.execute("create table %s(%s)" %
                               (self.full_table_name, ",".join(new_fields)))
                cursor.execute("alter table %s add primary key(%s)" %
                               (self.full_table_name, self.primary_key))
                self.detect_fields()
            elif new_fields:
                cursor.execute("alter table %s add column %s" %
                               (self.full_table_name,
                                ", add column ".join(new_fields)))

    def create_table2(self, df):
        """Create the table through ``dbutils.create_table``.

        The field schema comes from ``get_db_schema_fields(df)``; configured
        timestamp columns are added as nullable ``DateTimeField`` entries.
        """
        from . import dbutils
        fields = get_db_schema_fields(df)
        if self.insert_timestamp_field:
            fields[self.insert_timestamp_field] = dict(type='DateTimeField',
                                                       params=dict(null=True))
        if self.update_timestamp_field:
            fields[self.update_timestamp_field] = dict(type='DateTimeField',
                                                       params=dict(null=True))
        dbutils.create_table(self.connection,
                             self.table_name,
                             fields,
                             schema=self.schema,
                             force_lower_name=True,
                             primary_key=self.primary_key)

    def run(self, data_frame):
        """Ensure the table fits ``data_frame`` and write its rows.

        Returns:
            The error list from ``insert_or_update``, or ``None`` when
            ``use_on_duplicate`` routes the write through ``batch_insert``.
        """
        df = data_frame
        # Return value is ignored, so this presumably renames in place.
        lower_column_name(df)
        self.create_table(df)
        if self.use_on_duplicate:
            return self.batch_insert(df)
        return self.insert_or_update(df)

    def gen_sql_table(self, df):
        """Build ``self.table`` (a SQLAlchemy table) from ``df``'s columns.

        The table generated by pandas' ``SQLTable`` is copied into
        ``self.pd_sql.meta``; configured timestamp columns that are not
        already present are appended as ``DateTime`` columns.
        """
        from pandas.io.sql import SQLTable
        from sqlalchemy import Column, DateTime
        self.table = SQLTable(self.table_name,
                              self.pd_sql,
                              df,
                              index=False,
                              schema=self.schema).table.tometadata(
                                  self.pd_sql.meta)
        for ts_field in (self.update_timestamp_field,
                         self.insert_timestamp_field):
            if ts_field and ts_field not in self.table.columns:
                self.table.append_column(Column(ts_field, DateTime))

    def gen_rows(self, df):
        """Yield each row of ``df`` as a list of ``type_convert``-ed values."""
        for i in range(len(df)):
            s = df.iloc[i]
            yield [type_convert(a) for a in s.tolist()]

    def insert_or_update(self, df):
        """Insert or update ``df`` row by row, keyed on the primary key.

        For every row a lookup query decides between UPDATE (keeping the
        stored insert timestamp and refreshing the update timestamp) and
        INSERT (stamping both). A failing row is recorded and the loop
        continues with the next one.

        Returns:
            A list of ``(primary_key_values, error_message)`` tuples, empty
            when every row succeeded.
        """
        self.gen_sql_table(df)
        errors = []
        df = format_timestamp(tz_convert(df))
        pks = [k.strip() for k in self.primary_key.split(",")]
        efs = ['1 as a']
        if self.insert_timestamp_field:
            efs.append(self.insert_timestamp_field)
        if self.update_timestamp_field:
            efs.append(self.update_timestamp_field)
        sql_template = "select %s from %s where %%s" % (",".join(efs),
                                                        self.full_table_name)
        quote_name = self.connection.ops.quote_name
        for i in range(len(df)):
            # Reset per row so the error report below never reads an unbound
            # name or the previous row's values.
            d = {}
            try:
                s = df.iloc[i]
                d = clear_dict_nan_value(s.to_dict())
                # NOTE(review): primary-key values are interpolated straight
                # into SQL text; parameterize before using untrusted data.
                where = " and ".join([
                    "%s='%s'" % (quote_name(pk), d[pk.lower()]) for pk in pks
                ])
                sql = sql_template % where
                rs = self.pd_sql.read_sql(sql, coerce_float=False)
                now = datetime.now().isoformat()
                if not rs.empty:
                    r = rs.iloc[0]
                    if self.update_timestamp_field:
                        d[self.update_timestamp_field] = now
                    if self.insert_timestamp_field:
                        # Preserve the original insertion time on update.
                        d[self.insert_timestamp_field] = r[
                            self.insert_timestamp_field]
                    self.table.update().where(where).values(d).execute()
                else:
                    if self.insert_timestamp_field:
                        d[self.insert_timestamp_field] = now
                    if self.update_timestamp_field:
                        d[self.update_timestamp_field] = now
                    self.table.insert(d).execute()
            except Exception as e:
                # .get() keeps the handler itself from raising when the row
                # lacks a primary-key column.
                errors.append(([d.get(k.lower()) for k in pks], str(e)))
        if errors:
            log.error(
                "pandas.AutoGrowTable %s.%s insert_or_update got %d errors: %s",
                self.connection.alias, self.table_name, len(errors), errors)
        return errors

    def update(self, df):
        """Execute ``self.table.update(...)`` once per row of ``df``.

        Requires ``gen_sql_table`` to have set ``self.table`` beforehand.

        NOTE(review): the row dict is passed as the first positional argument
        of ``Table.update`` and no WHERE clause is built -- verify this does
        what is intended before relying on it.
        """
        for r in range(len(df)):
            self.table.update(df.iloc[r].to_dict()).execute()

    def batch_insert(self, df, chunk=1000):
        """Write ``df`` with one batched statement from ``dbutils``.

        The configured timestamp columns are passed to
        ``dbutils.gen_batch_insert_sql`` as the string ``'CURRENT_TIMESTAMP'``
        via ``insert_values`` / ``update_values``; rows are streamed from
        ``gen_rows`` in chunks of ``chunk``.
        """
        from . import dbutils
        df = tz_convert(df)
        fields = get_db_schema_fields(df)
        update_values = {}
        if self.update_timestamp_field:
            update_values[self.update_timestamp_field] = 'CURRENT_TIMESTAMP'
        insert_values = {}
        if self.insert_timestamp_field:
            insert_values[self.insert_timestamp_field] = 'CURRENT_TIMESTAMP'
            if self.update_timestamp_field:
                insert_values[
                    self.update_timestamp_field] = 'CURRENT_TIMESTAMP'

        sql = dbutils.gen_batch_insert_sql(self.table_name,
                                           fields,
                                           self.primary_key,
                                           insert_values=insert_values,
                                           update_values=update_values,
                                           vendor=self.connection.vendor)
        # Route the generated statement through the logger, not stdout.
        log.debug("AutoGrowTable.batch_insert sql: %s", sql)
        with self.connection.cursor() as cursor:
            dbutils.batch_execute(cursor, sql, self.gen_rows(df), chunk=chunk)
Beispiel #6
0
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    """Build a ``SQLTable`` for ``frame``, create it, and insert the rows.

    ``engine`` is first wrapped with ``pandasSQL_builder``; any extra keyword
    arguments are forwarded to the ``SQLTable`` constructor, and
    ``chunksize`` is passed to ``SQLTable.insert``.
    """
    sql_table = SQLTable(
        name, pandasSQL_builder(engine), frame=frame, **kwargs)
    sql_table.create()
    sql_table.insert(chunksize)