Example #1
    def create_temporary_table(self, df, temporary_table_name, key, engine):
        """
        Snippet to create a temporary table in the SQL database
        Inpired from https://stackoverflow.com/questions/30867390/python-pandas-to-sql-how-to-create-a-table-with-a-primary-key
        """
        # Local variables
        eng = self.engine

        with eng.connect() as conn, conn.begin():
            pandas_engine = pandasSQL_builder(conn)

            # creating a table
            table = TemporaryTable(temporary_table_name,
                                   pandas_engine,
                                   frame=df,
                                   if_exists="replace")
            table.create()

            # dumping to the existing table
            df.to_sql(temporary_table_name,
                      conn,
                      index=False,
                      if_exists="replace")

        # Simply add the primary key after uploading the table with pandas.
        with eng.connect() as con:
            con.execute('ALTER TABLE ' + temporary_table_name +
                        ' ADD PRIMARY KEY (' + key + ');')
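The TemporaryTable class used above (and in the next example) is not shown in this snippet. One common way to define it, sketched below under the assumption of a pandas/SQLAlchemy version where SQLTable._create_table_setup returns a sqlalchemy.Table, is to subclass pandas' internal SQLTable and add the TEMPORARY prefix to the generated DDL.

from pandas.io.sql import SQLTable


class TemporaryTable(SQLTable):
    """SQLTable variant whose DDL reads CREATE TEMPORARY TABLE (illustrative sketch)."""

    def _create_table_setup(self):
        table = super()._create_table_setup()
        # SQLAlchemy renders entries of _prefixes between CREATE and TABLE,
        # so the emitted statement becomes CREATE TEMPORARY TABLE ...
        table._prefixes.append("TEMPORARY")
        return table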
Example #2
def get_temp_table(conn, data, create=False, **kwargs):
    """Reuse Pandas logic for creating a temp table. The definition will be
    formed based on the first row of data passed"""

    table_name = get_temp_table_name()
    pandas_engine = pandasSQL_builder(conn, schema=kwargs.get("schema", None))
    if isinstance(conn, sqlite3.Connection):
        cls = SQLiteTemporaryTable
    elif is_sqlalchemy_conn(conn):
        cls = TemporaryTable
    else:
        raise AssertionError(
            "Only sqlite3 and SQLAlchemy conns are currently supported"
        )

    df = data
    if not isinstance(data, pd.DataFrame):
        # This reuses pandas logic for creating tables, but doesn't fully
        # convert the rows to a DataFrame since that may be expensive and
        # unwanted. Assumes data is a sequence of dicts whose keys() returns
        # the column names in the proper order.
        first_row = data[0]
        df = pd.DataFrame.from_records([first_row], columns=first_row.keys())

    table = cls(table_name, pandas_engine, frame=df, if_exists="fail")
    if create:
        table.create()
    return table
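A brief usage sketch, assuming get_temp_table and the helpers it relies on (get_temp_table_name, is_sqlalchemy_conn, the temporary-table classes) are importable from the same module; an in-memory SQLite connection keeps it self-contained.

import sqlite3

conn = sqlite3.connect(":memory:")
rows = [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]
# Column definitions are inferred from the first row; create=True issues the DDL
temp_table = get_temp_table(conn, rows, create=True)
print(temp_table.name)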
Example #3
    def _get_sqa_table(self,
                       table_name: str,
                       table_type: str,
                       schema: str = None,
                       database: str = None,
                       dtype=None
                       ):
        """
        Build the SQLAlchemy table object for the target frame.
        """
        self._load_df_if_empty()
        pandas_sql = pandasSQL_builder(con=self._con)

        # Fall back to pandas' SQLite table class when the connection is not
        # backed by SQLAlchemy.
        use_sqlite = not isinstance(pandas_sql, SQLDatabase)

        pd_db = SQLDatabase(engine=self._con)
        table_klass = SQLiteTable if use_sqlite else SQLTable
        target_frame = (self._data if table_type == "staging"
                        else self._his_data)
        target_name = self._build_name(table=table_name, table_type=table_type)
        table = table_klass(
            name=target_name,
            pandas_sql_engine=pd_db,
            index=False,
            frame=target_frame
        )
        return table._create_table_setup()
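For the SQLAlchemy code path, _create_table_setup() returns a sqlalchemy.Table, so its DDL can be rendered with SQLAlchemy's CreateTable construct without touching the database. A minimal sketch, where writer stands in for an instance of the enclosing class (hypothetical name):

from sqlalchemy.schema import CreateTable

# `writer` is a hypothetical instance of the class that defines _get_sqa_table
sqa_table = writer._get_sqa_table("events", table_type="staging")
print(CreateTable(sqa_table))  # show the CREATE TABLE statement without executing it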
Example #4
def load_table(filename, tablename, engine=None, infer_size=100, chunk_size=1000):
    engine = engine or sa.create_engine(config.SQLA_URI)
    file = ensure_csv(filename)
    # Pass data types to iterator to ensure consistent types across chunks
    dtypes = pd.read_csv(file.name, nrows=infer_size).dtypes
    chunks = pd.read_csv(file.name, chunksize=chunk_size, iterator=True, dtype=dtypes)
    for idx, chunk in enumerate(chunks):
        # Offset the index so the generated 'index' key stays unique across chunks
        chunk.index += chunk_size * idx
        sql_engine = pandasSQL_builder(engine)
        to_sql(
            tablename, sql_engine, chunk,
            chunksize=chunk_size, keys='index', if_exists='append',
        )
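A usage sketch for load_table, assuming ensure_csv, config, and the module-level to_sql wrapper it calls (like the short wrapper shown later in this collection) are importable; the CSV path and engine URL are placeholders.

import sqlalchemy as sa

engine = sa.create_engine("sqlite:///example.db")  # illustrative URL
load_table("measurements.csv", "measurements", engine=engine, chunk_size=500)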
Example #5
    def execute_from_model(self, tt: task_models.TaskTemplate,
                           **kwargs) -> typing.Any:
        if tt.custom["secret_connect_args"] is not None:
            for key, secret_dict in tt.custom["secret_connect_args"].items():
                value = current_context().secrets.get(
                    group=secret_dict["group"], key=secret_dict["key"])
                tt.custom["connect_args"][key] = value

        engine = create_engine(tt.custom["uri"],
                               connect_args=tt.custom["connect_args"],
                               echo=False)
        print(f"Connecting to db {tt.custom['uri']}")

        interpolated_query = SQLAlchemyTask.interpolate_query(
            tt.custom["query_template"], **kwargs)
        print(f"Interpolated query {interpolated_query}")
        with engine.begin() as connection:
            df = None
            if tt.interface.outputs:
                df = pd.read_sql_query(interpolated_query, connection)
            else:
                pandasSQL_builder(connection).execute(interpolated_query)
        return df
Example #6
def get_sa_table_for_dataframe(dataframe, tablename, schemaname):
    sa_engine = get_engine()
    # get max lengths for strings and use it to set dtypes
    dtypes = {}
    object_types = get_dataframe_column_object_types(dataframe)

    for c in object_types:
        if dataframe[c].dtype == np.dtype('O'):
            n = dataframe[c].map(lambda v: len(str(v)) if v else None).max()
            # use 10 times the max observed length, capped at 65535 (Redshift's VARCHAR limit)
            dtypes[c] = VARCHAR(min([n * 10, 65535]))

    table = SQLTable(tablename,
                     pandasSQL_builder(sa_engine, schema=schemaname),
                     dataframe,
                     if_exists=True,
                     index=False,
                     dtype=dtypes)

    return table
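A short usage sketch, assuming the module-level helpers get_engine and get_dataframe_column_object_types are available; sql_schema() on the returned pandas SQLTable yields the CREATE TABLE DDL, which is useful for inspecting the inferred VARCHAR sizes.

import pandas as pd

df = pd.DataFrame({"name": ["alice", "bob"], "comment": ["short", "a bit longer"]})
table = get_sa_table_for_dataframe(df, "comments", "public")  # schema name is illustrative
print(table.sql_schema())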
Example #7
    def __init__(self,
                 connection,
                 table_name,
                 primary_key,
                 insert_timestamp_field=None,
                 update_timestamp_field=None,
                 use_on_duplicate=False):
        self.use_on_duplicate = use_on_duplicate
        self.connection = get_connection(connection)
        tps = table_name.split(".")
        self.table_name = tps[-1]
        self.schema = len(tps) > 1 and tps[0] or None
        self.full_table_name = (
            "%s.%s" % (self.connection.ops.quote_name(self.schema),
                       self.connection.ops.quote_name(self.table_name))
            if self.schema
            else self.connection.ops.quote_name(self.table_name))
        self.primary_key = primary_key
        self.fields = {}
        self.insert_timestamp_field = insert_timestamp_field
        self.update_timestamp_field = update_timestamp_field
        self.pd_sql = pandasSQL_builder(db_sqlalchemy_str(self.connection),
                                        schema=self.schema)
        self.detect_fields()
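A heavily hedged usage sketch: the enclosing class is called UpsertHelper here purely for illustration, and get_connection / db_sqlalchemy_str are assumed to resolve a Django database alias to a connection and a SQLAlchemy URL.

# Hypothetical class name and database alias, shown only to illustrate the constructor
helper = UpsertHelper(
    connection="default",                # Django database alias
    table_name="analytics.daily_stats",  # schema-qualified name is split on "."
    primary_key="stat_date",
    use_on_duplicate=True,
)
print(helper.full_table_name)            # quoted, schema-qualified table name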
Example #8
def to_redshift(self,
                table_name,
                s3_bucket,
                s3_key,
                engine=None,
                schema=None,
                if_exists="fail",
                index=False,
                compress=True,
                primary_key=None,
                aws_access_key_id=None,
                aws_secret_access_key=None,
                **kwargs):

    if not engine:
        engine = generate_redshift_engine_string()

    if not aws_access_key_id:
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    if not aws_secret_access_key:
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

    # Get Pandas SQLTable object
    table = SQLTable(
        table_name,
        pandasSQL_builder(engine, schema=schema),
        self,
        if_exists=if_exists,
        schema=schema,
        index=index,
    )

    def quote(s):
        return '"' + str(s) + '"'

    # Full table name with schema
    if schema:
        full_table_name = quote(schema) + "." + quote(table_name)
    else:
        full_table_name = quote(table_name)

    # Check table
    if table.exists():
        if if_exists == "fail":
            raise ValueError("Table {} already exists.".format(table_name))
        elif if_exists == "append":
            queue = [
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                )
            ]
        elif if_exists == "replace":
            queue = [
                "drop table {};".format(full_table_name),
                table.sql_schema() + ";",
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
            ]
        elif if_exists == "update":
            staging_table = "{}_staging".format(table_name)

            if not primary_key:
                raise ValueError(
                    "Expected a primary key to update existing table")

            queue = [
                "begin;",
                "drop table if exists {};".format(staging_table),
                "create temporary table {} (like {});".format(
                    staging_table, full_table_name),
                CopyCommand(
                    to=table,
                    data_location="s3://{}/{}".format(s3_bucket, s3_key),
                    access_key_id=aws_access_key_id,
                    secret_access_key=aws_secret_access_key,
                    format="CSV",
                    compression="GZIP" if compress else None,
                ),
                "delete from {full_table_name} where {primary_key} in (select {primary_key} from {staging_table});"
                .format(
                    full_table_name=full_table_name,
                    primary_key=primary_key,
                    staging_table=staging_table,
                ),
                "insert into {} (select * from {});".format(
                    full_table_name, staging_table),
                "end;",
            ]
        else:
            raise ValueError("{} is not valid for if_exists".format(if_exists))
    else:
        queue = [
            table.sql_schema() + ";",
            CopyCommand(
                to=table,
                data_location="s3://{}/{}".format(s3_bucket, s3_key),
                access_key_id=aws_access_key_id,
                secret_access_key=aws_secret_access_key,
                format="CSV",
                compression="GZIP" if compress else None,
            ),
        ]

    # Save DataFrame to S3
    self.to_s3(bucket=s3_bucket, key=s3_key, index=index, compress=compress)

    # Execute queued statements
    engine = _engine_builder(engine)
    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)
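A usage sketch, assuming this to_redshift has been attached as a pandas DataFrame method and that the S3/Redshift helpers it relies on (generate_redshift_engine_string, to_s3, AWS credentials in the environment) are configured; bucket and key names are placeholders.

import pandas as pd

df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})
df.to_redshift(
    "my_table",
    s3_bucket="my-bucket",              # illustrative bucket
    s3_key="uploads/my_table.csv.gz",   # illustrative key
    if_exists="replace",
)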
Example #9
def to_redshift(self, table_name, engine, bucket, keypath=None,
                schema=None, if_exists='fail', index=True, index_label=None,
                aws_access_key_id=None, aws_secret_access_key=None,
                columns=None, null_as=None, emptyasnull=True):
    """
    Write a DataFrame to redshift via S3

    Parameters
    =========

    table_name : str. (unqualified) name in redshift
    engine : SQLA engine
    bucket : str; s3 bucket
    keypath : str; keypath in s3 (without bucket name)
    schema : redshift schema
    if_exists : str; {'fail', 'append', 'replace'}
    index : bool; include the DataFrame's index
    index_label : str; label for the index
    aws_access_key_id / aws_secret_access_key : from ~/.boto by default
    columns : subset of columns to include
    null_as : treat these as null
    emptyasnull : bool; whether '' is treated as null
    """
    url = self.to_s3(keypath, engine, bucket=bucket, index=index,
                     index_label=index_label)
    qualname = resolve_qualname(table_name, schema)
    table = SQLTable(table_name, pandasSQL_builder(engine, schema=schema),
                     self, if_exists=if_exists, index=index)
    if columns is None:
        columns = ''
    else:
        columns = '({})'.format(','.join(columns))
    print("Creating table {}".format(qualname))

    if table.exists():
        if if_exists == 'fail':
            raise ValueError("Table Exists")
        elif if_exists == 'append':
            queue = []
        elif if_exists == 'replace':
            queue = ['drop table {}'.format(qualname), table.sql_schema()]
        else:
            raise ValueError("Bad option for `if_exists`")

    else:
        queue = [table.sql_schema()]

    with engine.begin() as con:
        for stmt in queue:
            con.execute(stmt)

    s3conn = boto.connect_s3(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)

    conn = psycopg2.connect(database=engine.url.database,
                            user=engine.url.username,
                            password=engine.url.password,
                            host=engine.url.host,
                            port=engine.url.port,
                            sslmode='require')
    cur = conn.cursor()
    if null_as is not None:
        null_as = "NULL AS '{}'".format(null_as)
    else:
        null_as = ''

    if emptyasnull:
        emptyasnull = "EMPTYASNULL"
    else:
        emptyasnull = ''

    full_keypath = 's3://' + url

    print("COPYing")
    stmt = ("copy {qualname} {columns} from '{keypath}' "
            "credentials 'aws_access_key_id={key};aws_secret_access_key={secret}' "
            "GZIP "
            "{null_as} "
            "{emptyasnull}"
            "CSV;".format(qualname=qualname,
                          columns=columns,
                          keypath=full_keypath,
                          key=s3conn.aws_access_key_id,
                          secret=s3conn.aws_secret_access_key,
                          null_as=null_as,
                          emptyasnull=emptyasnull))
    cur.execute(stmt)
    conn.commit()
    conn.close()
Example #10
def to_sql(name, engine, frame, chunksize=None, **kwargs):
    pandas_sql_engine = pandasSQL_builder(engine)
    table = SQLTable(name, pandas_sql_engine, frame=frame, **kwargs)
    table.create()
    table.insert(chunksize)
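A usage sketch for this minimal wrapper, assuming a pandas version whose internal SQLTable / pandasSQL_builder signatures match the ones used above; the in-memory SQLite engine is only for illustration.

import pandas as pd
import sqlalchemy as sa

engine = sa.create_engine("sqlite:///:memory:")
df = pd.DataFrame({"id": [1, 2], "value": ["a", "b"]})
to_sql("demo", engine, df, index=False, if_exists="replace")
print(pd.read_sql("select * from demo", engine))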
Example #11
    def write(self,
              data_frame,
              routine_name,
              table_name,
              bucketname=None,
              if_exists='replace',
              sub_routine=None):
        """Write data table

        :param data_frame: dataframe
        :param routine_name: routine name
        :param table_name: table name
        :param bucketname: bucket name
        :param if_exists: method if exists
        :param sub_routine: sub routine
        :return: None
        """
        # todo this function is pretty verbose as it is, please use logger instead of print
        # todo make sure log statement is understandable for outside observer
        # todo bucketname should always be project_name, redshift should know its own project_name
        # todo when table is new, write metadata, but give an option to skip metadata

        self.bucket = bucketname
        if sub_routine is not None:
            table_name = routine_name + '/' + sub_routine + '/' + table_name
        elif table_name != 'meta_database':
            table_name = routine_name + '/' + table_name
        print(table_name)
        logging.info('Writing table {} :'.format(table_name))

        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucketname)

        con = psycopg2.connect(self.redshift_path)
        con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cur = con.cursor()

        # write DF to string stream
        csv_buffer = StringIO()
        data_frame.to_csv(csv_buffer, index=None, header=None, sep='|')

        # reset stream position
        csv_buffer.seek(0)
        # create binary stream
        gz_buffer = BytesIO()

        # compress string stream using gzip
        with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
            gz_file.write(bytes(csv_buffer.getvalue(), 'utf-8'))

        # write stream to S3
        timestamp = datetime.datetime.strftime(datetime.datetime.now(),
                                               '%Y%m%d%H%M%S')
        bucket.put_object(Key='tmp_' + timestamp + '.gz',
                          Body=gz_buffer.getvalue())
        print('saved file ')

        # CREATE THE COPY STATEMENT TO SEND FROM S3 TO THE TABLE IN REDSHIFT
        s3_path_tmp_file = 's3://{0}/{1}'.format(bucketname,
                                                 'tmp_' + timestamp + '.gz')

        print('create table')
        table = SQLTable(table_name,
                         pandasSQL_builder(self.engine, schema=None),
                         data_frame,
                         if_exists=if_exists,
                         index=None)

        statements = []
        if table.exists():
            if if_exists == 'fail':
                raise ValueError("Table Exists")
            elif if_exists == 'append':
                statements = []
            elif if_exists == 'replace':
                statements = [
                    """ truncate "{}"; rollback; drop table "{}";""".format(
                        table_name, table_name)
                ]
            else:
                raise ValueError("Bad option for `if_exists`")
        statements.append(table.sql_schema() + ';')

        statement = """
                copy "{0}"
                from '{1}'
                delimiter '{2}'
                region 'us-east-1'
                CREDENTIALS 'aws_access_key_id={3};aws_secret_access_key={4}'
                FORMAT AS CSV NULL AS '@NULL@'
                GZIP
                TRUNCATECOLUMNS
                """.format(table_name, s3_path_tmp_file, '|',
                           '<AWS_ACCESS_KEY_ID>',
                           '<AWS_SECRET_ACCESS_KEY>')
        statements.append(statement)

        try:
            logging.info('executing statements')
            for stmt in statements:
                print(stmt)
                cur.execute(stmt)
                # con.commit()
            logging.info('finished executing')

        except Exception as e:
            print(e)
            traceback.print_exc(file=sys.stdout)
            con.rollback()
            raise

        s3.Object(bucketname, 'tmp_' + timestamp + '.gz').delete()
        logging.info('FILLING THE TABLE IN REDSHIFT')
        logging.info('\n--------------- write complete -----------------')