import pandas as pd
import dask.dataframe as dd

import dbx  # local helper module providing connect_db()


def db_ddf_limit_offset(db, table, columns, partitions, limit, offset):
    '''Load a LIMIT/OFFSET slice of a sql table into a dask dataframe.'''
    conn = dbx.connect_db(db)
    query = "SELECT * FROM %s LIMIT %s OFFSET %s;" % (table, limit, offset)
    df = pd.read_sql_query(query, conn)
    ddt = dd.from_pandas(df[columns], npartitions=partitions)
    print('table ' + table + ' loaded into dask dataframe')
    return ddt
def select_sql_pd(db, table, fields, field, value):
    '''Select fields from a table where field LIKE %value%; returns a pandas dataframe.'''
    conn = dbx.connect_db(db)
    if isinstance(fields, list):
        fields = ", ".join(fields)
    value = "'%" + str(value) + "%'"
    query = "SELECT %s FROM %s WHERE %s LIKE %s;" % (fields, table, field, value)
    try:
        df = pd.read_sql_query(query, conn)
        return df
    except Exception as e:
        print(e)
def db_ddf(db, table, columns, partitions, chunksize, offset=0):
    '''
    Load big sql table into dask dataframe in chunks to prevent memory exhaustion

    args
    ----
    db (str): database to connect
    table (str): database table
    columns (list): list of table columns to retrieve
    partitions (int): Number of dask partitions to use
    chunksize (int): Number of rows to return in each iteration of the sql query
        (affects memory allocated)
    offset (int): Offset rows in query (needed for sql query iteration, default=0)

    returns
    ----
    final (object): dask dataframe
    '''
    conn = dbx.connect_db(db)
    final = None
    while True:
        query = "SELECT * FROM {} LIMIT {} OFFSET {};".format(
            table, chunksize, offset)
        df = pd.read_sql_query(query, conn)
        ddt = dd.from_pandas(df[columns], npartitions=partitions)
        if final is None:
            final = ddt
        else:
            # append the new chunk after the rows already accumulated
            final = dd.concat([final, ddt], axis=0, interleave_partitions=True)
        offset += chunksize
        if df.shape[0] < chunksize:
            break
    print('table ' + table + ' loaded into dask dataframe')
    return final
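# Usage sketch for db_ddf (the database, table and column names below are
# placeholders, not part of this module):
#
#     ddf = db_ddf('mydb.sqlite', 'events', ['id', 'timestamp', 'value'],
#                  partitions=4, chunksize=50000)
#     print(ddf.head())
#
# Each loop iteration pulls `chunksize` rows via LIMIT/OFFSET and appends them
# to the accumulated dask dataframe, so peak memory is bounded by the chunk
# size rather than by the full table.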
def df_db(db, table, df, mode, index):
    '''Write a pandas dataframe to a sql table (mode: 'fail', 'replace' or 'append').'''
    conn = dbx.connect_db(db)
    df.to_sql(table, conn, if_exists=mode, index=index)
def db_pd(db, table):
    '''Load an entire sql table into a pandas dataframe.'''
    conn = dbx.connect_db(db)
    query = "select * from " + table + ";"
    df = pd.read_sql_query(query, conn)
    return df
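# Round-trip sketch (assumes dbx.connect_db returns a connection accepted by
# pandas, e.g. a SQLAlchemy-compatible one; database and table names are
# placeholders):
#
#     df = db_pd('mydb.sqlite', 'events')             # read whole table
#     df_db('mydb.sqlite', 'events_copy', df,
#           mode='replace', index=False)              # write it back out
#
# `mode` is passed straight through to pandas' `if_exists`, so the accepted
# values are 'fail', 'replace' and 'append'.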